diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml
index 9d1825067a661..6d578580a13db 100644
--- a/.github/workflows/sycl-linux-precommit.yml
+++ b/.github/workflows/sycl-linux-precommit.yml
@@ -81,7 +81,7 @@ jobs:
     with:
       name: Build e2e tests
       runner: '["Linux", "build"]'
-      image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest
+      image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:alldeps
       image_options: -u 1001
       ref: ${{ github.sha }}
       merge_ref: ''
diff --git a/.github/workflows/sycl-rel-nightly.yml b/.github/workflows/sycl-rel-nightly.yml
index 96c691451965b..ae33d157e2f7a 100644
--- a/.github/workflows/sycl-rel-nightly.yml
+++ b/.github/workflows/sycl-rel-nightly.yml
@@ -38,6 +38,7 @@ jobs:
       build_cache_root: "/__w/"
       build_artifact_suffix: default
       build_configure_extra_args: '--hip --cuda'
+      build_image: ghcr.io/intel/llvm/ubuntu2204_build:latest
       merge_ref: ''
       build_ref: sycl-rel-6_0_0
 
@@ -118,8 +119,8 @@ jobs:
     if: ${{ github.repository == 'intel/llvm' && needs.check_for_new_commits.outputs.is_new_commit != 'false' }}
     uses: ./.github/workflows/sycl-windows-build.yml
     with:
+      ref: sycl-rel-6_0_0
       merge_ref: ''
-      build_ref: sycl-rel-6_0_0
 
       # We upload both Linux/Windows build via Github's "Releases"
       # functionality, make sure Linux/Windows names follow the same pattern.
@@ -139,6 +140,7 @@ jobs:
       sycl_toolchain_archive: ${{ needs.build-win.outputs.artifact_archive_name }}
       extra_lit_opts: --param gpu-intel-gen12=True
       ref: sycl-rel-6_0_0
+      merge_ref: ''
 
   cuda-aws-start:
     needs: [ubuntu2204_build]
@@ -156,7 +158,7 @@ jobs:
     with:
       name: CUDA E2E
       runner: '["aws_cuda-${{ github.run_id }}-${{ github.run_attempt }}"]'
-      image: ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab
+      image: ghcr.io/intel/llvm/ubuntu2204_build:latest
       image_options: -u 1001 --gpus all --cap-add SYS_ADMIN --env NVIDIA_DISABLE_REQUIRE=1
       target_devices: cuda:gpu
       ref: sycl-rel-6_0_0
diff --git a/.github/workflows/sycl-windows-build.yml b/.github/workflows/sycl-windows-build.yml
index f7e13df7a97c7..de2d452de7a50 100644
--- a/.github/workflows/sycl-windows-build.yml
+++ b/.github/workflows/sycl-windows-build.yml
@@ -7,9 +7,6 @@ on:
         type: string
         required: false
         default: "default"
-      build_ref:
-        type: string
-        required: false
       build_configure_extra_args:
         type: string
         required: false
@@ -18,6 +15,9 @@ on:
         description: 'Filter matches for the changed files in the PR'
         default: '[llvm, clang, sycl, llvm_spirv, xptifw, libclc, libdevice]'
         required: false
+      ref:
+        type: string
+        required: False
       merge_ref:
         description: |
           Commit-ish to merge post-checkout if non-empty. Must be reachable from
@@ -105,7 +105,7 @@ jobs:
     - uses: ./devops/actions/cached_checkout
       with:
         path: src
-        ref: ${{ inputs.build_ref || github.sha }}
+        ref: ${{ inputs.ref || github.sha }}
         merge_ref: ${{ inputs.merge_ref }}
         cache_path: "D:\\\\github\\\\_work\\\\repo_cache\\\\"
     - name: Configure
diff --git a/.github/workflows/sycl-windows-run-tests.yml b/.github/workflows/sycl-windows-run-tests.yml
index a1e27f4fda1d0..dbd4d7ff439ed 100644
--- a/.github/workflows/sycl-windows-run-tests.yml
+++ b/.github/workflows/sycl-windows-run-tests.yml
@@ -18,6 +18,13 @@ on:
       ref:
         type: string
         required: False
+      merge_ref:
+        description: |
+          Commit-ish to merge post-checkout if non-empty. Must be reachable from
+          the default_branch input parameter.
+        type: string
+        default: 'FETCH_HEAD'
+        required: False
 
       sycl_toolchain_artifact:
         type: string
@@ -68,7 +75,8 @@ jobs:
     - uses: ./devops/actions/cached_checkout
       with:
         path: llvm
-        ref: ${{ inputs.build_ref || github.sha }}
+        ref: ${{ inputs.ref || github.sha }}
+        merge_ref: ${{ inputs.merge_ref }}
         cache_path: "D:\\\\github\\\\_work\\\\repo_cache\\\\"
     - name: Download compiler toolchain
       uses: actions/download-artifact@v4
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index a25659459a5b4..10e3a4920fda7 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -10706,38 +10706,21 @@ static void getSPIRVBackendOpts(const llvm::opt::ArgList &TCArgs,
                                 ArgStringList &BackendArgs) {
   BackendArgs.push_back(TCArgs.MakeArgString("-filetype=obj"));
   BackendArgs.push_back(
-      TCArgs.MakeArgString("-mtriple=spirv64-unknown-unknown"));
-  // TODO: Optimization level is currently forced to -O0 due to some testing
-  // issues. Update optimization level after testing issues are resolved.
-  BackendArgs.push_back(TCArgs.MakeArgString("-O0"));
+      TCArgs.MakeArgString("-mtriple=spirv64v1.6-unknown-unknown"));
   BackendArgs.push_back(
       TCArgs.MakeArgString("--avoid-spirv-capabilities=Shader"));
   BackendArgs.push_back(
       TCArgs.MakeArgString("--translator-compatibility-mode"));
-
-  // TODO: There is some overlap between the lists of extensions in SPIR-V
-  // backend and SPIR-V Trnaslator). We will try to combine them when SPIR-V
-  // backdn is ready.
-  std::string ExtArg("--spirv-ext=");
-  std::string DefaultExtArg =
-      "+SPV_EXT_shader_atomic_float_add,+SPV_EXT_shader_atomic_float_min_max"
-      ",+SPV_KHR_no_integer_wrap_decoration,+SPV_KHR_float_controls"
-      ",+SPV_KHR_expect_assume,+SPV_KHR_linkonce_odr";
-  std::string INTELExtArg = ",+SPV_INTEL_subgroups,+SPV_INTEL_function_pointers"
-                            ",+SPV_INTEL_arbitrary_precision_integers"
-                            ",+SPV_INTEL_variable_length_array";
-  ExtArg = ExtArg + DefaultExtArg + INTELExtArg;
-
-  // Other args
-  ExtArg += ",+SPV_INTEL_bfloat16_conversion"
-            ",+SPV_KHR_uniform_group_instructions"
-            ",+SPV_INTEL_optnone"
-            ",+SPV_KHR_subgroup_rotate"
-            ",+SPV_INTEL_usm_storage_classes"
-            ",+SPV_EXT_shader_atomic_float16_add"
-            ",+SPV_KHR_bit_instructions";
-
-  BackendArgs.push_back(TCArgs.MakeArgString(ExtArg));
+  // TODO: A list of SPIR-V extensions that are supported by the SPIR-V backend
+  // is growing. Let's postpone the decision on which extensions to enable until
+  // - the list is stable, and
+  // - we decide on a mapping of user requested extensions into backend's ones.
+  // Meanwhile we enable all the SPIR-V backend extensions.
+  BackendArgs.push_back(TCArgs.MakeArgString("--spirv-ext=all"));
+  // TODO:
+  // - handle -Xspirv-translator option to avoid "argument unused during
+  // compilation" error
+  // - handle --spirv-ext=+<extension> and --spirv-ext=-<extension> options
 }
 
 // Utility function to gather all llvm-spirv options.
diff --git a/clang/test/Driver/sycl-linker-wrapper-image.cpp b/clang/test/Driver/sycl-linker-wrapper-image.cpp
index fb81abd4b7ef0..8c88047e9e649 100644
--- a/clang/test/Driver/sycl-linker-wrapper-image.cpp
+++ b/clang/test/Driver/sycl-linker-wrapper-image.cpp
@@ -53,12 +53,12 @@ int main() {
 // CHECK-DAG: @SYCL_PropSetName.3 = internal unnamed_addr constant [25 x i8] c"SYCL/device requirements\00"
 // CHECK-DAG: @SYCL_PropSetName.4 = internal unnamed_addr constant [22 x i8] c"SYCL/kernel param opt\00"
 // CHECK-DAG: @__sycl_offload_prop_sets_arr.5 = internal constant [3 x %_pi_device_binary_property_set_struct] [%_pi_device_binary_property_set_struct { ptr @SYCL_PropSetName, ptr @__sycl_offload_prop_sets_arr, ptr getelementptr ([1 x %_pi_device_binary_property_struct], ptr @__sycl_offload_prop_sets_arr, i64 0, i64 1) }, %_pi_device_binary_property_set_struct { ptr @SYCL_PropSetName.3, ptr @__sycl_offload_prop_sets_arr.2, ptr getelementptr ([1 x %_pi_device_binary_property_struct], ptr @__sycl_offload_prop_sets_arr.2, i64 0, i64 1) }, %_pi_device_binary_property_set_struct { ptr @SYCL_PropSetName.4, ptr null, ptr null }]
-// CHECK-DAG: @.sycl_offloading.0.data = internal unnamed_addr constant [772 x i8] 
+// CHECK-DAG: @.sycl_offloading.0.data = internal unnamed_addr constant [912 x i8]
 // CHECK-DAG: @__sycl_offload_entry_name = internal unnamed_addr constant [25 x i8] c"_ZTSZ4mainE11fake_kernel\00"
 // CHECK-DAG: @__sycl_offload_entries_arr = internal constant [1 x %struct.__tgt_offload_entry] [%struct.__tgt_offload_entry { ptr null, ptr @__sycl_offload_entry_name, i64 0, i32 0, i32 0 }]
-// CHECK-DAG: @.sycl_offloading.0.info = internal local_unnamed_addr constant [2 x i64] [i64 ptrtoint (ptr @.sycl_offloading.0.data to i64), i64 772], section ".tgtimg", align 16
+// CHECK-DAG: @.sycl_offloading.0.info = internal local_unnamed_addr constant [2 x i64] [i64 ptrtoint (ptr @.sycl_offloading.0.data to i64), i64 912], section ".tgtimg", align 16
 // CHECK-DAG: @llvm.used = appending global [1 x ptr] [ptr @.sycl_offloading.0.info], section "llvm.metadata"
-// CHECK-DAG: @.sycl_offloading.device_images = internal unnamed_addr constant [1 x %__sycl.tgt_device_image] [%__sycl.tgt_device_image { i16 2, i8 4, i8 0, ptr @.sycl_offloading.target.0, ptr @.sycl_offloading.opts.compile.0, ptr @.sycl_offloading.opts.link.0, ptr null, ptr null, ptr @.sycl_offloading.0.data, ptr getelementptr ([772 x i8], ptr @.sycl_offloading.0.data, i64 0, i64 772), ptr @__sycl_offload_entries_arr, ptr getelementptr ([1 x %struct.__tgt_offload_entry], ptr @__sycl_offload_entries_arr, i64 0, i64 1), ptr @__sycl_offload_prop_sets_arr.5, ptr getelementptr ([3 x %_pi_device_binary_property_set_struct], ptr @__sycl_offload_prop_sets_arr.5, i64 0, i64 3) }]
+// CHECK-DAG: @.sycl_offloading.device_images = internal unnamed_addr constant [1 x %__sycl.tgt_device_image] [%__sycl.tgt_device_image { i16 2, i8 4, i8 0, ptr @.sycl_offloading.target.0, ptr @.sycl_offloading.opts.compile.0, ptr @.sycl_offloading.opts.link.0, ptr null, ptr null, ptr @.sycl_offloading.0.data, ptr getelementptr ([912 x i8], ptr @.sycl_offloading.0.data, i64 0, i64 912), ptr @__sycl_offload_entries_arr, ptr getelementptr ([1 x %struct.__tgt_offload_entry], ptr @__sycl_offload_entries_arr, i64 0, i64 1), ptr @__sycl_offload_prop_sets_arr.5, ptr getelementptr ([3 x %_pi_device_binary_property_set_struct], ptr @__sycl_offload_prop_sets_arr.5, i64 0, i64 3) }]
 // CHECK-DAG: @.sycl_offloading.descriptor = internal constant %__sycl.tgt_bin_desc { i16 1, i16 1, ptr @.sycl_offloading.device_images, ptr null, ptr null }
 // CHECK-DAG: @llvm.global_ctors = {{.*}} { i32 1, ptr @sycl.descriptor_reg, ptr null }]
 // CHECK-DAG: @llvm.global_dtors = {{.*}} { i32 1, ptr @sycl.descriptor_unreg, ptr null }]
diff --git a/clang/test/Driver/sycl-spirv-backend.cpp b/clang/test/Driver/sycl-spirv-backend.cpp
index db159f6deafb5..7697c1055b3d2 100644
--- a/clang/test/Driver/sycl-spirv-backend.cpp
+++ b/clang/test/Driver/sycl-spirv-backend.cpp
@@ -3,4 +3,4 @@
 ///
 // RUN: %clangxx -fsycl -fsycl-use-spirv-backend-for-spirv-gen -### %s 2>&1 | FileCheck %s
 
-// CHECK: llc{{.*}} "-filetype=obj" "-mtriple=spirv64-unknown-unknown" "-O0" "--avoid-spirv-capabilities=Shader" "--translator-compatibility-mode" "--spirv-ext=
+// CHECK: llc{{.*}} "-filetype=obj" "-mtriple=spirv64{{[^-]*}}-unknown-unknown" "--avoid-spirv-capabilities=Shader" "--translator-compatibility-mode" "--spirv-ext=
diff --git a/devops/bandit.config b/devops/bandit.config
index 49de9695a878b..4e501feef37ef 100644
--- a/devops/bandit.config
+++ b/devops/bandit.config
@@ -40,7 +40,6 @@
 # B317 : xml_bad_sax
 # B318 : xml_bad_minidom
 # B319 : xml_bad_pulldom
-# B320 : xml_bad_etree
 # B321 : ftplib
 # B323 : unverified_context
 # B324 : hashlib_new_insecure_functions
@@ -53,7 +52,6 @@
 # B407 : import_xml_expat
 # B408 : import_xml_minidom
 # B409 : import_xml_pulldom
-# B410 : import_lxml
 # B411 : import_xmlrpclib
 # B412 : import_httpoxy
 # B413 : import_pycrypto
@@ -83,7 +81,7 @@
 # IPAS Required Checkers. Do not disable these
 # Additional checkers may be added if desired
 tests:
-  [ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B320', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B410', 'B411', 'B412', 'B413']
+  [ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B411', 'B412', 'B413']
 
 # (optional) list skipped test IDs here, eg '[B101, B406]':
 # The following checkers are not required but be added to tests list if desired
diff --git a/devops/containers/ubuntu2404_base.Dockerfile b/devops/containers/ubuntu2404_base.Dockerfile
index 7af9ccfec1e5f..3cdad5b74366e 100644
--- a/devops/containers/ubuntu2404_base.Dockerfile
+++ b/devops/containers/ubuntu2404_base.Dockerfile
@@ -8,6 +8,13 @@ USER root
 COPY scripts/install_build_tools.sh /install.sh
 RUN /install.sh
 
+# libzstd-dev installed by default on Ubuntu 24.04 is not compiled with -fPIC flag.
+# This causes linking errors when building SYCL runtime.
+# Bug: https://github.com/intel/llvm/issues/15935
+# Workaround: build zstd from sources with -fPIC flag.
+COPY scripts/build_zstd_1_5_6_ub24.sh /build_zstd_1_5_6_ub24.sh
+RUN /build_zstd_1_5_6_ub24.sh
+
 COPY scripts/create-sycl-user.sh /user-setup.sh
 RUN /user-setup.sh
 
diff --git a/devops/containers/ubuntu2404_build.Dockerfile b/devops/containers/ubuntu2404_build.Dockerfile
index 5391030df006a..c659eabbced51 100644
--- a/devops/containers/ubuntu2404_build.Dockerfile
+++ b/devops/containers/ubuntu2404_build.Dockerfile
@@ -8,6 +8,13 @@ USER root
 COPY scripts/install_build_tools.sh /install.sh
 RUN /install.sh
 
+# libzstd-dev installed by default on Ubuntu 24.04 is not compiled with -fPIC flag.
+# This causes linking errors when building SYCL runtime.
+# Bug: https://github.com/intel/llvm/issues/15935
+# Workaround: build zstd from sources with -fPIC flag.
+COPY scripts/build_zstd_1_5_6_ub24.sh /build_zstd_1_5_6_ub24.sh
+RUN /build_zstd_1_5_6_ub24.sh
+
 SHELL ["/bin/bash", "-ec"]
 
 # Make the directory if it doesn't exist yet.
diff --git a/devops/dependencies-igc-dev.json b/devops/dependencies-igc-dev.json
index 28cc58e1c7947..20d3c7c9aa08b 100644
--- a/devops/dependencies-igc-dev.json
+++ b/devops/dependencies-igc-dev.json
@@ -1,10 +1,10 @@
 {
   "linux": {
     "igc_dev": {
-      "github_tag": "igc-dev-e0d826a",
-      "version": "e0d826a",
-      "updated_at": "2024-12-17T21:18:30Z",
-      "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/2327583926/zip",
+      "github_tag": "igc-dev-97b3d8f",
+      "version": "97b3d8f",
+      "updated_at": "2025-01-08T17:43:30Z",
+      "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/2403247641/zip",
       "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"
     }
   }
diff --git a/devops/dependencies.json b/devops/dependencies.json
index f5976861016e0..79892387df4c1 100644
--- a/devops/dependencies.json
+++ b/devops/dependencies.json
@@ -1,15 +1,15 @@
 {
   "linux": {
     "compute_runtime": {
-      "github_tag": "24.48.31907.7",
-      "version": "24.48.31907.7",
-      "url": "https://github.com/intel/compute-runtime/releases/tag/24.48.31907.7",
+      "github_tag": "24.52.32224.5",
+      "version": "24.52.32224.5",
+      "url": "https://github.com/intel/compute-runtime/releases/tag/24.52.32224.5",
       "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"
     },
     "igc": {
-      "github_tag": "v2.2.3",
-      "version": "2.2.3",
-      "url": "https://github.com/intel/intel-graphics-compiler/releases/tag/v2.2.3",
+      "github_tag": "v2.5.6",
+      "version": "2.5.6",
+      "url": "https://github.com/intel/intel-graphics-compiler/releases/tag/v2.5.6",
       "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"
     },
     "cm": {
diff --git a/devops/scripts/build_zstd_1_5_6_ub24.sh b/devops/scripts/build_zstd_1_5_6_ub24.sh
new file mode 100755
index 0000000000000..68a947dfb43a0
--- /dev/null
+++ b/devops/scripts/build_zstd_1_5_6_ub24.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+
+# Script to build and install zstd 1.5.6 on Ubuntu 24, with -fPIC flag.
+# The default installation of zstd on Ubuntu 24 does not have -fPIC flag
+# enabled, which is required for building DPC++ in shared libraries mode.
+
+# Function to check if the OS is Ubuntu 24
+check_os() {
+    . /etc/os-release
+    if [[ "$NAME" != "Ubuntu" || "$VERSION_ID" != "24.04" ]]; then
+        echo "Warning: This script has only been tested with Ubuntu 24."
+    fi
+}
+
+# Function to install packages with or without sudo
+install_packages() {
+    if [ "$USE_SUDO" = true ]; then
+        sudo apt-get update
+        sudo apt-get install -y build-essential wget
+    else
+        apt-get update
+        apt-get install -y build-essential wget
+    fi
+}
+
+# Function to uninstall libzstd-dev if installed
+uninstall_libzstd_dev() {
+    if dpkg -l | grep -q libzstd-dev; then
+        if [ "$USE_SUDO" = true ]; then
+            sudo apt-get remove -y libzstd-dev
+        else
+            apt-get remove -y libzstd-dev
+        fi
+    fi
+}
+
+# Function to build a shared library by linking zstd static lib.
+# This is used to verify that zstd is built correctly, with -fPIC flag.
+build_test_program() {
+    cat <<EOF > test_zstd.c
+      #include <zstd.h>
+      int main() {
+        ZSTD_CCtx* cctx = ZSTD_createCCtx();
+        ZSTD_freeCCtx(cctx);
+        return 0;
+      }
+EOF
+
+    # Link the static archive explicitly: plain -lzstd would pick libzstd.so,
+    # which links fine even when libzstd.a was built without -fPIC.
+    if gcc test_zstd.c -l:libzstd.a -fPIC -shared; then
+        echo "zstd installation verification passed."
+    else
+        echo "zstd installation verification failed."
+    fi
+
+    # There won't be a.out file if verification failed.
+    rm -f test_zstd.c a.out
+}
+
+# Check the OS
+check_os
+
+# Set USE_SUDO to true or false based on your preference
+USE_SUDO=true
+
+# Install necessary build tools
+install_packages
+
+# Uninstall libzstd-dev package if installed
+uninstall_libzstd_dev
+
+# Define the version and URL for zstd
+ZSTD_VERSION="1.5.6"
+ZSTD_URL="https://github.com/facebook/zstd/releases/download/v$ZSTD_VERSION/zstd-$ZSTD_VERSION.tar.gz"
+
+# Create a directory for the source code
+mkdir -p zstd_build
+cd zstd_build
+
+# Download and extract zstd source code
+wget $ZSTD_URL
+tar -xzf zstd-$ZSTD_VERSION.tar.gz
+cd zstd-$ZSTD_VERSION
+
+# Build zstd with -fPIC flag.
+CFLAGS="-fPIC" CXXFLAGS="-fPIC" make
+if [ $? -ne 0 ]; then
+    echo "Error: make failed."
+    exit 1
+fi
+
+# Install zstd.
+if [ "$USE_SUDO" = true ]; then
+    sudo make install
+else
+    make install
+fi
+if [ $? -ne 0 ]; then
+    echo "Error: make install failed."
+    exit 1
+fi
+
+# Verify zstd installation.
+build_test_program
+
+# Clean up: leave zstd_build/zstd-$ZSTD_VERSION first, else the relative rm is a no-op.
+cd ../.. && rm -rf zstd_build
diff --git a/devops/scripts/install_build_tools.sh b/devops/scripts/install_build_tools.sh
index 37d9761751ebb..37e2c7e15ac4b 100755
--- a/devops/scripts/install_build_tools.sh
+++ b/devops/scripts/install_build_tools.sh
@@ -24,5 +24,5 @@ apt update && apt install -yqq \
       jq \
       curl \
       libhwloc-dev \
-      libzstd-dev
-
+      libzstd-dev \
+      time
diff --git a/devops/scripts/install_drivers.sh b/devops/scripts/install_drivers.sh
index 570f78091d9f2..01c2dde54d6d0 100755
--- a/devops/scripts/install_drivers.sh
+++ b/devops/scripts/install_drivers.sh
@@ -140,7 +140,7 @@ InstallIGFX () {
   get_release oneapi-src/level-zero $L0_TAG \
     | grep ".*$UBUNTU_VER.*deb" \
     | wget -qi -
-  dpkg -i --force-overwrite *.deb && rm *.deb *.sum
+  dpkg -i --force-all *.deb && rm *.deb *.sum
   mkdir -p /usr/local/lib/igc/
   echo "$IGC_TAG" > /usr/local/lib/igc/IGCTAG.txt
   if [ "$IS_IGC_DEV" == "Yes" ]; then
@@ -149,21 +149,21 @@ InstallIGFX () {
     # Backup and install it from release igc as a temporarily workaround
     # while we working to resolve the issue.
     echo "Backup libopencl-clang"
-    cp -d /usr/local/lib/libopencl-clang.so.14*  .
+    cp -d /usr/local/lib/libopencl-clang2.so.14*  .
     echo "Download IGC dev git hash $IGC_DEV_VER"
     get_pre_release_igfx $IGC_DEV_URL $IGC_DEV_VER
     echo "Install IGC dev git hash $IGC_DEV_VER"
     # New dev IGC packaged iga64 conflicting with iga64 from intel-igc-media
     # force overwrite to workaround it first.
-    dpkg -i --force-overwrite *.deb
+    dpkg -i --force-all *.deb
     echo "Install libopencl-clang"
     # Workaround only, will download deb and install with dpkg once fixed.
-    cp -d libopencl-clang.so.14*  /usr/local/lib/
+    cp -d libopencl-clang2.so.14*  /usr/local/lib/
     rm /usr/local/lib/libigc.so /usr/local/lib/libigc.so.1* && \
        ln -s /usr/local/lib/libigc.so.2 /usr/local/lib/libigc.so && \
        ln -s /usr/local/lib/libigc.so.2 /usr/local/lib/libigc.so.1
     echo "Clean up"
-    rm *.deb libopencl-clang.so.14*
+    rm *.deb libopencl-clang2.so.14*
     echo "$IGC_DEV_TAG" > /usr/local/lib/igc/IGCTAG.txt
   fi
 }
diff --git a/devops/scripts/update_drivers.py b/devops/scripts/update_drivers.py
index 4c3cbb791c851..e9b14f87d5572 100644
--- a/devops/scripts/update_drivers.py
+++ b/devops/scripts/update_drivers.py
@@ -48,16 +48,18 @@ def uplift_linux_igfx_driver(config, platform_tag, igc_dev_only):
     config[platform_tag]['compute_runtime']['version'] = compute_runtime['tag_name']
     config[platform_tag]['compute_runtime']['url'] = 'https://github.com/intel/compute-runtime/releases/tag/' + compute_runtime['tag_name']
 
-    for a in compute_runtime['assets']:
-        if a['name'].endswith('.sum'):
-            deps = str(urlopen(a['browser_download_url']).read())
-            m = re.search(r"intel-igc-core_([0-9\.]*)_amd64", deps)
-            if m is not None:
-                ver = m.group(1)
-                config[platform_tag]['igc']['github_tag'] = 'igc-' + ver
-                config[platform_tag]['igc']['version'] = ver
-                config[platform_tag]['igc']['url'] = 'https://github.com/intel/intel-graphics-compiler/releases/tag/igc-' + ver
-                break
+    m = re.search(
+        re.escape("https://github.com/intel/intel-graphics-compiler/releases/tag/")
+        + r"(v[\.0-9]+)",
+        compute_runtime["body"],
+    )
+    if m is not None:
+        ver = m.group(1)
+        config[platform_tag]["igc"]["github_tag"] = ver
+        config[platform_tag]["igc"]["version"] = ver
+        config[platform_tag]["igc"]["url"] = (
+            "https://github.com/intel/intel-graphics-compiler/releases/tag/" + ver
+        )
 
     cm = get_latest_release('intel/cm-compiler')
     config[platform_tag]['cm']['github_tag'] = cm['tag_name']
diff --git a/libclc/clc/include/clc/clcmacro.h b/libclc/clc/include/clc/clcmacro.h
index 4b9b76d33e393..6f4782725d514 100644
--- a/libclc/clc/include/clc/clcmacro.h
+++ b/libclc/clc/include/clc/clcmacro.h
@@ -247,6 +247,8 @@
   }                                                                            \
   _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, FUNCTION, half, half)
 
+#pragma OPENCL EXTENSION cl_khr_fp16 : disable
+
 #else
 
 #define _CLC_DEFINE_UNARY_BUILTIN_FP16(FUNCTION)
diff --git a/libclc/clspv/lib/math/fma.cl b/libclc/clspv/lib/math/fma.cl
index 556bd837a27a7..e6251db4e92db 100644
--- a/libclc/clspv/lib/math/fma.cl
+++ b/libclc/clspv/lib/math/fma.cl
@@ -269,3 +269,14 @@ _CLC_DEF _CLC_OVERLOAD float fma(float a, float b, float c) {
                   ((uint)st_fma.mantissa.lo & 0x7fffff));
 }
 _CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, fma, float, float, float)
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEF _CLC_OVERLOAD half fma(half a, half b, half c) {
+  return (half)mad((float)a, (float)b, (float)c);
+}
+_CLC_TERNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, half, fma, half, half, half)
+
+#endif
diff --git a/libclc/generic/gen_convert_common.py b/libclc/generic/gen_convert_common.py
index d4399adda10d9..98cd83fc1a572 100644
--- a/libclc/generic/gen_convert_common.py
+++ b/libclc/generic/gen_convert_common.py
@@ -47,25 +47,31 @@
                'half'  : 2, 'float'  : 4,
                'double': 8}
 
-limit_max = {'char'  : 'CHAR_MAX',
-             'schar' : 'CHAR_MAX',
-             'uchar' : 'UCHAR_MAX',
-             'short' : 'SHRT_MAX',
-             'ushort': 'USHRT_MAX',
-             'int'   : 'INT_MAX',
-             'uint'  : 'UINT_MAX',
-             'long'  : 'LONG_MAX',
-             'ulong' : 'ULONG_MAX'}
+limit_max = {
+    "char": "CHAR_MAX",
+    "schar": "CHAR_MAX",
+    "uchar": "UCHAR_MAX",
+    "short": "SHRT_MAX",
+    "ushort": "USHRT_MAX",
+    "int": "INT_MAX",
+    "uint": "UINT_MAX",
+    "long": "LONG_MAX",
+    "ulong": "ULONG_MAX",
+    "half": "0x1.ffcp+15",
+}
 
-limit_min = {'char'  : 'CHAR_MIN',
-             'schar' : 'CHAR_MIN',
-             'uchar' : '0',
-             'short' : 'SHRT_MIN',
-             'ushort': '0',
-             'int'   : 'INT_MIN',
-             'uint'  : '0',
-             'long'  : 'LONG_MIN',
-             'ulong' : '0'}
+limit_min = {
+    "char": "CHAR_MIN",
+    "schar": "CHAR_MIN",
+    "uchar": "0",
+    "short": "SHRT_MIN",
+    "ushort": "0",
+    "int": "INT_MIN",
+    "uint": "0",
+    "long": "LONG_MIN",
+    "ulong": "0",
+    "half": "-0x1.ffcp+15",
+}
 
 
 def conditional_guard(src, dst):
diff --git a/libclc/generic/include/clc/convert.h b/libclc/generic/include/clc/convert.h
index eac4f4216ee43..687a685e70534 100644
--- a/libclc/generic/include/clc/convert.h
+++ b/libclc/generic/include/clc/convert.h
@@ -23,10 +23,19 @@
   _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, ulong, SUFFIX) \
   _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, float, SUFFIX)
 
-#ifdef cl_khr_fp64
+#if defined(cl_khr_fp64) && defined(cl_khr_fp16)
+#define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX)                            \
+  _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX)                                 \
+  _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, double, SUFFIX)                          \
+  _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, half, SUFFIX)
+#elif defined(cl_khr_fp64)
 #define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \
   _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \
   _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, double, SUFFIX)
+#elif defined(cl_khr_fp16)
+#define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX)                            \
+  _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX)                                 \
+  _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, half, SUFFIX)
 #else
 #define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \
   _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX)
@@ -43,11 +52,20 @@
   _CLC_VECTOR_CONVERT_FROM(ulong, SUFFIX) \
   _CLC_VECTOR_CONVERT_FROM(float, SUFFIX)
 
-#ifdef cl_khr_fp64
+#if defined(cl_khr_fp64) && defined(cl_khr_fp16)
+#define _CLC_VECTOR_CONVERT_TO(SUFFIX)                                         \
+  _CLC_VECTOR_CONVERT_TO1(SUFFIX)                                              \
+  _CLC_VECTOR_CONVERT_FROM(double, SUFFIX)                                     \
+  _CLC_VECTOR_CONVERT_FROM(half, SUFFIX)
+#elif defined(cl_khr_fp64)
 #define _CLC_VECTOR_CONVERT_TO(SUFFIX) \
   _CLC_VECTOR_CONVERT_TO1(SUFFIX) \
   _CLC_VECTOR_CONVERT_FROM(double, SUFFIX)
+#elif defined(cl_khr_fp16)
+#define _CLC_VECTOR_CONVERT_TO(SUFFIX)                                         \
+  _CLC_VECTOR_CONVERT_TO1(SUFFIX)                                              \
+  _CLC_VECTOR_CONVERT_FROM(half, SUFFIX)
 #else
 #define _CLC_VECTOR_CONVERT_TO(SUFFIX) \
   _CLC_VECTOR_CONVERT_TO1(SUFFIX)
 #endif
diff --git a/libclc/generic/lib/gen_convert.py b/libclc/generic/lib/gen_convert.py
index a1220fd3c2664..38817a3fe4ed7 100644
--- a/libclc/generic/lib/gen_convert.py
+++ b/libclc/generic/lib/gen_convert.py
@@ -65,21 +65,21 @@
     "uint",
     "long",
     "ulong",
+    "half",
     "float",
     "double",
 ]
 int_types = ["char", "uchar", "short", "ushort", "int", "uint", "long", "ulong"]
 unsigned_types = ["uchar", "ushort", "uint", "ulong"]
-float_types = ["float", "double"]
+float_types = ["half", "float", "double"]
 int64_types = ["long", "ulong"]
 float64_types = ["double"]
+float16_types = ["half"]
 vector_sizes = ["", "2", "3", "4", "8", "16"]
 half_sizes = [("2", ""), ("4", "2"), ("8", "4"), ("16", "8")]
 
 saturation = ["", "_sat"]
 rounding_modes = ["_rtz", "_rte", "_rtp", "_rtn"]
-float_prefix = {"float": "FLT_", "double": "DBL_"}
-float_suffix = {"float": "f", "double": ""}
 
 bool_type = {
     "char": "char",
@@ -90,6 +90,7 @@
     "uint": "int",
     "long": "long",
     "ulong": "long",
+    "half": "short",
     "float": "int",
     "double": "long",
 }
@@ -114,6 +115,7 @@
     "uint": 4,
     "long": 8,
     "ulong": 8,
+    "half": 2,
     "float": 4,
     "double": 8,
 }
@@ -127,6 +129,7 @@
     "uint": "UINT_MAX",
     "long": "LONG_MAX",
     "ulong": "ULONG_MAX",
+    "half": "0x1.ffcp+15",
 }
 
 limit_min = {
@@ -138,24 +141,37 @@
     "uint": "0",
     "long": "LONG_MIN",
     "ulong": "0",
+    "half": "-0x1.ffcp+15",
 }
 
 
 def conditional_guard(src, dst):
     int64_count = 0
     float64_count = 0
+    float16_count = 0
     if src in int64_types:
         int64_count = int64_count + 1
     elif src in float64_types:
         float64_count = float64_count + 1
+    elif src in float16_types:
+        float16_count = float16_count + 1
     if dst in int64_types:
         int64_count = int64_count + 1
     elif dst in float64_types:
         float64_count = float64_count + 1
+    elif dst in float16_types:
+        float16_count = float16_count + 1
+    if float64_count > 0 and float16_count > 0:
+        # half <-> double conversions use both types, so need both extensions.
+        print("#if defined(cl_khr_fp16) && defined(cl_khr_fp64)")
+        return True
     if float64_count > 0:
         # In embedded profile, if cl_khr_fp64 is supported cles_khr_int64 has to be
         print("#ifdef cl_khr_fp64")
         return True
+    elif float16_count > 0:
+        print("#if defined cl_khr_fp16")
+        return True
     elif int64_count > 0:
         print("#if defined cles_khr_int64 || !defined(__EMBEDDED_PROFILE__)")
         return True
@@ -198,6 +214,10 @@ def conditional_guard(src, dst):
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif
 
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
diff --git a/libclc/generic/lib/math/acos.cl b/libclc/generic/lib/math/acos.cl
index e7ceaa14c3a38..d71d10024b180 100644
--- a/libclc/generic/lib/math/acos.cl
+++ b/libclc/generic/lib/math/acos.cl
@@ -171,3 +171,11 @@ _CLC_OVERLOAD _CLC_DEF double acos(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, acos, double);
 
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(acos)
+
+#endif
diff --git a/libclc/generic/lib/math/acosh.cl b/libclc/generic/lib/math/acosh.cl
index e433b133ebb76..977c2e929b34c 100644
--- a/libclc/generic/lib/math/acosh.cl
+++ b/libclc/generic/lib/math/acosh.cl
@@ -125,3 +125,11 @@ _CLC_OVERLOAD _CLC_DEF double acosh(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, acosh, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(acosh)
+
+#endif
diff --git a/libclc/generic/lib/math/acospi.cl b/libclc/generic/lib/math/acospi.cl
index 753ee1cc3687f..5aa8a083df4e9 100644
--- a/libclc/generic/lib/math/acospi.cl
+++ b/libclc/generic/lib/math/acospi.cl
@@ -170,3 +170,11 @@ _CLC_OVERLOAD _CLC_DEF double acospi(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, acospi, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(acospi)
+
+#endif
diff --git a/libclc/generic/lib/math/asinh.cl b/libclc/generic/lib/math/asinh.cl
index 8fa118d77899c..13963b2d4d9ca 100644
--- a/libclc/generic/lib/math/asinh.cl
+++ b/libclc/generic/lib/math/asinh.cl
@@ -291,3 +291,11 @@ _CLC_OVERLOAD _CLC_DEF double asinh(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, asinh, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(asinh)
+
+#endif
diff --git a/libclc/generic/lib/math/atan.cl b/libclc/generic/lib/math/atan.cl
index a07019751a118..21b24a25a5210 100644
--- a/libclc/generic/lib/math/atan.cl
+++ b/libclc/generic/lib/math/atan.cl
@@ -181,3 +181,12 @@ _CLC_OVERLOAD _CLC_DEF double atan(double x)
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan, double);
 
 #endif // cl_khr_fp64
+
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(atan)
+
+#endif // cl_khr_fp16
diff --git a/libclc/generic/lib/math/atan2.cl b/libclc/generic/lib/math/atan2.cl
index d8b209b27e696..fd57a492ed414 100644
--- a/libclc/generic/lib/math/atan2.cl
+++ b/libclc/generic/lib/math/atan2.cl
@@ -235,3 +235,11 @@ _CLC_OVERLOAD _CLC_DEF double atan2(double y, double x)
 _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan2, double, double);
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_BINARY_BUILTIN_FP16(atan2)
+
+#endif
diff --git a/libclc/generic/lib/math/atan2pi.cl b/libclc/generic/lib/math/atan2pi.cl
index a6b7a7eadbb97..19ab7346bb70d 100644
--- a/libclc/generic/lib/math/atan2pi.cl
+++ b/libclc/generic/lib/math/atan2pi.cl
@@ -219,3 +219,11 @@ _CLC_OVERLOAD _CLC_DEF double atan2pi(double y, double x) {
 _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan2pi, double, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_BINARY_BUILTIN_FP16(atan2pi)
+
+#endif
diff --git a/libclc/generic/lib/math/atanh.cl b/libclc/generic/lib/math/atanh.cl
index de1c3bf5f2fca..10bad190cc0dc 100644
--- a/libclc/generic/lib/math/atanh.cl
+++ b/libclc/generic/lib/math/atanh.cl
@@ -111,3 +111,11 @@ _CLC_OVERLOAD _CLC_DEF double atanh(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atanh, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(atanh)
+
+#endif
diff --git a/libclc/generic/lib/math/atanpi.cl b/libclc/generic/lib/math/atanpi.cl
index 5df1e04e16492..8522acf349933 100644
--- a/libclc/generic/lib/math/atanpi.cl
+++ b/libclc/generic/lib/math/atanpi.cl
@@ -180,3 +180,11 @@ _CLC_OVERLOAD _CLC_DEF double atanpi(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atanpi, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(atanpi)
+
+#endif
diff --git a/libclc/generic/lib/math/cbrt.cl b/libclc/generic/lib/math/cbrt.cl
index f5a9068600c92..76ba2a7697121 100644
--- a/libclc/generic/lib/math/cbrt.cl
+++ b/libclc/generic/lib/math/cbrt.cl
@@ -149,3 +149,11 @@ _CLC_OVERLOAD _CLC_DEF double cbrt(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cbrt, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(cbrt)
+
+#endif
diff --git a/libclc/generic/lib/math/clc_rootn.cl b/libclc/generic/lib/math/clc_rootn.cl
index 4c76f23b9a4c8..eee9c9fcaa2d4 100644
--- a/libclc/generic/lib/math/clc_rootn.cl
+++ b/libclc/generic/lib/math/clc_rootn.cl
@@ -369,3 +369,15 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny)
 }
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_rootn, double, int)
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_OVERLOAD _CLC_DEF half __clc_rootn(half x, int y) {
+    return (half)__clc_rootn((float)x, y);
+}
+
+_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_rootn, half, int);
+
+#endif
diff --git a/libclc/generic/lib/math/clc_sw_binary.inc b/libclc/generic/lib/math/clc_sw_binary.inc
index 2005d1da66441..b701d78878c6a 100644
--- a/libclc/generic/lib/math/clc_sw_binary.inc
+++ b/libclc/generic/lib/math/clc_sw_binary.inc
@@ -2,11 +2,25 @@
 
 #define __CLC_SW_FUNC(x) __CLC_CONCAT(__clc_, x)
 
-// TODO: Enable half precision when the sw routine is implemented
 #if __CLC_FPSIZE > 16
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x, __CLC_GENTYPE y) {
   return __CLC_SW_FUNC(__CLC_FUNC)(x, y);
 }
+#elif __CLC_FPSIZE == 16
+#ifdef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x,
+                                                __CLC_GENTYPE y) {
+  return convert_half(
+      __CLC_SW_FUNC(__CLC_FUNC)(convert_float(x), convert_float(y)));
+}
+#else
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x,
+                                                __CLC_GENTYPE y) {
+  return __CLC_XCONCAT(convert_half, __CLC_VECSIZE)(__CLC_SW_FUNC(__CLC_FUNC)(
+      __CLC_XCONCAT(convert_float, __CLC_VECSIZE)(x),
+      __CLC_XCONCAT(convert_float, __CLC_VECSIZE)(y)));
+}
+#endif
 #endif
 
 #undef __CLC_SW_FUNC
diff --git a/libclc/generic/lib/math/clc_sw_unary.inc b/libclc/generic/lib/math/clc_sw_unary.inc
index 842e7545b19b9..8767a2b134d09 100644
--- a/libclc/generic/lib/math/clc_sw_unary.inc
+++ b/libclc/generic/lib/math/clc_sw_unary.inc
@@ -4,9 +4,19 @@
 #define __CLC_SW_FUNC __CLC_XCONCAT(__clc_, __CLC_FUNC)
 #endif
 
-// TODO: Enable half precision when the sw routine is implemented
 #if __CLC_FPSIZE > 16
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x) {
   return __CLC_SW_FUNC(x);
 }
+#elif __CLC_FPSIZE == 16
+#ifdef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x) {
+  return convert_half(__CLC_SW_FUNC(convert_float(x)));
+}
+#else
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x) {
+  return __CLC_XCONCAT(convert_half, __CLC_VECSIZE)(
+      __CLC_SW_FUNC(__CLC_XCONCAT(convert_float, __CLC_VECSIZE)(x)));
+}
+#endif
 #endif
diff --git a/libclc/generic/lib/math/cos.cl b/libclc/generic/lib/math/cos.cl
index 792eb9ac3f1b5..5e5d43c0990fd 100644
--- a/libclc/generic/lib/math/cos.cl
+++ b/libclc/generic/lib/math/cos.cl
@@ -42,3 +42,11 @@ _CLC_OVERLOAD _CLC_DEF double cos(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cos, double);
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(cos)
+
+#endif
diff --git a/libclc/generic/lib/math/cosh.cl b/libclc/generic/lib/math/cosh.cl
index 6f932d6a8adbf..84c0505090ec2 100644
--- a/libclc/generic/lib/math/cosh.cl
+++ b/libclc/generic/lib/math/cosh.cl
@@ -190,3 +190,11 @@ _CLC_OVERLOAD _CLC_DEF double cosh(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cosh, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(cosh)
+
+#endif
diff --git a/libclc/generic/lib/math/cospi.cl b/libclc/generic/lib/math/cospi.cl
index 5d1f6e238de49..9556cc04e4167 100644
--- a/libclc/generic/lib/math/cospi.cl
+++ b/libclc/generic/lib/math/cospi.cl
@@ -40,3 +40,11 @@ _CLC_OVERLOAD _CLC_DEF double cospi(double x) {
 }
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cospi, double);
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(cospi)
+
+#endif
diff --git a/libclc/generic/lib/math/exp.cl b/libclc/generic/lib/math/exp.cl
index 1cc4c98de4c09..b36cb0d575d3a 100644
--- a/libclc/generic/lib/math/exp.cl
+++ b/libclc/generic/lib/math/exp.cl
@@ -43,3 +43,11 @@ _CLC_OVERLOAD _CLC_DEF double exp(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, exp, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(exp)
+
+#endif
diff --git a/libclc/generic/lib/math/expm1.cl b/libclc/generic/lib/math/expm1.cl
index 0b8dbb6b66383..fc52c6f42484a 100644
--- a/libclc/generic/lib/math/expm1.cl
+++ b/libclc/generic/lib/math/expm1.cl
@@ -21,3 +21,11 @@ _CLC_OVERLOAD _CLC_DEF double expm1(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, expm1, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(expm1)
+
+#endif
diff --git a/libclc/generic/lib/math/fdim.inc b/libclc/generic/lib/math/fdim.inc
index 9aa3496b18902..98cbef6076667 100644
--- a/libclc/generic/lib/math/fdim.inc
+++ b/libclc/generic/lib/math/fdim.inc
@@ -69,3 +69,28 @@ __CLC_FDIM_VEC(16)
 #undef __CLC_FDIM_VEC
 #endif
 #endif
+
+#if __CLC_FPSIZE == 16
+#ifdef __CLC_SCALAR
+#define QNANBITPATT_FP16 ((short)0x7e00)
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fdim(__CLC_GENTYPE x,
+                                          private __CLC_GENTYPE y) {
+  short n = -(isnan(x) | isnan(y)) & QNANBITPATT_FP16;
+  short r = -(x > y) & as_short(x - y);
+  return as_half((short)(n | r));
+}
+#define __CLC_FDIM_VEC(width)                                                  \
+  _CLC_OVERLOAD _CLC_DEF half##width fdim(half##width x, half##width y) {      \
+    /* See comment in float implementation for explanation. */                 \
+    short##width n = ~((x == x) & (y == y)) & QNANBITPATT_FP16;                \
+    short##width r = (x > y) & as_short##width(x - y);                         \
+    return as_half##width(n | r);                                              \
+  }
+__CLC_FDIM_VEC(2)
+__CLC_FDIM_VEC(3)
+__CLC_FDIM_VEC(4)
+__CLC_FDIM_VEC(8)
+__CLC_FDIM_VEC(16)
+#undef __CLC_FDIM_VEC
+#endif
+#endif
diff --git a/libclc/generic/lib/math/frexp.inc b/libclc/generic/lib/math/frexp.inc
index ace8829c6269b..0d938d23c26a1 100644
--- a/libclc/generic/lib/math/frexp.inc
+++ b/libclc/generic/lib/math/frexp.inc
@@ -42,6 +42,17 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x, __CLC_ADDRESS_SPACE
 }
 #endif
 
+#if __CLC_FPSIZE == 16
+#ifdef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x,
+                                           __CLC_ADDRESS_SPACE __CLC_INTN *ep) {
+  return (__CLC_GENTYPE)frexp((float)x, ep);
+}
+_CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, __CLC_GENTYPE, frexp,
+                      __CLC_GENTYPE, __CLC_ADDRESS_SPACE, __CLC_INTN);
+#endif
+#endif
+
 #if __CLC_FPSIZE == 64
 #ifdef __CLC_SCALAR
 #define __CLC_AS_LONGN as_long
diff --git a/libclc/generic/lib/math/ilogb.cl b/libclc/generic/lib/math/ilogb.cl
index 39b82cfdc22cc..f16b4404fbebe 100644
--- a/libclc/generic/lib/math/ilogb.cl
+++ b/libclc/generic/lib/math/ilogb.cl
@@ -71,3 +71,15 @@ _CLC_OVERLOAD _CLC_DEF int ilogb(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, ilogb, double);
 
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_OVERLOAD _CLC_DEF int ilogb(half x) {
+    return ilogb((float)x); // widen to float and reuse that overload
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, ilogb, half);
+
+#endif // cl_khr_fp16
diff --git a/libclc/generic/lib/math/lgamma.cl b/libclc/generic/lib/math/lgamma.cl
index bf9aefc49c4e1..f0476230e63fe 100644
--- a/libclc/generic/lib/math/lgamma.cl
+++ b/libclc/generic/lib/math/lgamma.cl
@@ -41,4 +41,12 @@ _CLC_OVERLOAD _CLC_DEF double lgamma(double x) {
 
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, lgamma, double)
 
-#endif
\ No newline at end of file
+#endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(lgamma)
+
+#endif
diff --git a/libclc/generic/lib/math/lgamma_r.cl b/libclc/generic/lib/math/lgamma_r.cl
index c459e8a7d097d..49439eb0a95f4 100644
--- a/libclc/generic/lib/math/lgamma_r.cl
+++ b/libclc/generic/lib/math/lgamma_r.cl
@@ -486,6 +486,17 @@ _CLC_OVERLOAD _CLC_DEF double lgamma_r(double x, private int *ip) {
 _CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, lgamma_r, double, private, int)
 #endif
 
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_OVERLOAD _CLC_DEF half lgamma_r(half x, private int *iptr) {
+    return (half)lgamma_r((float)x, iptr);
+}
+
+_CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, lgamma_r, half, private, int);
+
+#endif
 
 #define __CLC_ADDRSPACE global
 #define __CLC_BODY <lgamma_r.inc>
diff --git a/libclc/generic/lib/math/lgamma_r.inc b/libclc/generic/lib/math/lgamma_r.inc
index 0e19ba8fb2c7c..8aa17fbe79bd8 100644
--- a/libclc/generic/lib/math/lgamma_r.inc
+++ b/libclc/generic/lib/math/lgamma_r.inc
@@ -21,12 +21,9 @@
  * THE SOFTWARE.
  */
 
-// TODO: Enable half precision when the base version is implemented.
-#if __CLC_FPSIZE > 16
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE lgamma_r(__CLC_GENTYPE x, __CLC_ADDRSPACE __CLC_INTN *iptr) {
     __CLC_INTN private_iptr;
     __CLC_GENTYPE ret = lgamma_r(x, &private_iptr);
     *iptr = private_iptr;
     return ret;
 }
-#endif
diff --git a/libclc/generic/lib/math/log10.cl b/libclc/generic/lib/math/log10.cl
index 4c338edee1d33..d7d35c0910e9a 100644
--- a/libclc/generic/lib/math/log10.cl
+++ b/libclc/generic/lib/math/log10.cl
@@ -29,6 +29,10 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif // cl_khr_fp64
 
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // cl_khr_fp16
+
 _CLC_OVERLOAD _CLC_DEF float log10(float x) {
     return __spirv_ocl_log10(x);
 }
@@ -42,3 +46,11 @@ _CLC_OVERLOAD _CLC_DEF double log10(double x) {
 
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log10, double);
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+_CLC_OVERLOAD _CLC_DEF half log10(half x) {
+    return __spirv_ocl_log10(x);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, log10, half);
+#endif // cl_khr_fp16
diff --git a/libclc/generic/lib/math/log1p.cl b/libclc/generic/lib/math/log1p.cl
index d4b8eef74bfe6..67f029a14431e 100644
--- a/libclc/generic/lib/math/log1p.cl
+++ b/libclc/generic/lib/math/log1p.cl
@@ -175,3 +175,11 @@ _CLC_OVERLOAD _CLC_DEF double log1p(double x)
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log1p, double);
 
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(log1p)
+
+#endif // cl_khr_fp16
diff --git a/libclc/generic/lib/math/log2.cl b/libclc/generic/lib/math/log2.cl
index 1cd2ebcddabf0..b91a13529412f 100644
--- a/libclc/generic/lib/math/log2.cl
+++ b/libclc/generic/lib/math/log2.cl
@@ -29,6 +29,10 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif // cl_khr_fp64
 
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // cl_khr_fp16
+
 _CLC_OVERLOAD _CLC_DEF float log2(float x) {
     return __spirv_ocl_log2(x);
 }
@@ -42,3 +46,11 @@ _CLC_OVERLOAD _CLC_DEF double log2(double x) {
 
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log2, double);
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+_CLC_OVERLOAD _CLC_DEF half log2(half x) {
+    return __spirv_ocl_log2(x);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, log2, half);
+#endif // cl_khr_fp16
diff --git a/libclc/generic/lib/math/log_base.h b/libclc/generic/lib/math/log_base.h
index 4e20329f641bb..b8110ca1779a2 100644
--- a/libclc/generic/lib/math/log_base.h
+++ b/libclc/generic/lib/math/log_base.h
@@ -295,3 +295,22 @@ log(double x)
 }
 
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+_CLC_OVERLOAD _CLC_DEF half
+#if defined(COMPILING_LOG2)
+log2(half x) {
+  return (half)log2((float)x);
+}
+#elif defined(COMPILING_LOG10)
+log10(half x) {
+  return (half)log10((float)x);
+}
+#else
+log(half x) {
+  return (half)log((float)x);
+}
+#endif
+
+#endif // cl_khr_fp16
diff --git a/libclc/generic/lib/math/logb.cl b/libclc/generic/lib/math/logb.cl
index e77088367dfbd..9683d68b52c3d 100644
--- a/libclc/generic/lib/math/logb.cl
+++ b/libclc/generic/lib/math/logb.cl
@@ -18,3 +18,11 @@ _CLC_OVERLOAD _CLC_DEF double logb(double x) {
 
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, logb, double)
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(logb)
+
+#endif
diff --git a/libclc/generic/lib/math/pown.inc b/libclc/generic/lib/math/pown.inc
index 2add2c7459de9..84729d90a796f 100644
--- a/libclc/generic/lib/math/pown.inc
+++ b/libclc/generic/lib/math/pown.inc
@@ -1,6 +1,3 @@
-// TODO: Enable half precision when the sw routine is implemented
-#if __CLC_FPSIZE > 16
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE pown(__CLC_GENTYPE x, __CLC_INTN y) {
   return __clc_pown(x, y);
 }
-#endif
diff --git a/libclc/generic/lib/math/remquo.inc b/libclc/generic/lib/math/remquo.inc
index 32bd41da37ddc..4c1133436b46d 100644
--- a/libclc/generic/lib/math/remquo.inc
+++ b/libclc/generic/lib/math/remquo.inc
@@ -1,9 +1,6 @@
-// TODO: Enable half precision when the sw routine is implemented
-#if __CLC_FPSIZE > 16
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE remquo(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_ADDRESS_SPACE __CLC_INTN *q) {
   private __CLC_INTN private_q;
   __CLC_GENTYPE ret = __clc_remquo(x, y, &private_q);
   *q = private_q;
   return ret;
 }
-#endif
diff --git a/libclc/generic/lib/math/rootn.inc b/libclc/generic/lib/math/rootn.inc
index f788649685ac9..3f5b00c082cd3 100644
--- a/libclc/generic/lib/math/rootn.inc
+++ b/libclc/generic/lib/math/rootn.inc
@@ -1,6 +1,3 @@
-// TODO: Enable half precision when the sw routine is implemented
-#if __CLC_FPSIZE > 16
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE rootn(__CLC_GENTYPE x, __CLC_INTN y) {
   return __clc_rootn(x, y);
 }
-#endif
diff --git a/libclc/generic/lib/math/sin.cl b/libclc/generic/lib/math/sin.cl
index 0ff24e0b21e1e..6a3299bda4073 100644
--- a/libclc/generic/lib/math/sin.cl
+++ b/libclc/generic/lib/math/sin.cl
@@ -42,3 +42,11 @@ _CLC_OVERLOAD _CLC_DEF double sin(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sin, double);
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(sin)
+
+#endif
diff --git a/libclc/generic/lib/math/sincos.inc b/libclc/generic/lib/math/sincos.inc
index 177e74e605c51..b5a35c21f81f0 100644
--- a/libclc/generic/lib/math/sincos.inc
+++ b/libclc/generic/lib/math/sincos.inc
@@ -1,5 +1,3 @@
-// TODO: Enable half precision when sin/cos is implemented
-#if __CLC_FPSIZE > 16
 #define __CLC_DECLARE_SINCOS(ADDRSPACE, TYPE) \
   _CLC_OVERLOAD _CLC_DEF TYPE sincos (TYPE x, ADDRSPACE TYPE * cosval) { \
     return __spirv_ocl_sincos(x, cosval); \
@@ -13,4 +11,3 @@ __CLC_DECLARE_SINCOS(generic, __CLC_GENTYPE)
 #endif
 
 #undef __CLC_DECLARE_SINCOS
-#endif
diff --git a/libclc/generic/lib/math/sinh.cl b/libclc/generic/lib/math/sinh.cl
index 742aa6124f6a4..48aa1ee756190 100644
--- a/libclc/generic/lib/math/sinh.cl
+++ b/libclc/generic/lib/math/sinh.cl
@@ -189,3 +189,11 @@ _CLC_OVERLOAD _CLC_DEF double sinh(double x)
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sinh, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(sinh)
+
+#endif
diff --git a/libclc/generic/lib/math/sinpi.cl b/libclc/generic/lib/math/sinpi.cl
index e26aaf3382c74..059f912507e66 100644
--- a/libclc/generic/lib/math/sinpi.cl
+++ b/libclc/generic/lib/math/sinpi.cl
@@ -43,3 +43,11 @@ _CLC_OVERLOAD _CLC_DEF double sinpi(double x)
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sinpi, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(sinpi)
+
+#endif
diff --git a/libclc/generic/lib/math/tanh.cl b/libclc/generic/lib/math/tanh.cl
index 95a07fe5ac6b4..d9509c57b0507 100644
--- a/libclc/generic/lib/math/tanh.cl
+++ b/libclc/generic/lib/math/tanh.cl
@@ -144,3 +144,11 @@ _CLC_OVERLOAD _CLC_DEF double tanh(double x)
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, tanh, double);
 
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN_FP16(tanh)
+
+#endif // cl_khr_fp16
diff --git a/libclc/libspirv/include/libspirv/conversion/GenericCastToPtrExplicit.h b/libclc/libspirv/include/libspirv/conversion/GenericCastToPtrExplicit.h
new file mode 100644
index 0000000000000..2e5b954696543
--- /dev/null
+++ b/libclc/libspirv/include/libspirv/conversion/GenericCastToPtrExplicit.h
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define GenericCastToPtrExplicit_To(ADDRSPACE, NAME)                           \
+  _CLC_DECL _CLC_OVERLOAD                                                      \
+      ADDRSPACE void *__spirv_GenericCastToPtrExplicit_To##NAME(               \
+          generic void *, int);                                                \
+  _CLC_DECL _CLC_OVERLOAD                                                      \
+      ADDRSPACE const void *__spirv_GenericCastToPtrExplicit_To##NAME(         \
+          generic const void *, int);                                          \
+  _CLC_DECL _CLC_OVERLOAD                                                      \
+      ADDRSPACE volatile void *__spirv_GenericCastToPtrExplicit_To##NAME(      \
+          generic volatile void *, int);                                       \
+  _CLC_DECL _CLC_OVERLOAD ADDRSPACE const volatile void *                      \
+      __spirv_GenericCastToPtrExplicit_To##NAME(generic const volatile void *, \
+                                                int)
+
+GenericCastToPtrExplicit_To(global, Global);
+GenericCastToPtrExplicit_To(local, Local);
+GenericCastToPtrExplicit_To(private, Private);
+
+#undef GenericCastToPtrExplicit_To
diff --git a/libclc/libspirv/include/libspirv/spirv.h b/libclc/libspirv/include/libspirv/spirv.h
index e926f7d8ff7a7..657ae6a220cf8 100644
--- a/libclc/libspirv/include/libspirv/spirv.h
+++ b/libclc/libspirv/include/libspirv/spirv.h
@@ -92,4 +92,7 @@
 #include <libspirv/image/image.h>
 #include <libspirv/image/image_defines.h>
 
+/* Pointer Conversion */
+#include <libspirv/conversion/GenericCastToPtrExplicit.h>
+
 #pragma OPENCL EXTENSION all : disable
diff --git a/libclc/libspirv/include/libspirv/spirv_builtins.h b/libclc/libspirv/include/libspirv/spirv_builtins.h
index 1b1aa983cc320..8a3f9070a1aee 100644
--- a/libclc/libspirv/include/libspirv/spirv_builtins.h
+++ b/libclc/libspirv/include/libspirv/spirv_builtins.h
@@ -16,6 +16,14 @@
 #ifndef CLC_SPIRV_BINDING
 #define CLC_SPIRV_BINDING
 
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
 _CLC_OVERLOAD
 _CLC_DECL _CLC_CONSTFN __clc_bool_t __spirv_All(__clc_vec2_char_t);
 _CLC_OVERLOAD
diff --git a/libclc/libspirv/lib/amdgcn-amdhsa/SOURCES b/libclc/libspirv/lib/amdgcn-amdhsa/SOURCES
index 08f39dd63c640..3665db09f6bd1 100644
--- a/libclc/libspirv/lib/amdgcn-amdhsa/SOURCES
+++ b/libclc/libspirv/lib/amdgcn-amdhsa/SOURCES
@@ -14,6 +14,7 @@ atomic/atomic_min.cl
 atomic/atomic_max.cl
 atomic/atomic_sub.cl
 atomic/atomic_store.cl
+conversion/GenericCastToPtrExplicit.cl
 synchronization/barrier.cl
 math/acos.cl
 math/acosh.cl
@@ -64,10 +65,8 @@ workitem/get_global_size.cl
 workitem/get_local_size.cl
 workitem/get_num_groups.cl
 workitem/get_max_sub_group_size.cl
-workitem/get_num_sub_groups.cl
 workitem/get_sub_group_id.cl
 workitem/get_sub_group_local_id.cl
-workitem/get_sub_group_size.cl
 misc/sub_group_shuffle.cl
 async/wait_group_events.cl
 assert/__assert_fail.ll
diff --git a/libclc/libspirv/lib/amdgcn-amdhsa/conversion/GenericCastToPtrExplicit.cl b/libclc/libspirv/lib/amdgcn-amdhsa/conversion/GenericCastToPtrExplicit.cl
new file mode 100644
index 0000000000000..5072ffdfcf268
--- /dev/null
+++ b/libclc/libspirv/lib/amdgcn-amdhsa/conversion/GenericCastToPtrExplicit.cl
@@ -0,0 +1,51 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <libspirv/spirv.h>
+
+// Predicates named so the macro below can paste __clc_amdgcn_is_##ADDRSPACE.
+_CLC_DEF static bool __clc_amdgcn_is_private(generic void *ptr) {
+  return __builtin_amdgcn_is_private(ptr);
+}
+_CLC_DEF static bool __clc_amdgcn_is_local(generic void *ptr) {
+  return __builtin_amdgcn_is_shared(ptr);
+}
+_CLC_DEF static bool __clc_amdgcn_is_global(generic void *ptr) {
+  return !__clc_amdgcn_is_private(ptr) && !__clc_amdgcn_is_local(ptr);
+}
+
+#define GenericCastToPtrExplicit_To(ADDRSPACE, NAME)                           \
+  _CLC_DECL _CLC_OVERLOAD                                                      \
+      ADDRSPACE void *__spirv_GenericCastToPtrExplicit_To##NAME(               \
+          generic void *ptr, int unused) {                                     \
+    if (__clc_amdgcn_is_##ADDRSPACE(ptr))                                      \
+      return (ADDRSPACE void *)ptr;                                            \
+    return 0;                                                                  \
+  }                                                                            \
+  _CLC_DECL _CLC_OVERLOAD                                                      \
+      ADDRSPACE const void *__spirv_GenericCastToPtrExplicit_To##NAME(         \
+          generic const void *ptr, int unused) {                               \
+    return __spirv_GenericCastToPtrExplicit_To##NAME((generic void *)ptr,      \
+                                                     unused);                  \
+  }                                                                            \
+  _CLC_DECL _CLC_OVERLOAD                                                      \
+      ADDRSPACE volatile void *__spirv_GenericCastToPtrExplicit_To##NAME(      \
+          generic volatile void *ptr, int unused) {                            \
+    return __spirv_GenericCastToPtrExplicit_To##NAME((generic void *)ptr,      \
+                                                     unused);                  \
+  }                                                                            \
+  _CLC_DECL _CLC_OVERLOAD ADDRSPACE const volatile void                        \
+      *__spirv_GenericCastToPtrExplicit_To##NAME(                              \
+          generic const volatile void *ptr, int unused) {                      \
+    return __spirv_GenericCastToPtrExplicit_To##NAME((generic void *)ptr,      \
+                                                     unused);                  \
+  }
+
+GenericCastToPtrExplicit_To(global, Global)
+GenericCastToPtrExplicit_To(local, Local)
+GenericCastToPtrExplicit_To(private, Private)
diff --git a/libclc/libspirv/lib/generic/SOURCES b/libclc/libspirv/lib/generic/SOURCES
index 4f2455cc6b3ac..95e600cd17093 100644
--- a/libclc/libspirv/lib/generic/SOURCES
+++ b/libclc/libspirv/lib/generic/SOURCES
@@ -206,3 +206,5 @@ shared/vload.cl
 shared/vstore.cl
 workitem/get_global_id.cl
 workitem/get_global_size.cl
+workitem/get_num_sub_groups.cl
+workitem/get_sub_group_size.cl
diff --git a/libclc/libspirv/lib/generic/gen_core_convert.py b/libclc/libspirv/lib/generic/gen_core_convert.py
index 80c02489ce415..a5f094b7d6211 100755
--- a/libclc/libspirv/lib/generic/gen_core_convert.py
+++ b/libclc/libspirv/lib/generic/gen_core_convert.py
@@ -397,28 +397,66 @@ def generate_float_conversion(src, dst, size, mode, sat):
                 print("  {SRC}{N} abs_x = __spirv_ocl_fabs(x);".format(SRC=src, N=size))
                 print("  {SRC}{N} abs_y = __spirv_ocl_fabs(y);".format(SRC=src, N=size))
             print(
-                "  return {BOOL_CONVERT}(abs_y > abs_x) ? r:  __spirv_ocl_nextafter(r, __spirv_ocl_sign(r) * ({DST}{N})-INFINITY);".format(
+                "  {DST}{N} sel = {BOOL_CONVERT}(abs_y > abs_x) ? r:  __spirv_ocl_nextafter(r, __spirv_ocl_sign(r) * ({DST}{N})-INFINITY);".format(
                     DST=dst,
                     N=size,
                     BOOL_CONVERT=clc_core_fn_name(bool_type[dst], size=size),
                 )
             )
+            if dst == "half" and src in int_types and sizeof_type[src] >= 2:
+                dst_max = limit_max[dst]
+                # short is 16 bits signed, so the maximum value rounded toward
+                # zero is 0x1.ffcp+14 (0x1p+15 == 32768 > 0x7fff == 32767)
+                if src == "short":
+                    dst_max = "0x1.ffcp+14"
+                print(
+                    "  return __clc_clamp(sel, ({DST}{N}){DST_MIN}, ({DST}{N}){DST_MAX});".format(
+                        DST=dst, N=size, DST_MIN=limit_min[dst], DST_MAX=dst_max
+                    )
+                )
+            else:
+                print("  return sel;")
+
         if mode == "_rtp":
             print(
-                "  return {BOOL_CONVERT}(y < x) ? r : __spirv_ocl_nextafter(r, ({DST}{N})INFINITY);".format(
+                "  {DST}{N} sel = {BOOL_CONVERT}(y < x) ? r : __spirv_ocl_nextafter(r, ({DST}{N})INFINITY);".format(
                     DST=dst,
                     N=size,
                     BOOL_CONVERT=clc_core_fn_name(bool_type[dst], size=size),
                 )
             )
+            if dst == "half" and src in int_types and sizeof_type[src] >= 2:
+                print(
+                    "  return __clc_max(sel, ({DST}{N}){DST_MIN});".format(
+                        DST=dst, N=size, DST_MIN=limit_min[dst]
+                    )
+                )
+            else:
+                print("  return sel;")
+
         if mode == "_rtn":
             print(
-                "  return {BOOL_CONVERT}(y > x) ? r : __spirv_ocl_nextafter(r, ({DST}{N})-INFINITY);".format(
+                "  {DST}{N} sel = {BOOL_CONVERT}(y > x) ? r : __spirv_ocl_nextafter(r, ({DST}{N})-INFINITY);".format(
                     DST=dst,
                     N=size,
                     BOOL_CONVERT=clc_core_fn_name(bool_type[dst], size=size),
                 )
             )
+            if dst == "half" and src in int_types and sizeof_type[src] >= 2:
+                dst_max = limit_max[dst]
+                # short is 16 bits signed, so the maximum value rounded to
+                # negative infinity is 0x1.ffcp+14 (0x1p+15 == 32768 > 0x7fff
+                # == 32767)
+                if src == "short":
+                    dst_max = "0x1.ffcp+14"
+                print(
+                    "  return __clc_min(sel, ({DST}{N}){DST_MAX});".format(
+                        DST=dst, N=size, DST_MAX=dst_max
+                    )
+                )
+            else:
+                print("  return sel;")
+
 
     # Footer
     print("}")
diff --git a/libclc/libspirv/lib/amdgcn-amdhsa/workitem/get_num_sub_groups.cl b/libclc/libspirv/lib/generic/workitem/get_num_sub_groups.cl
similarity index 100%
rename from libclc/libspirv/lib/amdgcn-amdhsa/workitem/get_num_sub_groups.cl
rename to libclc/libspirv/lib/generic/workitem/get_num_sub_groups.cl
diff --git a/libclc/libspirv/lib/amdgcn-amdhsa/workitem/get_sub_group_size.cl b/libclc/libspirv/lib/generic/workitem/get_sub_group_size.cl
similarity index 100%
rename from libclc/libspirv/lib/amdgcn-amdhsa/workitem/get_sub_group_size.cl
rename to libclc/libspirv/lib/generic/workitem/get_sub_group_size.cl
diff --git a/libclc/libspirv/lib/ptx-nvidiacl/SOURCES b/libclc/libspirv/lib/ptx-nvidiacl/SOURCES
index e43320ff45092..cf9a5a2c51152 100644
--- a/libclc/libspirv/lib/ptx-nvidiacl/SOURCES
+++ b/libclc/libspirv/lib/ptx-nvidiacl/SOURCES
@@ -3,6 +3,7 @@ atomic/loadstore_helpers_release.ll
 atomic/loadstore_helpers_acquire.ll
 atomic/loadstore_helpers_seq_cst.ll
 cl_khr_int64_extended_atomics/minmax_helpers.ll
+conversion/GenericCastToPtrExplicit.cl
 integer/mul24.cl
 integer/mul_hi.cl
 math/acos.cl
@@ -84,10 +85,8 @@ workitem/get_local_id.cl
 workitem/get_local_size.cl
 workitem/get_max_sub_group_size.cl
 workitem/get_num_groups.cl
-workitem/get_num_sub_groups.cl
 workitem/get_sub_group_id.cl
 workitem/get_sub_group_local_id.cl
-workitem/get_sub_group_size.cl
 images/image_helpers.ll
 images/image.cl
 group/collectives_helpers.ll
diff --git a/libclc/libspirv/lib/ptx-nvidiacl/conversion/GenericCastToPtrExplicit.cl b/libclc/libspirv/lib/ptx-nvidiacl/conversion/GenericCastToPtrExplicit.cl
new file mode 100644
index 0000000000000..eefaa8009f1c3
--- /dev/null
+++ b/libclc/libspirv/lib/ptx-nvidiacl/conversion/GenericCastToPtrExplicit.cl
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <libspirv/spirv.h>
+
+_CLC_DEF static bool __clc_nvvm_is_private(generic void *ptr) {
+  return __nvvm_isspacep_local(ptr); // OpenCL private <-> NVVM "local"
+}
+_CLC_DEF static bool __clc_nvvm_is_local(generic void *ptr) {
+  return __nvvm_isspacep_shared(ptr); // OpenCL local <-> NVVM "shared"
+}
+_CLC_DEF static bool __clc_nvvm_is_global(generic void *ptr) {
+  return __nvvm_isspacep_global(ptr); // same name in both models
+}
+
+#define GenericCastToPtrExplicit_To(ADDRSPACE, NAME)                           \
+  _CLC_DECL _CLC_OVERLOAD                                                      \
+      ADDRSPACE void *__spirv_GenericCastToPtrExplicit_To##NAME(               \
+          generic void *ptr, int unused) {                                     \
+    if (__clc_nvvm_is_##ADDRSPACE(ptr))                                        \
+      return (ADDRSPACE void *)ptr;                                            \
+    return 0;                                                                  \
+  }                                                                            \
+  _CLC_DECL _CLC_OVERLOAD                                                      \
+      ADDRSPACE const void *__spirv_GenericCastToPtrExplicit_To##NAME(         \
+          generic const void *ptr, int unused) {                               \
+    return __spirv_GenericCastToPtrExplicit_To##NAME((generic void *)ptr,      \
+                                                     unused);                  \
+  }                                                                            \
+  _CLC_DECL _CLC_OVERLOAD                                                      \
+      ADDRSPACE volatile void *__spirv_GenericCastToPtrExplicit_To##NAME(      \
+          generic volatile void *ptr, int unused) {                            \
+    return __spirv_GenericCastToPtrExplicit_To##NAME((generic void *)ptr,      \
+                                                     unused);                  \
+  }                                                                            \
+  _CLC_DECL _CLC_OVERLOAD ADDRSPACE const volatile void                        \
+      *__spirv_GenericCastToPtrExplicit_To##NAME(                              \
+          generic const volatile void *ptr, int unused) {                      \
+    return __spirv_GenericCastToPtrExplicit_To##NAME((generic void *)ptr,      \
+                                                     unused);                  \
+  }
+
+GenericCastToPtrExplicit_To(global, Global)
+GenericCastToPtrExplicit_To(local, Local)
+GenericCastToPtrExplicit_To(private, Private)
diff --git a/libclc/libspirv/lib/ptx-nvidiacl/math/modf.cl b/libclc/libspirv/lib/ptx-nvidiacl/math/modf.cl
index bfd55b8b2d5da..0be6859af9e4a 100644
--- a/libclc/libspirv/lib/ptx-nvidiacl/math/modf.cl
+++ b/libclc/libspirv/lib/ptx-nvidiacl/math/modf.cl
@@ -46,6 +46,7 @@ _CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_modf, double,
 #endif
 
 #ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __CLC_MODF(__nv_modff, float, half)
 
 _CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_modf, half,
diff --git a/libclc/libspirv/lib/ptx-nvidiacl/math/sincos.cl b/libclc/libspirv/lib/ptx-nvidiacl/math/sincos.cl
index 272a031b9ae35..05a7df8b751a6 100644
--- a/libclc/libspirv/lib/ptx-nvidiacl/math/sincos.cl
+++ b/libclc/libspirv/lib/ptx-nvidiacl/math/sincos.cl
@@ -47,6 +47,7 @@ _CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_sincos,
 #endif
 
 #ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __CLC_SINCOS(__nv_sincosf, float, half)
 
 _CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_sincos, half,
diff --git a/libclc/libspirv/lib/ptx-nvidiacl/workitem/get_num_sub_groups.cl b/libclc/libspirv/lib/ptx-nvidiacl/workitem/get_num_sub_groups.cl
deleted file mode 100644
index 164b1ea66a921..0000000000000
--- a/libclc/libspirv/lib/ptx-nvidiacl/workitem/get_num_sub_groups.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <libspirv/spirv.h>
-
-_CLC_DEF _CLC_OVERLOAD uint __spirv_NumSubgroups() {
-  // sreg.nwarpid returns number of warp identifiers, not number of warps
-  // see https://docs.nvidia.com/cuda/parallel-thread-execution/index.html
-  size_t size_x = __spirv_WorkgroupSize_x();
-  size_t size_y = __spirv_WorkgroupSize_y();
-  size_t size_z = __spirv_WorkgroupSize_z();
-  uint sg_size = __spirv_SubgroupMaxSize();
-  uint linear_size = size_z * size_y * size_x;
-  return (linear_size + sg_size - 1) / sg_size;
-}
diff --git a/libclc/libspirv/lib/ptx-nvidiacl/workitem/get_sub_group_size.cl b/libclc/libspirv/lib/ptx-nvidiacl/workitem/get_sub_group_size.cl
deleted file mode 100644
index b12145fe6707d..0000000000000
--- a/libclc/libspirv/lib/ptx-nvidiacl/workitem/get_sub_group_size.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <libspirv/spirv.h>
-
-_CLC_DEF _CLC_OVERLOAD uint __spirv_SubgroupSize() {
-  if (__spirv_SubgroupId() != __spirv_NumSubgroups() - 1) {
-    return __spirv_SubgroupMaxSize();
-  }
-  size_t size_x = __spirv_WorkgroupSize_x();
-  size_t size_y = __spirv_WorkgroupSize_y();
-  size_t size_z = __spirv_WorkgroupSize_z();
-  uint linear_size = size_z * size_y * size_x;
-  uint uniform_groups = __spirv_NumSubgroups() - 1;
-  uint uniform_size = __spirv_SubgroupMaxSize() * uniform_groups;
-  return linear_size - uniform_size;
-}
diff --git a/llvm-spirv/lib/SPIRV/SPIRVInternal.h b/llvm-spirv/lib/SPIRV/SPIRVInternal.h
index 9a7bf4f758f84..2698067aa2637 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVInternal.h
+++ b/llvm-spirv/lib/SPIRV/SPIRVInternal.h
@@ -369,6 +369,7 @@ const static char TranslateOCLMemScope[] = "__translate_ocl_memory_scope";
 const static char TranslateSPIRVMemOrder[] = "__translate_spirv_memory_order";
 const static char TranslateSPIRVMemScope[] = "__translate_spirv_memory_scope";
 const static char TranslateSPIRVMemFence[] = "__translate_spirv_memory_fence";
+const static char EntrypointPrefix[] = "__spirv_entry_";
 const static char ConvertHandleToImageINTEL[] = "ConvertHandleToImageINTEL";
 const static char ConvertHandleToSamplerINTEL[] = "ConvertHandleToSamplerINTEL";
 const static char ConvertHandleToSampledImageINTEL[] =
diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
index 62764b0cb1c20..14b646a959389 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
@@ -3317,6 +3317,25 @@ Function *SPIRVToLLVM::transFunction(SPIRVFunction *BF, unsigned AS) {
     return Loc->second;
 
   auto IsKernel = isKernel(BF);
+
+  if (IsKernel) {
+    // search for a previous function with the same name
+    // upgrade it to a kernel and drop this if it's found
+    for (auto &I : FuncMap) {
+      auto BFName = I.getFirst()->getName();
+      if (BF->getName() == BFName) {
+        auto *F = I.getSecond();
+        F->setCallingConv(CallingConv::SPIR_KERNEL);
+        F->setLinkage(GlobalValue::ExternalLinkage);
+        F->setDSOLocal(false);
+        F = cast<Function>(mapValue(BF, F));
+        mapFunction(BF, F);
+        transFunctionAttrs(BF, F);
+        return F;
+      }
+    }
+  }
+
   auto Linkage = IsKernel ? GlobalValue::ExternalLinkage : transLinkageType(BF);
   FunctionType *FT = cast<FunctionType>(transType(BF->getFunctionType()));
   std::string FuncName = BF->getName();
@@ -3360,56 +3379,7 @@ Function *SPIRVToLLVM::transFunction(SPIRVFunction *BF, unsigned AS) {
 
   F->setCallingConv(IsKernel ? CallingConv::SPIR_KERNEL
                              : CallingConv::SPIR_FUNC);
-  if (BF->hasDecorate(DecorationReferencedIndirectlyINTEL))
-    F->addFnAttr("referenced-indirectly");
-  if (isFuncNoUnwind())
-    F->addFnAttr(Attribute::NoUnwind);
-  foreachFuncCtlMask(BF, [&](Attribute::AttrKind Attr) { F->addFnAttr(Attr); });
-
-  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
-       ++I) {
-    auto BA = BF->getArgument(I->getArgNo());
-    mapValue(BA, &(*I));
-    setName(&(*I), BA);
-    AttributeMask IllegalAttrs =
-        AttributeFuncs::typeIncompatible(I->getType(), I->getAttributes());
-    BA->foreachAttr([&](SPIRVFuncParamAttrKind Kind) {
-      // Skip this function parameter attribute as it will translated among
-      // OpenCL metadata
-      if (Kind == FunctionParameterAttributeRuntimeAlignedINTEL)
-        return;
-      Attribute::AttrKind LLVMKind = SPIRSPIRVFuncParamAttrMap::rmap(Kind);
-      if (IllegalAttrs.contains(LLVMKind))
-        return;
-      Type *AttrTy = nullptr;
-      switch (LLVMKind) {
-      case Attribute::AttrKind::ByVal:
-      case Attribute::AttrKind::StructRet:
-        AttrTy = transType(BA->getType()->getPointerElementType());
-        break;
-      default:
-        break; // do nothing
-      }
-      // Make sure to use a correct constructor for a typed/typeless attribute
-      auto A = AttrTy ? Attribute::get(*Context, LLVMKind, AttrTy)
-                      : Attribute::get(*Context, LLVMKind);
-      I->addAttr(A);
-    });
-
-    AttrBuilder Builder(*Context);
-    SPIRVWord MaxOffset = 0;
-    if (BA->hasDecorate(DecorationMaxByteOffset, 0, &MaxOffset))
-      Builder.addDereferenceableAttr(MaxOffset);
-    SPIRVWord AlignmentBytes = 0;
-    if (BA->hasDecorate(DecorationAlignment, 0, &AlignmentBytes))
-      Builder.addAlignmentAttr(AlignmentBytes);
-    I->addAttrs(Builder);
-  }
-  BF->foreachReturnValueAttr([&](SPIRVFuncParamAttrKind Kind) {
-    if (Kind == FunctionParameterAttributeNoWrite)
-      return;
-    F->addRetAttr(SPIRSPIRVFuncParamAttrMap::rmap(Kind));
-  });
+  transFunctionAttrs(BF, F);
 
   // Creating all basic blocks before creating instructions.
   for (size_t I = 0, E = BF->getNumBasicBlock(); I != E; ++I) {
diff --git a/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.cpp b/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.cpp
index 6da82f3e3e567..d204177559d49 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.cpp
@@ -39,6 +39,7 @@
 #include "SPIRVRegularizeLLVM.h"
 #include "OCLUtil.h"
 #include "SPIRVInternal.h"
+#include "SPIRVMDWalker.h"
 #include "libSPIRV/SPIRVDebug.h"
 
 #include "llvm/ADT/StringExtras.h" // llvm::isDigit
@@ -433,65 +434,6 @@ bool SPIRVRegularizeLLVMBase::runRegularizeLLVM(Module &Module) {
   return true;
 }
 
-// This is a temporary workaround to deal with a graphics driver failure not
-// able to support the typed pointer reverse translation of
-// getelementptr i8, ptr @__spirv_Builtin* patterns. This replaces such
-// accesses with getelementptr i32, ptr @__spirv_Builtin instead.
-static void simplifyBuiltinVarAccesses(GlobalValue *GV) {
-  // IGC only supports:
-  // load GV
-  // load (addrspacecast GV)
-  // load (gep (addrspacecast GV))
-  // load (gep GV)
-  // Opaque pointers will cause the optimizer to use i8 geps, or to remove
-  // 0-index geps entirely (adding bitcasts to the result). Restore these to
-  // avoid bitcasts in the resulting IR.
-  Type *Ty = GV->getValueType();
-  Type *ScalarTy = Ty->getScalarType();
-  SmallVector<Value *, 4> Users;
-  for (auto User : GV->users()) {
-    if (auto *LI = dyn_cast<LoadInst>(User)) {
-      if (LI->getType() != Ty)
-        Users.push_back(LI);
-    } else if (auto *GEP = dyn_cast<GEPOperator>(User)) {
-      if (GEP->getSourceElementType() != Ty)
-        Users.push_back(GEP);
-    }
-  }
-
-  Type *Int32Ty = Type::getInt32Ty(GV->getContext());
-  auto GetGep = [&](unsigned Offset,
-                    std::optional<ConstantRange> InRange = std::nullopt) {
-    llvm::ConstantRange GepInRange(llvm::APInt(32, -((signed)Offset), true),
-                                   llvm::APInt(32, Offset, true));
-    if (InRange)
-      GepInRange = *InRange;
-    return ConstantExpr::getGetElementPtr(
-        Ty, GV,
-        ArrayRef<Constant *>(
-            {ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, Offset)}),
-        true, GepInRange);
-  };
-
-  const DataLayout &DL = GV->getParent()->getDataLayout();
-  for (auto *User : Users) {
-    if (auto *LI = dyn_cast<LoadInst>(User)) {
-      LI->setOperand(0, GetGep(0));
-    } else if (auto *GEP = dyn_cast<GEPOperator>(User)) {
-      APInt Offset(64, 0);
-      GEP->accumulateConstantOffset(DL, Offset);
-      APInt Index;
-      uint64_t Remainder;
-      APInt::udivrem(Offset, ScalarTy->getScalarSizeInBits() / 8, Index,
-                     Remainder);
-      assert(Remainder == 0 && "Cannot handle misaligned access to builtins");
-      GEP->replaceAllUsesWith(GetGep(Index.getZExtValue(), GEP->getInRange()));
-      if (auto *Inst = dyn_cast<Instruction>(GEP))
-        Inst->eraseFromParent();
-    }
-  }
-}
-
 namespace {
 void regularizeWithOverflowInstrinsics(StringRef MangledName, CallInst *Call,
                                        Module *M,
@@ -637,15 +579,10 @@ void prepareCacheControlsTranslation(Metadata *MD, Instruction *Inst) {
 /// Remove entities not representable by SPIR-V
 bool SPIRVRegularizeLLVMBase::regularize() {
   eraseUselessFunctions(M);
+  addKernelEntryPoint(M);
   expandSYCLTypeUsing(M);
   cleanupConversionToNonStdIntegers(M);
 
-  for (auto &GV : M->globals()) {
-    SPIRVBuiltinVariableKind Kind;
-    if (isSPIRVBuiltinVariable(&GV, &Kind))
-      simplifyBuiltinVarAccesses(&GV);
-  }
-
   // Kernels called by other kernels
   std::vector<Function *> CalledKernels;
   for (auto I = M->begin(), E = M->end(); I != E;) {
@@ -834,6 +771,69 @@ bool SPIRVRegularizeLLVMBase::regularize() {
   return true;
 }
 
+void SPIRVRegularizeLLVMBase::addKernelEntryPoint(Module *M) {
+  std::vector<Function *> Work;
+
+  // Collect all SPIR_KERNEL functions up front; wrappers are added to M below.
+  for (auto &F : *M) {
+    if (F.getCallingConv() == CallingConv::SPIR_KERNEL)
+      Work.push_back(&F);
+  }
+  for (auto &F : Work) {
+    // Every former kernel, declaration or not, becomes a plain SPIR function.
+    F->setCallingConv(CallingConv::SPIR_FUNC);
+    if (F->isDeclaration())
+      continue;
+
+    // Definitions additionally get a wrapper that acts as the entry point.
+    FunctionType *FType = F->getFunctionType();
+    std::string WrapName =
+        kSPIRVName::EntrypointPrefix + static_cast<std::string>(F->getName());
+    Function *WrapFn =
+        getOrCreateFunction(M, F->getReturnType(), FType->params(), WrapName);
+
+    auto *CallBB = BasicBlock::Create(M->getContext(), "", WrapFn);
+    IRBuilder<> Builder(CallBB);
+
+    Function::arg_iterator DestI = WrapFn->arg_begin();
+    for (const Argument &I : F->args()) {
+      DestI->setName(I.getName());
+      DestI++;
+    }
+    SmallVector<Value *, 1> Args;
+    for (Argument &I : WrapFn->args()) {
+      Args.emplace_back(&I);
+    }
+    auto *CI = CallInst::Create(F, ArrayRef<Value *>(Args), "", CallBB);
+    CI->setCallingConv(F->getCallingConv()); // SPIR_FUNC, set above
+    CI->setAttributes(F->getAttributes());
+
+    // Mirror F's metadata on the wrapper (open question: drop it from F?).
+    SmallVector<std::pair<unsigned, MDNode *>> MDs;
+    F->getAllMetadata(MDs); // NOTE(review): presumably includes !dbg - confirm
+    WrapFn->setAttributes(F->getAttributes());
+    for (auto MD = MDs.begin(), End = MDs.end(); MD != End; ++MD) {
+      WrapFn->addMetadata(MD->first, *MD->second);
+    }
+    WrapFn->setCallingConv(CallingConv::SPIR_KERNEL);
+    WrapFn->setLinkage(llvm::GlobalValue::InternalLinkage);
+
+    Builder.CreateRet(F->getReturnType()->isVoidTy() ? nullptr : CI);
+
+    // Repoint execution-mode metadata entries that reference F so that they
+    // name the wrapper instead.
+    if (auto NMD = SPIRVMDWalker(*M).getNamedMD(kSPIRVMD::ExecutionMode)) {
+      while (!NMD.atEnd()) {
+        Function *MDF = nullptr;
+        auto N = NMD.nextOp(); /* execution mode MDNode */
+        N.get(MDF);
+        if (MDF == F)
+          N.M->replaceOperandWith(0, ValueAsMetadata::get(WrapFn));
+      }
+    }
+  }
+}
+
 } // namespace SPIRV
 
 INITIALIZE_PASS(SPIRVRegularizeLLVMLegacy, "spvregular",
diff --git a/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.h b/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.h
index 823bd612423b8..c598708516182 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.h
+++ b/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.h
@@ -51,6 +51,11 @@ class SPIRVRegularizeLLVMBase {
   // Lower functions
   bool regularize();
 
+  // SPIR-V forbids a function from being both an entry point and a call
+  // target; LLVM IR does not. Adds a wrapper around each kernel definition
+  // to act as the entry point; the SPIR-V writer later renames the wrapper.
+  void addKernelEntryPoint(Module *M);
+
   /// Some LLVM intrinsics that have no SPIR-V counterpart may be wrapped in
   /// @spirv.llvm_intrinsic_* function. During reverse translation from SPIR-V
   /// to LLVM IR we can detect this @spirv.llvm_intrinsic_* function and
diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
index ea69fff7e8f06..91e7164b28a2f 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
@@ -906,13 +906,19 @@ SPIRVFunction *LLVMToSPIRVBase::transFunctionDecl(Function *F) {
       static_cast<SPIRVFunction *>(mapValue(F, BM->addFunction(BFT)));
   BF->setFunctionControlMask(transFunctionControlMask(F));
   if (F->hasName()) {
-    if (isUniformGroupOperation(F))
-      BM->getErrorLog().checkError(
-          BM->isAllowedToUseExtension(
-              ExtensionID::SPV_KHR_uniform_group_instructions),
-          SPIRVEC_RequiresExtension, "SPV_KHR_uniform_group_instructions\n");
-
-    BM->setName(BF, F->getName().str());
+    if (isKernel(F)) {
+      // Strip the wrapper prefix, if present; the runtime uses the plain name.
+      StringRef Name = F->getName();
+      Name.consume_front(kSPIRVName::EntrypointPrefix);
+      BM->setName(BF, Name.str());
+    } else {
+      if (isUniformGroupOperation(F))
+        BM->getErrorLog().checkError(
+            BM->isAllowedToUseExtension(
+                ExtensionID::SPV_KHR_uniform_group_instructions),
+            SPIRVEC_RequiresExtension, "SPV_KHR_uniform_group_instructions\n");
+      BM->setName(BF, F->getName().str());
+    }
   }
   if (!isKernel(F) && F->getLinkage() != GlobalValue::InternalLinkage)
     BF->setLinkageType(transLinkageType(F));
@@ -5911,7 +5917,7 @@ void LLVMToSPIRVBase::transFunction(Function *I) {
 
   if (isKernel(I)) {
     auto Interface = collectEntryPointInterfaces(BF, I);
-    BM->addEntryPoint(ExecutionModelKernel, BF->getId(), I->getName().str(),
+    BM->addEntryPoint(ExecutionModelKernel, BF->getId(), BF->getName(),
                       Interface);
   }
 }
@@ -6278,8 +6284,9 @@ bool LLVMToSPIRVBase::transMetadata() {
 // Work around to translate kernel_arg_type and kernel_arg_type_qual metadata
 static void transKernelArgTypeMD(SPIRVModule *BM, Function *F, MDNode *MD,
                                  std::string MDName) {
-  std::string KernelArgTypesMDStr =
-      std::string(MDName) + "." + F->getName().str() + ".";
+  StringRef Name = F->getName();
+  Name.consume_front(kSPIRVName::EntrypointPrefix); // no-op without prefix
+  std::string KernelArgTypesMDStr = MDName + "." + Name.str() + ".";
   for (const auto &TyOp : MD->operands())
     KernelArgTypesMDStr += cast<MDString>(TyOp)->getString().str() + ",";
   BM->getString(KernelArgTypesMDStr);
diff --git a/llvm-spirv/test/entry_point_func.ll b/llvm-spirv/test/entry_point_func.ll
new file mode 100644
index 0000000000000..4c8feebbaee30
--- /dev/null
+++ b/llvm-spirv/test/entry_point_func.ll
@@ -0,0 +1,68 @@
+;; Test to check that an LLVM spir_kernel gets translated into an
+;; Entrypoint wrapper and Function with LinkageAttributes
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -o - -spirv-text | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: spirv-val %t.spv
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @testfunction() {
+   ret void
+}
+
+define spir_kernel void @callerfunction() {
+   call spir_kernel void @testfunction()
+   call spir_kernel void @testdeclaration()
+   ret void
+}
+
+declare spir_kernel void @testdeclaration()
+
+; Check there is an entrypoint and a function produced.
+; CHECK-SPIRV: EntryPoint 6 [[#TestEn:]] "testfunction"
+; CHECK-SPIRV: EntryPoint 6 [[#CallerEn:]] "callerfunction"
+; CHECK-SPIRV: Name [[#TestDecl:]] "testdeclaration"
+; CHECK-SPIRV: Name [[#TestFn:]] "testfunction"
+; CHECK-SPIRV: Name [[#CallerFn:]] "callerfunction"
+; CHECK-SPIRV: Decorate [[#TestDecl]] LinkageAttributes "testdeclaration" Import
+; CHECK-SPIRV: Decorate [[#TestFn]] LinkageAttributes "testfunction" Export
+; CHECK-SPIRV: Decorate [[#CallerFn]] LinkageAttributes "callerfunction" Export
+
+; CHECK-SPIRV: Function [[#]] [[#TestDecl]] [[#]] [[#]]
+; CHECK-SPIRV-EMPTY:
+; CHECK-SPIRV-NEXT: FunctionEnd
+
+; CHECK-SPIRV: Function [[#]] [[#TestFn]] [[#]] [[#]]
+; CHECK-SPIRV-EMPTY:
+; CHECK-SPIRV-NEXT: Label
+; CHECK-SPIRV-NEXT: Return
+; CHECK-SPIRV-EMPTY:
+; CHECK-SPIRV-NEXT: FunctionEnd
+
+; CHECK-SPIRV: Function [[#]] [[#CallerFn]] [[#]] [[#]]
+; CHECK-SPIRV-EMPTY:
+; CHECK-SPIRV-NEXT: Label
+; CHECK-SPIRV-NEXT: FunctionCall [[#]] [[#]] [[#TestFn]]
+; CHECK-SPIRV-NEXT: FunctionCall [[#]] [[#]] [[#TestDecl]]
+; CHECK-SPIRV-NEXT: Return
+; CHECK-SPIRV-EMPTY:
+; CHECK-SPIRV-NEXT: FunctionEnd
+
+
+; CHECK-SPIRV: Function [[#]] [[#TestEn]] [[#]] [[#]]
+; CHECK-SPIRV-EMPTY:
+; CHECK-SPIRV-NEXT: Label
+; CHECK-SPIRV-NEXT: FunctionCall [[#]] [[#]] [[#TestFn]]
+; CHECK-SPIRV-NEXT: Return
+; CHECK-SPIRV-EMPTY:
+; CHECK-SPIRV-NEXT: FunctionEnd
+
+; CHECK-SPIRV: Function [[#]] [[#CallerEn]] [[#]] [[#]]
+; CHECK-SPIRV-EMPTY:
+; CHECK-SPIRV-NEXT: Label
+; CHECK-SPIRV-NEXT: FunctionCall [[#]] [[#]] [[#CallerFn]]
+; CHECK-SPIRV-NEXT: Return
+; CHECK-SPIRV-EMPTY:
+; CHECK-SPIRV-NEXT: FunctionEnd
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_cache_controls/multiple-decoration-single-arg.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_cache_controls/multiple-decoration-single-arg.ll
index fcc7c718fa5db..fe1aef8f72958 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_cache_controls/multiple-decoration-single-arg.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_cache_controls/multiple-decoration-single-arg.ll
@@ -3,8 +3,8 @@
 ; RUN: llvm-spirv --spirv-ext=+SPV_INTEL_cache_controls %t.bc -o %t.spv
 ; RUN: llvm-spirv -r %t.spv --spirv-target-env=SPV-IR -o - | llvm-dis -o - | FileCheck %s --check-prefix=CHECK-LLVM
 
-; CHECK-SPIRV-DAG: EntryPoint [[#]] [[#Func:]] "test"
-; CHECK-SPIRV-DAG: EntryPoint [[#]] [[#FuncGEP:]] "test_gep"
+; CHECK-SPIRV-DAG: Name [[#Func:]] "test"
+; CHECK-SPIRV-DAG: Name [[#FuncGEP:]] "test_gep"
 ; CHECK-SPIRV-DAG: TypeInt [[#Int32:]] 32 0
 ; CHECK-SPIRV-DAG: Constant [[#Int32]] [[#Zero:]] 0
 ; CHECK-SPIRV-DAG: Decorate [[#GEP1:]] CacheControlLoadINTEL 1 1
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_fpga_argument_interfaces/sycl-kernel-arg-annotation.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_fpga_argument_interfaces/sycl-kernel-arg-annotation.ll
index 4001bd8ebff70..992862a3b3aa7 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_fpga_argument_interfaces/sycl-kernel-arg-annotation.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_fpga_argument_interfaces/sycl-kernel-arg-annotation.ll
@@ -53,6 +53,7 @@ entry:
 ; CHECK-SPIRV: Capability FPGAArgumentInterfacesINTEL
 ; CHECK-SPIRV: Extension "SPV_INTEL_fpga_argument_interfaces"
 ; CHECK-SPIRV: Extension "SPV_INTEL_fpga_buffer_location"
+; CHECK-SPIRV-DAG:  Name [[IDS:[0-9]+]] "_arg_p"
 ; CHECK-SPIRV-DAG:  Name [[ID:[0-9]+]] "_arg_p"
 ; CHECK-SPIRV:  Decorate [[ID]] Alignment 4
 ; CHECK-SPIRV:  Decorate [[ID]] MMHostInterfaceAddressWidthINTEL 32
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/alias.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/alias.ll
index 3dffc5d1f06ac..108b04ef58345 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/alias.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/alias.ll
@@ -10,11 +10,11 @@ target triple = "spir64-unknown-unknown"
 ; when used since they can't be translated directly.
 
 ; CHECK-SPIRV-DAG: Name [[#FOO:]] "foo"
-; CHECK-SPIRV-DAG: EntryPoint [[#]] [[#BAR:]] "bar"
+; CHECK-SPIRV-DAG: Name [[#BAR:]] "bar"
 ; CHECK-SPIRV-DAG: Name [[#Y:]] "y"
 ; CHECK-SPIRV-DAG: Name [[#FOOPTR:]] "foo.alias"
 ; CHECK-SPIRV-DAG: Decorate [[#FOO]] LinkageAttributes "foo" Export
-; INTEL-CHECK-SPIRV-DAG: Decorate [[#BAR]] LinkageAttributes "bar" Export
+; CHECK-SPIRV-DAG: Decorate [[#BAR]] LinkageAttributes "bar" Export
 ; CHECK-SPIRV-DAG: TypeInt [[#I32:]] 32 0
 ; CHECK-SPIRV-DAG: TypeInt [[#I64:]] 64 0
 ; CHECK-SPIRV-DAG: TypeFunction [[#FOO_TYPE:]] [[#I32]] [[#I32]]
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/fp-from-host.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/fp-from-host.ll
index 3a9a177d9b28b..aacdcc4fbc48c 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/fp-from-host.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/fp-from-host.ll
@@ -17,7 +17,7 @@
 ; CHECK-SPIRV: Capability FunctionPointersINTEL
 ; CHECK-SPIRV: Extension "SPV_INTEL_function_pointers"
 ;
-; CHECK-SPIRV: EntryPoint [[#]] [[KERNEL_ID:[0-9]+]] "test"
+; CHECK-SPIRV: Name [[KERNEL_ID:[0-9]+]] "test"
 ; CHECK-SPIRV: TypeInt [[INT32_TYPE_ID:[0-9]+]] 32
 ; CHECK-SPIRV: TypePointer [[INT_PTR:[0-9]+]] 5 [[INT32_TYPE_ID]]
 ; CHECK-SPIRV: TypeFunction [[FOO_TYPE_ID:[0-9]+]] [[INT32_TYPE_ID]] [[INT32_TYPE_ID]]
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/function-pointer-as-function-arg.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/function-pointer-as-function-arg.ll
index cd9d717273f32..a933712f4d7ef 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/function-pointer-as-function-arg.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/function-pointer-as-function-arg.ll
@@ -33,7 +33,7 @@
 ; CHECK-SPIRV: Capability FunctionPointersINTEL
 ; CHECK-SPIRV: Extension "SPV_INTEL_function_pointers"
 ;
-; CHECK-SPIRV: EntryPoint [[#]] [[KERNEL_ID:[0-9]+]] "test"
+; CHECK-SPIRV: Name [[KERNEL_ID:[0-9]+]] "test"
 ; CHECK-SPIRV: TypeInt [[TYPE_INT32_ID:[0-9]+]] 32
 ; CHECK-SPIRV: TypeFunction [[FOO_TYPE_ID:[0-9]+]] [[TYPE_INT32_ID]] [[TYPE_INT32_ID]]
 ; CHECK-SPIRV: TypePointer [[FOO_PTR_TYPE_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/function-pointer.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/function-pointer.ll
index f4d63660f2921..bd2ceb32d4614 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/function-pointer.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/function-pointer.ll
@@ -19,7 +19,7 @@
 ;
 ; CHECK-SPIRV: Capability FunctionPointersINTEL
 ; CHECK-SPIRV: Extension "SPV_INTEL_function_pointers"
-; CHECK-SPIRV: EntryPoint [[#]] [[KERNEL_ID:[0-9]+]] "test"
+; CHECK-SPIRV: Name [[KERNEL_ID:[0-9]+]] "test"
 ; CHECK-SPIRV: TypeInt [[TYPE_INT_ID:[0-9]+]]
 ; CHECK-SPIRV: TypeFunction [[FOO_TYPE_ID:[0-9]+]] [[TYPE_INT_ID]] [[TYPE_INT_ID]]
 ; CHECK-SPIRV: TypePointer [[FOO_PTR_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/non-uniform-function-pointer.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/non-uniform-function-pointer.ll
index 526f21279589c..f4e46456f964a 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/non-uniform-function-pointer.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/non-uniform-function-pointer.ll
@@ -29,7 +29,7 @@
 ; CHECK-SPIRV: Capability FunctionPointersINTEL
 ; CHECK-SPIRV: Extension "SPV_INTEL_function_pointers"
 ;
-; CHECK-SPIRV: EntryPoint [[#]] [[KERNEL_ID:[0-9]+]] "test"
+; CHECK-SPIRV: Name [[KERNEL_ID:[0-9]+]] "test"
 ; CHECK-SPIRV: TypeInt [[TYPE_INT32_ID:[0-9+]]] 32
 ; CHECK-SPIRV: TypeFunction [[FOO_TYPE_ID:[0-9]+]] [[TYPE_INT32_ID]] [[TYPE_INT32_ID]]
 ; CHECK-SPIRV: TypePointer [[FOO_PTR_TYPE_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/select.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/select.ll
index 9e46deeaf754e..67eebd988ec77 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/select.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/CodeSectionINTEL/select.ll
@@ -6,7 +6,7 @@
 ; RUN: llvm-dis %t.r.bc -o %t.r.ll
 ; RUN: FileCheck < %t.r.ll %s --check-prefix=CHECK-LLVM
 
-; CHECK-SPIRV: EntryPoint [[#]] [[#KERNEL_ID:]] "_ZTS6kernel"
+; CHECK-SPIRV: Name [[#KERNEL_ID:]] "_ZTS6kernel"
 ; CHECK-SPIRV-DAG: Name [[#BAR:]] "_Z3barii"
 ; CHECK-SPIRV-DAG: Name [[#BAZ:]] "_Z3bazii"
 ; CHECK-SPIRV: TypeInt [[#INT32:]] 32
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/alias.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/alias.ll
index e2e2f90aed11a..49344084a8f7a 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/alias.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/alias.ll
@@ -1,5 +1,3 @@
-; XFAIL: *
-
 ; RUN: llvm-as %s -o %t.bc
 ; RUN: llvm-spirv -spirv-ext=+SPV_INTEL_function_pointers -spirv-text %t.bc -o - | FileCheck %s --check-prefix=CHECK-SPIRV
 ; RUN: llvm-spirv -spirv-ext=+SPV_INTEL_function_pointers %t.bc -o %t.spv
@@ -12,7 +10,7 @@ target triple = "spir64-unknown-unknown"
 ; when used since they can't be translated directly.
 
 ; CHECK-SPIRV-DAG: Name [[#FOO:]] "foo"
-; CHECK-SPIRV-DAG: EntryPoint [[#]] [[#BAR:]] "bar"
+; CHECK-SPIRV-DAG: Name [[#BAR:]] "bar"
 ; CHECK-SPIRV-DAG: Name [[#Y:]] "y"
 ; CHECK-SPIRV-DAG: Name [[#FOOPTR:]] "foo.alias"
 ; CHECK-SPIRV-DAG: Decorate [[#FOO]] LinkageAttributes "foo" Export
@@ -34,7 +32,7 @@ target triple = "spir64-unknown-unknown"
 
 ; CHECK-LLVM: define spir_func i32 @foo(i32 %x)
 
-; CHECK-LLVM: define spir_func void @bar(ptr %y)
+; CHECK-LLVM: define spir_kernel void @bar(ptr %y)
 ; CHECK-LLVM: [[PTRTOINT:%.*]] = ptrtoint ptr @foo to i64
 ; CHECK-LLVM: store i64 [[PTRTOINT]], ptr %y, align 8
 
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/fp-from-host.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/fp-from-host.ll
index 805be68f89dad..bc1943f02bb41 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/fp-from-host.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/fp-from-host.ll
@@ -17,7 +17,7 @@
 ; CHECK-SPIRV: Capability FunctionPointersINTEL
 ; CHECK-SPIRV: Extension "SPV_INTEL_function_pointers"
 ;
-; CHECK-SPIRV: EntryPoint [[#]] [[KERNEL_ID:[0-9]+]] "test"
+; CHECK-SPIRV: Name [[KERNEL_ID:[0-9]+]] "test"
 ; CHECK-SPIRV: TypeInt [[INT32_TYPE_ID:[0-9]+]] 32
 ; CHECK-SPIRV: TypePointer [[INT_PTR:[0-9]+]] 5 [[INT32_TYPE_ID]]
 ; CHECK-SPIRV: TypeFunction [[FOO_TYPE_ID:[0-9]+]] [[INT32_TYPE_ID]] [[INT32_TYPE_ID]]
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/function-pointer-as-function-arg.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/function-pointer-as-function-arg.ll
index 1aba54f8a78b3..d127083425edd 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/function-pointer-as-function-arg.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/function-pointer-as-function-arg.ll
@@ -33,7 +33,7 @@
 ; CHECK-SPIRV: Capability FunctionPointersINTEL
 ; CHECK-SPIRV: Extension "SPV_INTEL_function_pointers"
 ;
-; CHECK-SPIRV: EntryPoint [[#]] [[KERNEL_ID:[0-9]+]] "test"
+; CHECK-SPIRV: Name [[KERNEL_ID:[0-9]+]] "test"
 ; CHECK-SPIRV: TypeInt [[TYPE_INT32_ID:[0-9]+]] 32
 ; CHECK-SPIRV: TypeFunction [[FOO_TYPE_ID:[0-9]+]] [[TYPE_INT32_ID]] [[TYPE_INT32_ID]]
 ; CHECK-SPIRV: TypePointer [[FOO_PTR_TYPE_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/function-pointer.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/function-pointer.ll
index e116745ae5d56..5ba4e8f74d0bd 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/function-pointer.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/function-pointer.ll
@@ -19,7 +19,7 @@
 ;
 ; CHECK-SPIRV: Capability FunctionPointersINTEL
 ; CHECK-SPIRV: Extension "SPV_INTEL_function_pointers"
-; CHECK-SPIRV: EntryPoint [[#]] [[KERNEL_ID:[0-9]+]] "test"
+; CHECK-SPIRV: Name [[KERNEL_ID:[0-9]+]] "test"
 ; CHECK-SPIRV: TypeInt [[TYPE_INT_ID:[0-9]+]]
 ; CHECK-SPIRV: TypeFunction [[FOO_TYPE_ID:[0-9]+]] [[TYPE_INT_ID]] [[TYPE_INT_ID]]
 ; CHECK-SPIRV: TypePointer [[FOO_PTR_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/non-uniform-function-pointer.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/non-uniform-function-pointer.ll
index 1670f825f304a..4d744067c2a07 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/non-uniform-function-pointer.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/non-uniform-function-pointer.ll
@@ -29,7 +29,7 @@
 ; CHECK-SPIRV: Capability FunctionPointersINTEL
 ; CHECK-SPIRV: Extension "SPV_INTEL_function_pointers"
 ;
-; CHECK-SPIRV: EntryPoint [[#]] [[KERNEL_ID:[0-9]+]] "test"
+; CHECK-SPIRV: Name [[KERNEL_ID:[0-9]+]] "test"
 ; CHECK-SPIRV: TypeInt [[TYPE_INT32_ID:[0-9+]]] 32
 ; CHECK-SPIRV: TypeFunction [[FOO_TYPE_ID:[0-9]+]] [[TYPE_INT32_ID]] [[TYPE_INT32_ID]]
 ; CHECK-SPIRV: TypePointer [[FOO_PTR_TYPE_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/select.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/select.ll
index 3c4c9de5bb0a9..9ee7c0283f12f 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/select.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_function_pointers/select.ll
@@ -6,7 +6,7 @@
 ; RUN: llvm-dis %t.r.bc -o %t.r.ll
 ; RUN: FileCheck < %t.r.ll %s --check-prefix=CHECK-LLVM
 
-; CHECK-SPIRV-DAG: EntryPoint [[#]] [[#KERNEL_ID:]] "_ZTS6kernel"
+; CHECK-SPIRV-DAG: Name [[#KERNEL_ID:]] "_ZTS6kernel"
 ; CHECK-SPIRV-DAG: Name [[#BAR:]] "_Z3barii"
 ; CHECK-SPIRV-DAG: Name [[#BAZ:]] "_Z3bazii"
 ; CHECK-SPIRV: TypeInt [[#INT32:]] 32
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_unstructured_loop_controls/FPGAUnstructuredLoopAttr.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_unstructured_loop_controls/FPGAUnstructuredLoopAttr.ll
index 0d292c7b48a7e..df154730de407 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_unstructured_loop_controls/FPGAUnstructuredLoopAttr.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_unstructured_loop_controls/FPGAUnstructuredLoopAttr.ll
@@ -9,10 +9,10 @@
 ; CHECK-SPIRV: Capability FPGALoopControlsINTEL
 ; CHECK-SPIRV: Extension "SPV_INTEL_fpga_loop_controls"
 ; CHECK-SPIRV: Extension "SPV_INTEL_unstructured_loop_controls"
-; CHECK-SPIRV: EntryPoint [[#]] [[FOO:[0-9]+]] "foo"
-; CHECK-SPIRV: EntryPoint [[#]] [[BOO:[0-9]+]] "boo"
+; CHECK-SPIRV: Name [[FOO:[0-9]+]] "foo"
 ; CHECK-SPIRV: Name [[ENTRY_1:[0-9]+]] "entry"
 ; CHECK-SPIRV: Name [[FOR:[0-9]+]] "for.cond"
+; CHECK-SPIRV: Name [[BOO:[0-9]+]] "boo"
 ; CHECK-SPIRV: Name [[ENTRY_2:[0-9]+]] "entry"
 ; CHECK-SPIRV: Name [[WHILE:[0-9]+]] "while.body"
 
diff --git a/llvm-spirv/test/mem2reg.cl b/llvm-spirv/test/mem2reg.cl
index e5d4ad2507daa..ef1dddbf3ed21 100644
--- a/llvm-spirv/test/mem2reg.cl
+++ b/llvm-spirv/test/mem2reg.cl
@@ -1,10 +1,11 @@
 // RUN: %clang_cc1 -O0 -triple spir-unknown-unknown -cl-std=CL2.0 -x cl -disable-O0-optnone %s -emit-llvm-bc -o %t.bc
 // RUN: llvm-spirv -s %t.bc
-// RUN: llvm-dis < %t.bc | FileCheck %s --check-prefixes=CHECK,CHECK-WO
+// RUN: llvm-dis < %t.bc | FileCheck %s --check-prefixes=CHECK-WO
 // RUN: llvm-spirv -s -spirv-mem2reg %t.bc -o %t.opt.bc
-// RUN: llvm-dis < %t.opt.bc | FileCheck %s --check-prefixes=CHECK,CHECK-W
-// CHECK-LABEL: spir_kernel void @foo
+// RUN: llvm-dis < %t.opt.bc | FileCheck %s --check-prefixes=CHECK-W
+// CHECK-W-LABEL: spir_func void @foo
 // CHECK-W-NOT: alloca
+// CHECK-WO-LABEL: spir_kernel void @foo
 // CHECK-WO: alloca
 __kernel void foo(__global int *a) {
     *a = *a + 1;
diff --git a/llvm-spirv/test/transcoding/OpenCL/atomic_cmpxchg.cl b/llvm-spirv/test/transcoding/OpenCL/atomic_cmpxchg.cl
index 18dd57ae39694..aeffc836748c2 100644
--- a/llvm-spirv/test/transcoding/OpenCL/atomic_cmpxchg.cl
+++ b/llvm-spirv/test/transcoding/OpenCL/atomic_cmpxchg.cl
@@ -22,7 +22,7 @@ __kernel void test_atomic_cmpxchg(__global int *p, int cmp, int val) {
   atomic_cmpxchg(up, ucmp, uval);
 }
 
-// CHECK-SPIRV: EntryPoint [[#]] [[TEST:[0-9]+]] "test_atomic_cmpxchg"
+// CHECK-SPIRV: Name [[TEST:[0-9]+]] "test_atomic_cmpxchg"
 // CHECK-SPIRV-DAG: TypeInt [[UINT:[0-9]+]] 32 0
 // CHECK-SPIRV-TYPED-PTRS-DAG: TypePointer [[UINT_PTR:[0-9]+]] 5 [[UINT]]
 // CHECK-SPIRV-UNTYPED-PTRS-DAG: TypeUntypedPointerKHR [[UINT_PTR:[0-9]+]] 5
diff --git a/llvm-spirv/test/transcoding/OpenCL/atomic_legacy.cl b/llvm-spirv/test/transcoding/OpenCL/atomic_legacy.cl
index 93e4b5db12997..31a2cc3ef387c 100644
--- a/llvm-spirv/test/transcoding/OpenCL/atomic_legacy.cl
+++ b/llvm-spirv/test/transcoding/OpenCL/atomic_legacy.cl
@@ -18,7 +18,7 @@ __kernel void test_legacy_atomics(__global int *p, int val) {
   atomic_add(p, val);   // from OpenCL C 1.1
 }
 
-// CHECK-SPIRV: EntryPoint [[#]] [[TEST:[0-9]+]] "test_legacy_atomics"
+// CHECK-SPIRV: Name [[TEST:[0-9]+]] "test_legacy_atomics"
 // CHECK-SPIRV-DAG: TypeInt [[UINT:[0-9]+]] 32 0
 // CHECK-SPIRV-TYPED-PTRS-DAG: TypePointer [[UINT_PTR:[0-9]+]] 5 [[UINT]]
 // CHECK-SPIRV-UNTYPED-PTRS-DAG: TypeUntypedPointerKHR [[UINT_PTR:[0-9]+]] 5
diff --git a/llvm-spirv/test/transcoding/OpenCL/atomic_work_item_fence.cl b/llvm-spirv/test/transcoding/OpenCL/atomic_work_item_fence.cl
index fd1031c574315..6cc9c14339bdb 100644
--- a/llvm-spirv/test/transcoding/OpenCL/atomic_work_item_fence.cl
+++ b/llvm-spirv/test/transcoding/OpenCL/atomic_work_item_fence.cl
@@ -23,7 +23,7 @@ __kernel void test_mem_fence_non_const_flags(cl_mem_fence_flags flags, memory_or
   // atomic_work_item_fence(flags, order, scope);
 }
 
-// CHECK-SPIRV: EntryPoint [[#]] [[TEST_CONST_FLAGS:[0-9]+]] "test_mem_fence_const_flags"
+// CHECK-SPIRV: Name [[TEST_CONST_FLAGS:[0-9]+]] "test_mem_fence_const_flags"
 // CHECK-SPIRV: TypeInt [[UINT:[0-9]+]] 32 0
 //
 // 0x0 Relaxed + 0x100 WorkgroupMemory
diff --git a/llvm-spirv/test/transcoding/OpenCL/barrier.cl b/llvm-spirv/test/transcoding/OpenCL/barrier.cl
index 3cb0040aa5151..c2ee95340c137 100644
--- a/llvm-spirv/test/transcoding/OpenCL/barrier.cl
+++ b/llvm-spirv/test/transcoding/OpenCL/barrier.cl
@@ -28,7 +28,7 @@ __kernel void test_barrier_non_const_flags(cl_mem_fence_flags flags) {
   // barrier(flags);
 }
 
-// CHECK-SPIRV: EntryPoint [[#]] [[TEST_CONST_FLAGS:[0-9]+]] "test_barrier_const_flags"
+// CHECK-SPIRV: Name [[TEST_CONST_FLAGS:[0-9]+]] "test_barrier_const_flags"
 // CHECK-SPIRV: TypeInt [[UINT:[0-9]+]] 32 0
 //
 // In SPIR-V, barrier is represented as OpControlBarrier [3] and OpenCL
diff --git a/llvm-spirv/test/transcoding/OpenCL/mem_fence.cl b/llvm-spirv/test/transcoding/OpenCL/mem_fence.cl
index 77945c08e00fa..4c12695904449 100644
--- a/llvm-spirv/test/transcoding/OpenCL/mem_fence.cl
+++ b/llvm-spirv/test/transcoding/OpenCL/mem_fence.cl
@@ -34,7 +34,7 @@ __kernel void test_mem_fence_non_const_flags(cl_mem_fence_flags flags) {
   // mem_fence(flags);
 }
 
-// CHECK-SPIRV: EntryPoint [[#]] [[TEST_CONST_FLAGS:[0-9]+]] "test_mem_fence_const_flags"
+// CHECK-SPIRV: Name [[TEST_CONST_FLAGS:[0-9]+]] "test_mem_fence_const_flags"
 // CHECK-SPIRV: TypeInt [[UINT:[0-9]+]] 32 0
 //
 // In SPIR-V, mem_fence is represented as OpMemoryBarrier [2] and OpenCL
diff --git a/llvm-spirv/test/transcoding/OpenCL/sub_group_barrier.cl b/llvm-spirv/test/transcoding/OpenCL/sub_group_barrier.cl
index d56db83b6d4ae..173991abd868e 100644
--- a/llvm-spirv/test/transcoding/OpenCL/sub_group_barrier.cl
+++ b/llvm-spirv/test/transcoding/OpenCL/sub_group_barrier.cl
@@ -31,7 +31,7 @@ __kernel void test_barrier_non_const_flags(cl_mem_fence_flags flags, memory_scop
   // sub_group_barrier(flags, scope);
 }
 
-// CHECK-SPIRV: EntryPoint [[#]] [[TEST_CONST_FLAGS:[0-9]+]] "test_barrier_const_flags"
+// CHECK-SPIRV: Name [[TEST_CONST_FLAGS:[0-9]+]] "test_barrier_const_flags"
 // CHECK-SPIRV: TypeInt [[UINT:[0-9]+]] 32 0
 //
 // In SPIR-V, barrier is represented as OpControlBarrier [2] and OpenCL
diff --git a/llvm-spirv/test/transcoding/OpenCL/work_group_barrier.cl b/llvm-spirv/test/transcoding/OpenCL/work_group_barrier.cl
index ac331a997b782..ec6c087f035a5 100644
--- a/llvm-spirv/test/transcoding/OpenCL/work_group_barrier.cl
+++ b/llvm-spirv/test/transcoding/OpenCL/work_group_barrier.cl
@@ -33,7 +33,7 @@ __kernel void test_barrier_non_const_flags(cl_mem_fence_flags flags, memory_scop
   // work_group_barrier(flags, scope);
 }
 
-// CHECK-SPIRV: EntryPoint [[#]] [[TEST_CONST_FLAGS:[0-9]+]] "test_barrier_const_flags"
+// CHECK-SPIRV: Name [[TEST_CONST_FLAGS:[0-9]+]] "test_barrier_const_flags"
 // CHECK-SPIRV: TypeInt [[UINT:[0-9]+]] 32 0
 //
 // In SPIR-V, barrier is represented as OpControlBarrier [2] and OpenCL
diff --git a/llvm-spirv/test/transcoding/SampledImage.cl b/llvm-spirv/test/transcoding/SampledImage.cl
index 22270fa89df5c..c37de46333bc3 100644
--- a/llvm-spirv/test/transcoding/SampledImage.cl
+++ b/llvm-spirv/test/transcoding/SampledImage.cl
@@ -27,8 +27,8 @@ void sample_kernel_int(image2d_t input, float2 coords, global int4 *results, sam
 }
 
 // CHECK-SPIRV: Capability LiteralSampler
-// CHECK-SPIRV: EntryPoint [[#]] [[sample_kernel_float:[0-9]+]] "sample_kernel_float"
-// CHECK-SPIRV: EntryPoint [[#]] [[sample_kernel_int:[0-9]+]] "sample_kernel_int"
+// CHECK-SPIRV: Name [[sample_kernel_float:[0-9]+]] "sample_kernel_float"
+// CHECK-SPIRV: Name [[sample_kernel_int:[0-9]+]] "sample_kernel_int"
 
 // CHECK-SPIRV: TypeSampler [[TypeSampler:[0-9]+]]
 // CHECK-SPIRV: TypeSampledImage [[SampledImageTy:[0-9]+]]
diff --git a/llvm-spirv/test/transcoding/enqueue_kernel.cl b/llvm-spirv/test/transcoding/enqueue_kernel.cl
index 54635a4a04a38..8e94f18ac890c 100644
--- a/llvm-spirv/test/transcoding/enqueue_kernel.cl
+++ b/llvm-spirv/test/transcoding/enqueue_kernel.cl
@@ -17,11 +17,11 @@
 // CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer5:[0-9]+]] "__device_side_enqueue_block_invoke_5_kernel"
 // CHECK-SPIRV: Name [[BlockGlb1:[0-9]+]] "__block_literal_global"
 // CHECK-SPIRV: Name [[BlockGlb2:[0-9]+]] "__block_literal_global.1"
-// CHECK-SPIRV: Name [[#InvokeFunc1:]] "__device_side_enqueue_block_invoke"
-// CHECK-SPIRV: Name [[#InvokeFunc2:]] "__device_side_enqueue_block_invoke_2"
-// CHECK-SPIRV: Name [[#InvokeFunc3:]] "__device_side_enqueue_block_invoke_3"
-// CHECK-SPIRV: Name [[#InvokeFunc4:]] "__device_side_enqueue_block_invoke_4"
-// CHECK-SPIRV: Name [[#InvokeFunc5:]] "__device_side_enqueue_block_invoke_5"
+// CHECK-SPIRV: Name [[#InvokeFunc1:]] "__device_side_enqueue_block_invoke_kernel"
+// CHECK-SPIRV: Name [[#InvokeFunc2:]] "__device_side_enqueue_block_invoke_2_kernel"
+// CHECK-SPIRV: Name [[#InvokeFunc3:]] "__device_side_enqueue_block_invoke_3_kernel"
+// CHECK-SPIRV: Name [[#InvokeFunc4:]] "__device_side_enqueue_block_invoke_4_kernel"
+// CHECK-SPIRV: Name [[#InvokeFunc5:]] "__device_side_enqueue_block_invoke_5_kernel"
 
 // CHECK-SPIRV: TypeInt [[Int32Ty:[0-9]+]] 32
 // CHECK-SPIRV: TypeInt [[Int8Ty:[0-9]+]] 8
@@ -65,7 +65,7 @@ kernel void device_side_enqueue(global int *a, global int *b, int i, char c0) {
   // CHECK-SPIRV: Bitcast [[Int8PtrGenTy]] [[BlockLit1:[0-9]+]]
   // CHECK-SPIRV: EnqueueKernel [[Int32Ty]] [[#]] [[#]] [[#]] [[#]]
   // CHECK-SPIRV-SAME: [[ConstInt0]] [[EventNull]] [[#]]
-  // CHECK-SPIRV-SAME: [[BlockKer1]] [[BlockLit1]] [[ConstInt17]] [[ConstInt8]]
+  // CHECK-SPIRV-SAME: [[#InvokeFunc1]] [[BlockLit1]] [[ConstInt17]] [[ConstInt8]]
 
   // CHECK-LLVM: [[Block2:%[0-9]+]] = addrspacecast ptr %block to ptr addrspace(4)
   // CHECK-LLVM: [[Block2Ptr:%[0-9]+]] = bitcast ptr addrspace(4) [[Block2]] to ptr addrspace(4)
@@ -85,7 +85,7 @@ kernel void device_side_enqueue(global int *a, global int *b, int i, char c0) {
   // CHECK-SPIRV: Bitcast [[Int8PtrGenTy]] [[BlockLit2:[0-9]+]]
   // CHECK-SPIRV: EnqueueKernel [[Int32Ty]] [[#]] [[#]] [[#]] [[#]]
   // CHECK-SPIRV-SAME: [[ConstInt2]] [[Event1]] [[Event2]]
-  // CHECK-SPIRV-SAME: [[BlockKer2]] [[BlockLit2]] [[ConstInt20]] [[ConstInt8]]
+  // CHECK-SPIRV-SAME: [[#InvokeFunc2]] [[BlockLit2]] [[ConstInt20]] [[ConstInt8]]
 
   // CHECK-LLVM: [[Block3:%[0-9]+]] = addrspacecast ptr %block4 to ptr addrspace(4)
   // CHECK-LLVM: [[Block3Ptr:%[0-9]+]] = bitcast ptr addrspace(4) [[Block3]] to ptr addrspace(4)
@@ -106,7 +106,7 @@ kernel void device_side_enqueue(global int *a, global int *b, int i, char c0) {
   // CHECK-SPIRV: Bitcast [[Int8PtrGenTy]] [[BlockLit3:[0-9]+]] [[BlockLit3Tmp]]
   // CHECK-SPIRV: EnqueueKernel [[Int32Ty]] [[#]] [[#]] [[#]] [[#]]
   // CHECK-SPIRV-SAME: [[ConstInt2]] [[Event1]] [[Event2]]
-  // CHECK-SPIRV-SAME: [[BlockKer3]] [[BlockLit3]] [[ConstInt12]] [[ConstInt8]]
+  // CHECK-SPIRV-SAME: [[#InvokeFunc3]] [[BlockLit3]] [[ConstInt12]] [[ConstInt8]]
   // CHECK-SPIRV-SAME: [[LocalBuf31]]
 
   // CHECK-LLVM: [[Block0Tmp:%[0-9]+]] = addrspacecast ptr addrspace(1) @__block_literal_global to ptr addrspace(4)
@@ -129,7 +129,7 @@ kernel void device_side_enqueue(global int *a, global int *b, int i, char c0) {
   // CHECK-SPIRV: Bitcast [[Int8PtrGenTy]] [[BlockLit4:[0-9]+]] [[BlockLit4Tmp]]
   // CHECK-SPIRV: EnqueueKernel [[Int32Ty]] [[#]] [[#]] [[#]] [[#]]
   // CHECK-SPIRV-SAME: [[ConstInt0]] [[#]] [[#]]
-  // CHECK-SPIRV-SAME: [[BlockKer4]] [[BlockLit4]] [[ConstInt12]] [[ConstInt8]]
+  // CHECK-SPIRV-SAME: [[#InvokeFunc4]] [[BlockLit4]] [[ConstInt12]] [[ConstInt8]]
   // CHECK-SPIRV-SAME: [[LocalBuf41]] [[LocalBuf42]] [[LocalBuf43]]
 
   // CHECK-LLVM: [[Block1Tmp:%[0-9]+]] = addrspacecast ptr addrspace(1) @__block_literal_global.1 to ptr addrspace(4)
@@ -150,7 +150,7 @@ kernel void device_side_enqueue(global int *a, global int *b, int i, char c0) {
   // CHECK-SPIRV: Bitcast [[Int8PtrGenTy]] [[BlockLit5:[0-9]+]]
   // CHECK-SPIRV: EnqueueKernel [[Int32Ty]] [[#]] [[#]] [[#]] [[#]]
   // CHECK-SPIRV-SAME: [[ConstInt0]] [[#]] [[Event1]]
-  // CHECK-SPIRV-SAME: [[BlockKer5]] [[BlockLit5]] [[ConstInt20]] [[ConstInt8]]
+  // CHECK-SPIRV-SAME: [[#InvokeFunc5]] [[BlockLit5]] [[ConstInt20]] [[ConstInt8]]
 
   // CHECK-LLVM: [[Block5:%[0-9]+]] = addrspacecast ptr %block15 to ptr addrspace(4)
   // CHECK-LLVM: [[Block5Ptr:%[0-9]+]] = bitcast ptr addrspace(4) [[Block5]] to ptr addrspace(4)
diff --git a/llvm-spirv/test/transcoding/kernel_arg_type_qual.ll b/llvm-spirv/test/transcoding/kernel_arg_type_qual.ll
index 0a2d486c4263f..0824b49e559d8 100644
--- a/llvm-spirv/test/transcoding/kernel_arg_type_qual.ll
+++ b/llvm-spirv/test/transcoding/kernel_arg_type_qual.ll
@@ -14,7 +14,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 target triple = "spir64-unknown-unknown."
 
 ; CHECK-SPIRV: String [[#]] "kernel_arg_type_qual.test.volatile,const,,"
-; CHECK-SPIRV: Name [[ARG:[0-9]+]] "g"
+; CHECK-SPIRV: Name [[ARG:1[0-9]+]] "g"
 ; CHECK-SPIRV: Decorate [[ARG]] Volatile
 ; CHECK-SPIRV-NEGATIVE-NOT: String [[#]] "kernel_arg_type_qual.test.volatile,const,,"
 
diff --git a/llvm-spirv/test/transcoding/kernel_query.ll b/llvm-spirv/test/transcoding/kernel_query.ll
index a5d3c88207f09..78015194f30d3 100644
--- a/llvm-spirv/test/transcoding/kernel_query.ll
+++ b/llvm-spirv/test/transcoding/kernel_query.ll
@@ -40,10 +40,10 @@ target triple = "spir-unknown-unknown"
 ; CHECK-SPIRV-DAG: Name [[BlockGlb2:[0-9]+]] "__block_literal_global.1"
 ; CHECK-SPIRV-DAG: Name [[BlockGlb3:[0-9]+]] "__block_literal_global.2"
 ; CHECK-SPIRV-DAG: Name [[BlockGlb4:[0-9]+]] "__block_literal_global.3"
-; CHECK-SPIRV-DAG: EntryPoint [[#]] [[BlockKer1:[0-9]+]] "__device_side_enqueue_block_invoke_kernel"
-; CHECK-SPIRV-DAG: EntryPoint [[#]] [[BlockKer2:[0-9]+]] "__device_side_enqueue_block_invoke_2_kernel"
-; CHECK-SPIRV-DAG: EntryPoint [[#]] [[BlockKer3:[0-9]+]] "__device_side_enqueue_block_invoke_3_kernel"
-; CHECK-SPIRV-DAG: EntryPoint [[#]] [[BlockKer4:[0-9]+]] "__device_side_enqueue_block_invoke_4_kernel"
+; CHECK-SPIRV-DAG: Name [[BlockKer1:[0-9]+]] "__device_side_enqueue_block_invoke_kernel"
+; CHECK-SPIRV-DAG: Name [[BlockKer2:[0-9]+]] "__device_side_enqueue_block_invoke_2_kernel"
+; CHECK-SPIRV-DAG: Name [[BlockKer3:[0-9]+]] "__device_side_enqueue_block_invoke_3_kernel"
+; CHECK-SPIRV-DAG: Name [[BlockKer4:[0-9]+]] "__device_side_enqueue_block_invoke_4_kernel"
 
 ; CHECK-LLVM: [[BlockTy:%[0-9a-z\.]+]] = type { i32, i32 }
 %1 = type <{ i32, i32 }>
diff --git a/llvm-spirv/test/transcoding/registerallocmode.ll b/llvm-spirv/test/transcoding/registerallocmode.ll
index f7b567e84502a..8c59d6be69e1e 100644
--- a/llvm-spirv/test/transcoding/registerallocmode.ll
+++ b/llvm-spirv/test/transcoding/registerallocmode.ll
@@ -4,11 +4,11 @@
 ; RUN: spirv-val %t.spv
 ; RUN: llvm-spirv -r %t.spv -o - | llvm-dis -o - | FileCheck %s --check-prefix=CHECK-LLVM
 
-; CHECK-SPIRV: EntryPoint [[#]] [[#FUNC0:]] "main_l3"
-; CHECK-SPIRV: EntryPoint [[#]] [[#FUNC1:]] "main_l6"
-; CHECK-SPIRV: EntryPoint [[#]] [[#FUNC2:]] "main_l9"
-; CHECK-SPIRV: EntryPoint [[#]] [[#FUNC3:]] "main_l13"
-; CHECK-SPIRV: EntryPoint [[#]] [[#FUNC4:]] "main_l19"
+; CHECK-SPIRV: Name [[#FUNC0:]] "main_l3"
+; CHECK-SPIRV: Name [[#FUNC1:]] "main_l6"
+; CHECK-SPIRV: Name [[#FUNC2:]] "main_l9"
+; CHECK-SPIRV: Name [[#FUNC3:]] "main_l13"
+; CHECK-SPIRV: Name [[#FUNC4:]] "main_l19"
 
 ; CHECK-SPIRV: Decorate [[#FUNC0]] UserSemantic "num-thread-per-eu 4"
 ; CHECK-SPIRV: Decorate [[#FUNC1]] UserSemantic "num-thread-per-eu 8"
@@ -19,8 +19,11 @@
 ; CHECK-LLVM: @[[FLAG0:[0-9]+]] = private unnamed_addr constant [20 x i8] c"num-thread-per-eu 4\00", section "llvm.metadata"
 ; CHECK-LLVM: @[[FLAG1:[0-9]+]] = private unnamed_addr constant [20 x i8] c"num-thread-per-eu 8\00", section "llvm.metadata"
 ; CHECK-LLVM: @[[FLAG2:[0-9]+]] = private unnamed_addr constant [20 x i8] c"num-thread-per-eu 0\00", section "llvm.metadata"
+; CHECK-LLVM: @[[FLAG3:[0-9]+]] = private unnamed_addr constant [20 x i8] c"num-thread-per-eu 4\00", section "llvm.metadata"
+; CHECK-LLVM: @[[FLAG4:[0-9]+]] = private unnamed_addr constant [20 x i8] c"num-thread-per-eu 8\00", section "llvm.metadata"
+; CHECK-LLVM: @[[FLAG5:[0-9]+]] = private unnamed_addr constant [20 x i8] c"num-thread-per-eu 0\00", section "llvm.metadata"
 
-; CHECK-LLVM: @llvm.global.annotations = appending global [3 x { ptr, ptr, ptr, i32, ptr }] [{ ptr, ptr, ptr, i32, ptr } { ptr @main_l3, ptr @[[FLAG0]], ptr undef, i32 undef, ptr undef }, { ptr, ptr, ptr, i32, ptr } { ptr @main_l6, ptr @[[FLAG1]], ptr undef, i32 undef, ptr undef }, { ptr, ptr, ptr, i32, ptr } { ptr @main_l9, ptr @[[FLAG2]], ptr undef, i32 undef, ptr undef }], section "llvm.metadata"
+; CHECK-LLVM: @llvm.global.annotations = appending global [6 x { ptr, ptr, ptr, i32, ptr }] [{ ptr, ptr, ptr, i32, ptr } { ptr @main_l3, ptr @[[FLAG0]], ptr undef, i32 undef, ptr undef }, { ptr, ptr, ptr, i32, ptr } { ptr @main_l6, ptr @[[FLAG1]], ptr undef, i32 undef, ptr undef }, { ptr, ptr, ptr, i32, ptr } { ptr @main_l9, ptr @[[FLAG2]], ptr undef, i32 undef, ptr undef }, { ptr, ptr, ptr, i32, ptr } { ptr @main_l3, ptr @[[FLAG3]], ptr undef, i32 undef, ptr undef }, { ptr, ptr, ptr, i32, ptr } { ptr @main_l6, ptr @[[FLAG4]], ptr undef, i32 undef, ptr undef }, { ptr, ptr, ptr, i32, ptr } { ptr @main_l9, ptr @[[FLAG5]], ptr undef, i32 undef, ptr undef }], section "llvm.metadata"
 
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
 target triple = "spir64"
diff --git a/llvm/lib/SYCLLowerIR/SYCLConditionalCallOnDevice.cpp b/llvm/lib/SYCLLowerIR/SYCLConditionalCallOnDevice.cpp
index b226437709b93..5620b068a062e 100644
--- a/llvm/lib/SYCLLowerIR/SYCLConditionalCallOnDevice.cpp
+++ b/llvm/lib/SYCLLowerIR/SYCLConditionalCallOnDevice.cpp
@@ -53,7 +53,7 @@ SYCLConditionalCallOnDevicePass::run(Module &M, ModuleAnalysisManager &) {
     // (FAction). FAction should be a literal (i.e. not a pointer). The
     // structure of the header file ensures that there is exactly one such
     // instruction.
-    bool CallFound = false;
+    [[maybe_unused]] bool CallFound = false;
     for (Instruction &I : instructions(FCaller)) {
       if (auto *CI = dyn_cast<CallInst>(&I);
           CI && (Intrinsic::IndependentIntrinsics::not_intrinsic ==
@@ -121,8 +121,8 @@ SYCLConditionalCallOnDevicePass::run(Module &M, ModuleAnalysisManager &) {
         Args.push_back(Call->getArgOperand(I));
 
       // Create the new call instruction
-      auto *NewCall =
-          CallInst::Create(NewFCaller, Args, /*	NameStr = */ "", Call);
+      auto *NewCall = CallInst::Create(NewFCaller, Args, /*	NameStr = */ "",
+                                       Call->getIterator());
       NewCall->setCallingConv(Call->getCallingConv());
       NewCall->setDebugLoc(Call->getDebugLoc());
 
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 3d5fede606f9f..2386fc83fa3c9 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -767,50 +767,110 @@ Constant *getOrCreateGlobalString(Module &M, StringRef Name, StringRef Value,
   });
 }
 
-static void extendSpirKernelArgs(Module &M) {
-  SmallVector<Constant *, 8> SpirKernelsMetadata;
+static bool isUnsupportedDeviceGlobal(const GlobalVariable *G) {
+  // True if G should not be instrumented (by name prefix or address space).
+  if (G->getName().starts_with("__Msan")) // e.g. "__MsanKernelMetadata"
+    return true;
+  if (G->getName().starts_with("__spirv_BuiltIn")) // e.g. GlobalInvocationId
+    return true;
+  if (G->getName().starts_with("__usid_str"))
+    return true; // NOTE(review): origin of "__usid_str" globals - confirm
+  if (G->getAddressSpace() == kSpirOffloadLocalAS ||
+      G->getAddressSpace() == kSpirOffloadConstantAS)
+    return true; // globals in the local or constant address space
+  return false; // anything else is treated as a supported device global
+}
+
+static void instrumentSPIRModule(Module &M) { // emits MSan metadata globals
 
   const auto &DL = M.getDataLayout();
   Type *IntptrTy = DL.getIntPtrType(M.getContext());
 
-  // SpirKernelsMetadata only saves fixed kernels, and is described by
-  // following structure:
-  //  uptr unmangled_kernel_name
-  //  uptr unmangled_kernel_name_size
-  StructType *StructTy = StructType::get(IntptrTy, IntptrTy);
-  for (Function &F : M) {
-    if (F.getCallingConv() != CallingConv::SPIR_KERNEL)
-      continue;
+  // Emit __MsanKernelMetadata, which records the name (address and length)
+  // of every sanitized kernel.
+  {
+    SmallVector<Constant *, 8> SpirKernelsMetadata;
+
+    // SpirKernelsMetadata only saves fixed kernels, and is described by the
+    // following structure:
+    //  uptr unmangled_kernel_name
+    //  uptr unmangled_kernel_name_size
+    StructType *StructTy = StructType::get(IntptrTy, IntptrTy);
+    for (Function &F : M) {
+      if (F.getCallingConv() != CallingConv::SPIR_KERNEL)
+        continue;
 
-    if (!F.hasFnAttribute(Attribute::SanitizeMemory) ||
-        F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation))
-      continue;
+      if (!F.hasFnAttribute(Attribute::SanitizeMemory) ||
+          F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation))
+        continue; // this kernel is not sanitized, so it is not recorded
 
-    auto KernelName = F.getName();
-    auto *KernelNameGV = getOrCreateGlobalString(M, "__msan_kernel", KernelName,
-                                                 kSpirOffloadConstantAS);
-    SpirKernelsMetadata.emplace_back(ConstantStruct::get(
-        StructTy, ConstantExpr::getPointerCast(KernelNameGV, IntptrTy),
-        ConstantInt::get(IntptrTy, KernelName.size())));
-  }
-
-  // Create global variable to record spirv kernels' information
-  ArrayType *ArrayTy = ArrayType::get(StructTy, SpirKernelsMetadata.size());
-  Constant *MetadataInitializer =
-      ConstantArray::get(ArrayTy, SpirKernelsMetadata);
-  GlobalVariable *MsanSpirKernelMetadata = new GlobalVariable(
-      M, MetadataInitializer->getType(), false, GlobalValue::AppendingLinkage,
-      MetadataInitializer, "__MsanKernelMetadata", nullptr,
-      GlobalValue::NotThreadLocal, 1);
-  MsanSpirKernelMetadata->setUnnamedAddr(GlobalValue::UnnamedAddr::Local);
-  // Add device global attributes
-  MsanSpirKernelMetadata->addAttribute(
-      "sycl-device-global-size", std::to_string(DL.getTypeAllocSize(ArrayTy)));
-  MsanSpirKernelMetadata->addAttribute("sycl-device-image-scope");
-  MsanSpirKernelMetadata->addAttribute("sycl-host-access", "0"); // read only
-  MsanSpirKernelMetadata->addAttribute("sycl-unique-id",
-                                       "_Z20__MsanKernelMetadata");
-  MsanSpirKernelMetadata->setDSOLocal(true);
+      auto KernelName = F.getName();
+      auto *KernelNameGV = getOrCreateGlobalString(
+          M, "__msan_kernel", KernelName, kSpirOffloadConstantAS);
+      SpirKernelsMetadata.emplace_back(ConstantStruct::get(
+          StructTy, ConstantExpr::getPointerCast(KernelNameGV, IntptrTy),
+          ConstantInt::get(IntptrTy, KernelName.size())));
+    }
+
+    // Create the global variable that records SPIR-V kernels' information
+    ArrayType *ArrayTy = ArrayType::get(StructTy, SpirKernelsMetadata.size());
+    Constant *MetadataInitializer =
+        ConstantArray::get(ArrayTy, SpirKernelsMetadata);
+    GlobalVariable *MsanSpirKernelMetadata = new GlobalVariable(
+        M, MetadataInitializer->getType(), false, GlobalValue::AppendingLinkage,
+        MetadataInitializer, "__MsanKernelMetadata", nullptr,
+        GlobalValue::NotThreadLocal, 1);
+    MsanSpirKernelMetadata->setUnnamedAddr(GlobalValue::UnnamedAddr::Local);
+    // Add device global attributes
+    MsanSpirKernelMetadata->addAttribute(
+        "sycl-device-global-size",
+        std::to_string(DL.getTypeAllocSize(ArrayTy)));
+    MsanSpirKernelMetadata->addAttribute("sycl-device-image-scope");
+    MsanSpirKernelMetadata->addAttribute("sycl-host-access",
+                                         "0"); // read only
+    MsanSpirKernelMetadata->addAttribute("sycl-unique-id",
+                                         "_Z20__MsanKernelMetadata");
+    MsanSpirKernelMetadata->setDSOLocal(true);
+  }
+
+  // Handle global variables:
+  //   - Mark instruction users of unsupported variables as no-sanitize
+  //   - Emit __MsanDeviceGlobalMetadata for the remaining device globals
+  do { // single-pass loop so the empty case below can 'break' out
+    SmallVector<Constant *, 8> DeviceGlobalMetadata;
+
+    // Each device global metadata entry is described by the structure
+    //  size_t device_global_size
+    //  size_t beginning address of the device global
+    StructType *StructTy = StructType::get(IntptrTy, IntptrTy);
+
+    for (auto &G : M.globals()) {
+      if (isUnsupportedDeviceGlobal(&G)) {
+        for (auto *User : G.users())
+          if (auto *Inst = dyn_cast<Instruction>(User)) // direct users only
+            Inst->setNoSanitizeMetadata();
+        continue;
+      }
+
+      DeviceGlobalMetadata.push_back(ConstantStruct::get(
+          StructTy,
+          ConstantInt::get(IntptrTy, DL.getTypeAllocSize(G.getValueType())),
+          ConstantExpr::getPointerCast(&G, IntptrTy)));
+    }
+
+    if (DeviceGlobalMetadata.empty())
+      break; // nothing to record, so no metadata global is created
+
+    // Create the metadata global that records device globals' information
+    ArrayType *ArrayTy = ArrayType::get(StructTy, DeviceGlobalMetadata.size());
+    Constant *MetadataInitializer =
+        ConstantArray::get(ArrayTy, DeviceGlobalMetadata);
+    GlobalVariable *MsanDeviceGlobalMetadata = new GlobalVariable(
+        M, MetadataInitializer->getType(), false, GlobalValue::AppendingLinkage,
+        MetadataInitializer, "__MsanDeviceGlobalMetadata", nullptr,
+        GlobalValue::NotThreadLocal, 1);
+    MsanDeviceGlobalMetadata->setUnnamedAddr(GlobalValue::UnnamedAddr::Local);
+  } while (false);
 }
 
 PreservedAnalyses MemorySanitizerPass::run(Module &M,
@@ -827,7 +887,7 @@ PreservedAnalyses MemorySanitizerPass::run(Module &M,
   }
 
   if (TargetTriple.isSPIROrSPIRV()) {
-    extendSpirKernelArgs(M);
+    instrumentSPIRModule(M);
     Modified = true;
   }
 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/SPIRV/instrument_device_global.ll b/llvm/test/Instrumentation/MemorySanitizer/SPIRV/instrument_device_global.ll
new file mode 100644
index 0000000000000..39c2775a923c2
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/SPIRV/instrument_device_global.ll
@@ -0,0 +1,10 @@
+; RUN: opt < %s -passes=msan -msan-instrumentation-with-call-threshold=0 -msan-eager-checks=1 -S | FileCheck %s
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1"
+target triple = "spir64-unknown-unknown"
+
+@.str = external addrspace(1) constant [59 x i8]
+@__spirv_BuiltInGlobalInvocationId = external addrspace(1) constant <3 x i64>
+
+; CHECK: @__MsanDeviceGlobalMetadata
+; CHECK-NOT: @__spirv_BuiltInGlobalInvocationId
+; CHECK-SAME: @.str
diff --git a/sycl-jit/test/internalization/promote-private-non-unit-hip.ll b/sycl-jit/test/internalization/promote-private-non-unit-hip.ll
index 592987b6ae0d6..2037979b7e89f 100644
--- a/sycl-jit/test/internalization/promote-private-non-unit-hip.ll
+++ b/sycl-jit/test/internalization/promote-private-non-unit-hip.ll
@@ -1,4 +1,4 @@
-; REQUIRES: hip_amd
+; REQUIRES: hip
 ; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelJIT%shlibext \
 ; RUN: -passes=sycl-internalization -S %s | FileCheck %s
 
diff --git a/sycl-jit/test/kernel-fusion/check-failed-remapping-amdgpu.ll b/sycl-jit/test/kernel-fusion/check-failed-remapping-amdgpu.ll
index 69b9ab3b7f293..8dd7784902909 100644
--- a/sycl-jit/test/kernel-fusion/check-failed-remapping-amdgpu.ll
+++ b/sycl-jit/test/kernel-fusion/check-failed-remapping-amdgpu.ll
@@ -1,4 +1,4 @@
-; REQUIRES: hip_amd
+; REQUIRES: hip
 ; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelJIT%shlibext \
 ; RUN:   -passes=sycl-kernel-fusion -S %s | FileCheck %s
 
diff --git a/sycl-jit/test/kernel-fusion/check-remapping-amdgpu.ll b/sycl-jit/test/kernel-fusion/check-remapping-amdgpu.ll
index 52e4710fd0c2d..f142e9d89322b 100644
--- a/sycl-jit/test/kernel-fusion/check-remapping-amdgpu.ll
+++ b/sycl-jit/test/kernel-fusion/check-remapping-amdgpu.ll
@@ -1,4 +1,4 @@
-; REQUIRES: hip_amd
+; REQUIRES: hip
 ; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelJIT%shlibext \
 ; RUN:   -passes=sycl-kernel-fusion -S %s | FileCheck %s
 
diff --git a/sycl-jit/test/kernel-fusion/check-remapping-interproc-amdgpu.ll b/sycl-jit/test/kernel-fusion/check-remapping-interproc-amdgpu.ll
index 7a214eef3b2dd..65a843935933b 100644
--- a/sycl-jit/test/kernel-fusion/check-remapping-interproc-amdgpu.ll
+++ b/sycl-jit/test/kernel-fusion/check-remapping-interproc-amdgpu.ll
@@ -1,4 +1,4 @@
-; REQUIRES: hip_amd
+; REQUIRES: hip
 ; RUN: opt -load-pass-plugin %shlibdir/SYCLKernelJIT%shlibext \
 ; RUN:   -passes=sycl-kernel-fusion -S %s | FileCheck %s
 
diff --git a/sycl-jit/test/lit.cfg.py b/sycl-jit/test/lit.cfg.py
index fffa59585ef0e..3abb6c1d1b2d6 100644
--- a/sycl-jit/test/lit.cfg.py
+++ b/sycl-jit/test/lit.cfg.py
@@ -27,4 +27,4 @@
 if "NVPTX" in config.llvm_targets_to_build:
     config.available_features.add("cuda")
 if "AMDGPU" in config.llvm_targets_to_build:
-    config.available_features.add("hip_amd")
+    config.available_features.add("hip")
diff --git a/sycl-jit/test/materializer/basic.ll b/sycl-jit/test/materializer/basic.ll
index 524322116a384..2885eb77bde78 100644
--- a/sycl-jit/test/materializer/basic.ll
+++ b/sycl-jit/test/materializer/basic.ll
@@ -1,4 +1,4 @@
-; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelJIT%shlibext\
+; RUN: %if hip %{ opt -load-pass-plugin %shlibdir/SYCLKernelJIT%shlibext\
 ; RUN: --mtriple amdgcn-amd-amdhsa -passes=sycl-spec-const-materializer -S %s |\
 ; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER %s %}
 
@@ -6,7 +6,7 @@
 ; RUN: --mtriple nvptx64-nvidia-cuda -passes=sycl-spec-const-materializer -S %s |\
 ; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER %s %}
 
-; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelJIT%shlibext\
+; RUN: %if hip %{ opt -load-pass-plugin %shlibdir/SYCLKernelJIT%shlibext\
 ; RUN: --mtriple amdgcn-amd-amdhsa -passes=sycl-spec-const-materializer,early-cse,adce -S %s |\
 ; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER-CSE %s %}
 
diff --git a/sycl-jit/test/materializer/multi_type.ll b/sycl-jit/test/materializer/multi_type.ll
index f69bd057748e0..65204e3f12e64 100644
--- a/sycl-jit/test/materializer/multi_type.ll
+++ b/sycl-jit/test/materializer/multi_type.ll
@@ -1,4 +1,4 @@
-; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelJIT%shlibext\
+; RUN: %if hip %{ opt -load-pass-plugin %shlibdir/SYCLKernelJIT%shlibext\
 ; RUN: --mtriple amdgcn-amd-amdhsa -passes=sycl-spec-const-materializer -S %s |\
 ; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER %s %}
 
@@ -6,7 +6,7 @@
 ; RUN: --mtriple nvptx64-nvidia-cuda -passes=sycl-spec-const-materializer -S %s |\
 ; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER %s %}
 
-; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelJIT%shlibext\
+; RUN: %if hip %{ opt -load-pass-plugin %shlibdir/SYCLKernelJIT%shlibext\
 ; RUN: --mtriple amdgcn-amd-amdhsa -passes=sycl-spec-const-materializer,early-cse -S %s |\
 ; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER-CSE %s %}
 
diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake
index de4a5aa14251b..526683c9cdf97 100644
--- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake
+++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake
@@ -1,8 +1,7 @@
-# commit 7eae5c80a9e969bc12fda57c9cc0a0970f0cd17f
-# Merge: 9c652ffb b78cfa71
-# Author: Ross Brunton <ross@codeplay.com>
-# Date:   Thu Jan 9 17:28:00 2025 +0000
-# Merge pull request #2048 from RossBrunton/ross/refc
-#
-# Use reference counting on factories
-set(UNIFIED_RUNTIME_TAG 7eae5c80a9e969bc12fda57c9cc0a0970f0cd17f)
+# commit 9e48f543b8dd39d45563169433bb529583625dfe
+# Merge: 6a3fece6 1a1108b3
+# Author: Martin Grant <martin.morrisongrant@codeplay.com>
+# Date:   Wed Jan 15 14:33:29 2025 +0000
+#     Merge pull request #2540 from martygrant/martin/program-info-unswitch
+#     Move urProgramGetInfo success test from a switch to individual tests.
+set(UNIFIED_RUNTIME_TAG 9e48f543b8dd39d45563169433bb529583625dfe)
diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_current_device.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_current_device.asciidoc
new file mode 100755
index 0000000000000..d50271681385a
--- /dev/null
+++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_current_device.asciidoc
@@ -0,0 +1,140 @@
+= sycl_ext_oneapi_current_device
+
+
+:source-highlighter: coderay
+:coderay-linenums-mode: table
+
+// This section needs to be after the document title.
+:doctype: book
+:toc2:
+:toc: left
+:encoding: utf-8
+:lang: en
+:dpcpp: pass:[DPC++]
+
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes.  This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
+
+
+== Notice
+
+[%hardbreaks]
+Copyright (C) 2024 Intel Corporation.  All rights reserved.
+
+Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
+of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc. used by
+permission by Khronos.
+
+
+== Contact
+
+To report problems with this extension, please open a new issue at:
+
+https://github.com/intel/llvm/issues
+
+
+== Dependencies
+
+This extension is written against the SYCL 2020 revision 9 specification.  All
+references below to the "core SYCL specification" or to section numbers in the
+SYCL specification refer to that revision.
+
+== Status
+
+This is a proposed extension specification, intended to gather community
+feedback.  Interfaces defined in this specification may not be implemented yet
+or may be in a preliminary state.  The specification itself may also change in
+incompatible ways before it is finalized.  *Shipping software products should
+not rely on APIs defined in this specification.*
+
+== Overview
+
+This extension introduces additional state into SYCL in order to simplify
+programming for developers. The extension provides a mechanism to both set and
+query the 'current' per-thread `sycl::device`. By adding the notion of a 'current'
+device, this can simplify interfaces and reduce the amount of boilerplate code
+required to write a SYCL application.
+
+Since these functions relate to the environment of the calling thread,
+it is the user's responsibility to ensure that they are called by the correct thread.
+For example, it is unsafe to call these functions inside of a host task, within an
+asynchronous error handler, or within other functions that may be executed
+asynchronously, since these operations are not guaranteed to execute on any
+specific thread.
+
+== Specification
+
+=== Feature test macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification.  An implementation supporting this extension must predefine the
+macro `SYCL_EXT_ONEAPI_CURRENT_DEVICE` to one of the values defined in the table
+below.  Applications can test for the existence of this macro to determine if
+the implementation supports this feature, or applications can test the macro's
+value to determine which of the extension's features the implementation
+supports.
+
+[%header,cols="1,5"]
+|===
+|Value
+|Description
+
+|1
+|Initial version of this extension.
+|===
+
+=== New free functions
+
+This extension adds the following new free functions:
+
+
+'''
+
+[frame=all,grid=none,separator="@"]
+!====
+a@
+[source,c++]
+----
+namespace sycl::ext::oneapi::experimental::this_thread {
+
+sycl::device get_current_device();
+
+} // namespace sycl::ext::oneapi::experimental::this_thread
+----
+!====
+
+_Returns:_ The current default device for the calling host thread. If
+`set_current_device()` has not been called by this thread, returns the
+device selected by the default device selector.
+
+_Preconditions:_ The function is called from a host thread, executing
+outside of a host task or an asynchronous error handler.
+
+'''
+
+[frame=all,grid=none,separator="@"]
+!====
+a@
+[source,c++]
+----
+namespace sycl::ext::oneapi::experimental::this_thread {
+
+void set_current_device(sycl::device dev);
+
+} // namespace sycl::ext::oneapi::experimental::this_thread
+----
+!====
+
+_Effects:_ Sets the current default device to `dev` for the calling host thread.
+
+_Preconditions:_ The function is called from a host thread, executing outside
+of a host task or an asynchronous error handler.
+
+== Issues
+. [RESOLVED] Should the current device be global or should we also support a per-thread
+   device? Answer: It should be per-thread to align with the behavior of other programming
+   models.
+. [OPEN] We want to add a default queue per device. Should this queue be in-order or out-of-order?
+   Do we want to allow the user to specify this?
\ No newline at end of file
diff --git a/sycl/include/sycl/access/access.hpp b/sycl/include/sycl/access/access.hpp
index e5c8670e7fda8..3f0049d1c9950 100644
--- a/sycl/include/sycl/access/access.hpp
+++ b/sycl/include/sycl/access/access.hpp
@@ -350,15 +350,6 @@ address_space_cast_is_possible(access::address_space Src,
 
 template <access::address_space Space, typename ElementType>
 auto static_address_cast(ElementType *Ptr) {
-  constexpr auto generic_space = access::address_space::generic_space;
-  constexpr auto global_space = access::address_space::global_space;
-  constexpr auto local_space = access::address_space::local_space;
-  constexpr auto private_space = access::address_space::private_space;
-  constexpr auto global_device =
-      access::address_space::ext_intel_global_device_space;
-  constexpr auto global_host =
-      access::address_space::ext_intel_global_host_space;
-
   constexpr auto SrcAS = deduce_AS<ElementType *>::value;
   static_assert(address_space_cast_is_possible(SrcAS, Space));
 
@@ -367,31 +358,7 @@ auto static_address_cast(ElementType *Ptr) {
 
   // Note: reinterpret_cast isn't enough for some of the casts between different
   // address spaces, use C-style cast instead.
-#if !defined(__SPIR__)
   return (dst_type)Ptr;
-#else
-  if constexpr (SrcAS != generic_space) {
-    return (dst_type)Ptr;
-  } else if constexpr (Space == global_space) {
-    return (dst_type)__spirv_GenericCastToPtr_ToGlobal(
-        Ptr, __spv::StorageClass::CrossWorkgroup);
-  } else if constexpr (Space == local_space) {
-    return (dst_type)__spirv_GenericCastToPtr_ToLocal(
-        Ptr, __spv::StorageClass::Workgroup);
-  } else if constexpr (Space == private_space) {
-    return (dst_type)__spirv_GenericCastToPtr_ToPrivate(
-        Ptr, __spv::StorageClass::Function);
-#if !defined(__ENABLE_USM_ADDR_SPACE__)
-  } else if constexpr (Space == global_device || Space == global_host) {
-    // If __ENABLE_USM_ADDR_SPACE__ isn't defined then both
-    // global_device/global_host are just aliases for global_space.
-    return (dst_type)__spirv_GenericCastToPtr_ToGlobal(
-        Ptr, __spv::StorageClass::CrossWorkgroup);
-#endif
-  } else {
-    return (dst_type)Ptr;
-  }
-#endif
 }
 
 // Previous implementation (`castAS`, used in `multi_ptr` ctors among other
@@ -427,14 +394,13 @@ auto dynamic_address_cast(ElementType *Ptr) {
 #if defined(__ENABLE_USM_ADDR_SPACE__)
     static_assert(SupressNotImplementedAssert || Space != Space,
                   "Not supported yet!");
-    return static_address_cast<Space>(Ptr);
+    return detail::static_address_cast<Space>(Ptr);
 #else
     // If __ENABLE_USM_ADDR_SPACE__ isn't defined then both
     // global_device/global_host are just aliases for global_space.
     static_assert(std::is_same_v<dst_type, ElementType *>);
     return (dst_type)Ptr;
 #endif
-#if defined(__SPIR__)
   } else if constexpr (Space == global_space) {
     return (dst_type)__spirv_GenericCastToPtrExplicit_ToGlobal(
         Ptr, __spv::StorageClass::CrossWorkgroup);
@@ -449,12 +415,11 @@ auto dynamic_address_cast(ElementType *Ptr) {
                        (Space == global_device || Space == global_host)) {
     return (dst_type)__spirv_GenericCastToPtrExplicit_ToGlobal(
         Ptr, __spv::StorageClass::CrossWorkgroup);
-#endif
 #endif
   } else {
     static_assert(SupressNotImplementedAssert || Space != Space,
                   "Not supported yet!");
-    return static_address_cast<Space>(Ptr);
+    return detail::static_address_cast<Space>(Ptr);
   }
 }
 #else  // __SYCL_DEVICE_ONLY__
diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp
index 4bae5c59bb6bb..d8afc90b48d85 100644
--- a/sycl/source/detail/helpers.cpp
+++ b/sycl/source/detail/helpers.cpp
@@ -66,7 +66,7 @@ retrieveKernelBinary(const QueueImplPtr &Queue, const char *KernelName,
     auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
     ur_program_handle_t Program =
         detail::ProgramManager::getInstance().createURProgram(
-            **DeviceImage, Context, {Device});
+            **DeviceImage, Context, {std::move(Device)});
     return {*DeviceImage, Program};
   }
 
diff --git a/sycl/source/detail/persistent_device_code_cache.hpp b/sycl/source/detail/persistent_device_code_cache.hpp
index 48ef6e15b6fce..9346461c9229f 100644
--- a/sycl/source/detail/persistent_device_code_cache.hpp
+++ b/sycl/source/detail/persistent_device_code_cache.hpp
@@ -208,21 +208,23 @@ class PersistentDeviceCodeCache {
                                       const ur_program_handle_t &NativePrg);
 
   /* Sends message to std:cerr stream when SYCL_CACHE_TRACE environemnt is set*/
-  static void trace(const std::string &msg, std::string path = "") {
+  static void trace(const std::string &msg, const std::string &path = "") {
     static const bool traceEnabled =
         SYCLConfig<SYCL_CACHE_TRACE>::isTraceDiskCache();
     if (traceEnabled) {
-      std::replace(path.begin(), path.end(), '\\', '/');
-      std::cerr << "[Persistent Cache]: " << msg << path << std::endl;
+      auto outputPath = path;
+      std::replace(outputPath.begin(), outputPath.end(), '\\', '/');
+      std::cerr << "[Persistent Cache]: " << msg << outputPath << std::endl;
     }
   }
   static void trace_KernelCompiler(const std::string &msg,
-                                   std::string path = "") {
+                                   const std::string &path = "") {
     static const bool traceEnabled =
         SYCLConfig<SYCL_CACHE_TRACE>::isTraceKernelCompiler();
     if (traceEnabled) {
-      std::replace(path.begin(), path.end(), '\\', '/');
-      std::cerr << "[kernel_compiler Persistent Cache]: " << msg << path
+      auto outputPath = path;
+      std::replace(outputPath.begin(), outputPath.end(), '\\', '/');
+      std::cerr << "[kernel_compiler Persistent Cache]: " << msg << outputPath
                 << std::endl;
     }
   }
diff --git a/sycl/test-e2e/Adapters/enqueue-arg-order-buffer.cpp b/sycl/test-e2e/Adapters/enqueue-arg-order-buffer.cpp
index 468069e275a29..07fd78194ef71 100644
--- a/sycl/test-e2e/Adapters/enqueue-arg-order-buffer.cpp
+++ b/sycl/test-e2e/Adapters/enqueue-arg-order-buffer.cpp
@@ -1,4 +1,3 @@
-// UNSUPPORTED: hip_nvidia
 // RUN: %{build} -Wno-error=deprecated-declarations -o %t.out
 // RUN: env SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s
 
diff --git a/sycl/test-e2e/AddressCast/dynamic_address_cast.cpp b/sycl/test-e2e/AddressCast/dynamic_address_cast.cpp
index 441fe486564b3..3bdefc533ea5b 100644
--- a/sycl/test-e2e/AddressCast/dynamic_address_cast.cpp
+++ b/sycl/test-e2e/AddressCast/dynamic_address_cast.cpp
@@ -7,9 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // Issue with OpenCL CPU runtime implementation of OpGenericCastToPtrExplicit
-// OpGenericCastToPtr* intrinsics not implemented on AMD or NVIDIA
 // FPGA emulator affected by same issue as OpenCL CPU runtime
-// UNSUPPORTED: cpu, hip, cuda, accelerator
+// UNSUPPORTED: cpu, accelerator
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/AddressSanitizer/lit.local.cfg b/sycl/test-e2e/AddressSanitizer/lit.local.cfg
index d768697d07f6d..233ba3789467e 100644
--- a/sycl/test-e2e/AddressSanitizer/lit.local.cfg
+++ b/sycl/test-e2e/AddressSanitizer/lit.local.cfg
@@ -8,5 +8,8 @@ config.substitutions.append(
     ("%force_device_asan_rt", "env UR_ENABLE_LAYERS=UR_LAYER_ASAN")
 )
 
+if "-fsanitize=memory" in config.cxx_flags:
+    config.unsupported=True
+
 # https://github.com/intel/llvm/issues/15953
 config.unsupported_features += ['gpu-intel-gen12']
diff --git a/sycl/test-e2e/Assert/assert_in_kernels_ndebug.cpp b/sycl/test-e2e/Assert/assert_in_kernels_ndebug.cpp
index 4ca45de3d54f3..7c02b92f0aad9 100644
--- a/sycl/test-e2e/Assert/assert_in_kernels_ndebug.cpp
+++ b/sycl/test-e2e/Assert/assert_in_kernels_ndebug.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} -DNDEBUG %S/assert_in_kernels.cpp -o %t.out
+// RUN: %clangxx -fsycl %{sycl_target_opts} -DNDEBUG %S/assert_in_kernels.cpp -o %t.out
 // RUN: %{run} %t.out | FileCheck %s
 //
 // CHECK-NOT: One shouldn't see this message
diff --git a/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp b/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp
index 8ffea706d4f58..3b66660b8c2b5 100644
--- a/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp
+++ b/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp
@@ -9,7 +9,7 @@
 // XFAIL: (opencl && gpu)
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/11364
 
-// RUN: %clangxx -DSYCL_FALLBACK_ASSERT=1 -fsycl -fsycl-targets=%{sycl_triple} -DDEFINE_NDEBUG_INFILE2 -I %S/Inputs %S/assert_in_multiple_tus.cpp %S/Inputs/kernels_in_file2.cpp -o %t.out
+// RUN: %clangxx -DSYCL_FALLBACK_ASSERT=1 -fsycl %{sycl_target_opts} -DDEFINE_NDEBUG_INFILE2 -I %S/Inputs %S/assert_in_multiple_tus.cpp %S/Inputs/kernels_in_file2.cpp -o %t.out
 // Shouldn't fail on ACC as fallback assert isn't enqueued there
 // RUN: %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if fpga %{ --check-prefix=CHECK-ACC %}
 //
diff --git a/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug_win.cpp b/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug_win.cpp
index 2cbc05540fa69..9e02e01681190 100644
--- a/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug_win.cpp
+++ b/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug_win.cpp
@@ -1,5 +1,5 @@
 // REQUIRES: windows
-// RUN: %clangxx -DSYCL_FALLBACK_ASSERT=1 -fsycl -fsycl-targets=%{sycl_triple} -DDEFINE_NDEBUG_INFILE2 -I %S/Inputs %S/assert_in_multiple_tus.cpp %S/Inputs/kernels_in_file2.cpp -o %t.out
+// RUN: %clangxx -DSYCL_FALLBACK_ASSERT=1 -fsycl %{sycl_target_opts} -DDEFINE_NDEBUG_INFILE2 -I %S/Inputs %S/assert_in_multiple_tus.cpp %S/Inputs/kernels_in_file2.cpp -o %t.out
 // Shouldn't fail on ACC as fallback assert isn't enqueued there
 // RUN: %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if fpga %{ --check-prefix=CHECK-ACC %}
 //
diff --git a/sycl/test-e2e/Assert/assert_in_one_kernel_ndebug.cpp b/sycl/test-e2e/Assert/assert_in_one_kernel_ndebug.cpp
index 1f68c39e08438..0180ec3752ec6 100644
--- a/sycl/test-e2e/Assert/assert_in_one_kernel_ndebug.cpp
+++ b/sycl/test-e2e/Assert/assert_in_one_kernel_ndebug.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} -DNDEBUG  %S/assert_in_one_kernel.cpp -o %t.out
+// RUN: %clangxx -fsycl %{sycl_target_opts} -DNDEBUG  %S/assert_in_one_kernel.cpp -o %t.out
 // RUN: %{run} %t.out | FileCheck %s
 //
 // CHECK-NOT: from assert statement
diff --git a/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus_one_ndebug.cpp b/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus_one_ndebug.cpp
index 731187fe4cdf7..e7f419cedf7e7 100644
--- a/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus_one_ndebug.cpp
+++ b/sycl/test-e2e/Assert/assert_in_simultaneously_multiple_tus_one_ndebug.cpp
@@ -5,7 +5,7 @@
 // XFAIL: (opencl && gpu)
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/11364
 //
-// RUN: %clangxx -DSYCL_FALLBACK_ASSERT=1 -fsycl -fsycl-targets=%{sycl_triple} -DDEFINE_NDEBUG_INFILE2 -I %S/Inputs %S/assert_in_simultaneously_multiple_tus.cpp %S/Inputs/kernels_in_file2.cpp -o %t.out %threads_lib
+// RUN: %clangxx -DSYCL_FALLBACK_ASSERT=1 -fsycl %{sycl_target_opts} -DDEFINE_NDEBUG_INFILE2 -I %S/Inputs %S/assert_in_simultaneously_multiple_tus.cpp %S/Inputs/kernels_in_file2.cpp -o %t.out %threads_lib
 // RUN: %if cpu %{ %{run} %t.out &> %t.cpu.txt ; FileCheck %s --input-file %t.cpu.txt %}
 //
 // Since this is a multi-threaded application enable memory tracking and
diff --git a/sycl/test-e2e/AtomicRef/assignment_atomic64_generic.cpp b/sycl/test-e2e/AtomicRef/assignment_atomic64_generic.cpp
index 1469e40139e07..c0bfa74ad33d5 100644
--- a/sycl/test-e2e/AtomicRef/assignment_atomic64_generic.cpp
+++ b/sycl/test-e2e/AtomicRef/assignment_atomic64_generic.cpp
@@ -2,7 +2,7 @@
 // RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
 // RUN: %{run} %t.out
 
-// UNSUPPORTED: hip_amd
+// UNSUPPORTED: hip
 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15791
 
 #include "assignment.h"
diff --git a/sycl/test-e2e/AtomicRef/exchange.cpp b/sycl/test-e2e/AtomicRef/exchange.cpp
index 0252142480c52..7e405689c65c8 100644
--- a/sycl/test-e2e/AtomicRef/exchange.cpp
+++ b/sycl/test-e2e/AtomicRef/exchange.cpp
@@ -1,7 +1,7 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-// UNSUPPORTED: hip_amd
+// UNSUPPORTED: hip
 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15791
 
 #include "exchange.h"
diff --git a/sycl/test-e2e/BFloat16/bfloat16_builtins.cpp b/sycl/test-e2e/BFloat16/bfloat16_builtins.cpp
index 9c69e0cd7bf71..7d58e048519f4 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_builtins.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_builtins.cpp
@@ -5,11 +5,11 @@
 // + below sm_80 always uses generic impls
 
 // DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%}
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_80 %} %s -o %t.out %{mathflags}
+// RUN: %clangxx -fsycl %{sycl_target_opts} %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_80 %} %s -o %t.out %{mathflags}
 // RUN: %{run} %t.out
 
 // Test "new" (ABI breaking) for all platforms ( sm_80/native if CUDA )
-// RUN:  %if preview-breaking-changes-supported %{  %clangxx -fsycl -fpreview-breaking-changes -fsycl-targets=%{sycl_triple} %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_80 %} %s -o %t2.out %{mathflags} %}
+// RUN:  %if preview-breaking-changes-supported %{  %clangxx -fsycl -fpreview-breaking-changes %{sycl_target_opts} %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_80 %} %s -o %t2.out %{mathflags} %}
 // RUN:  %if preview-breaking-changes-supported %{  %{run} %t2.out  %}
 
 #include "bfloat16_builtins.hpp"
diff --git a/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp b/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp
index 6db30932609f5..719bf4709ae4c 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_builtins_cuda_generic.cpp
@@ -7,7 +7,7 @@
 // DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%}
 
 // If CUDA, test "new" again for sm_75/generic
-// RUN:  %if any-device-is-cuda %{ %if preview-breaking-changes-supported %{  %clangxx -fsycl -fpreview-breaking-changes -fsycl-targets=%{sycl_triple}  -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75  %s -o %t3.out %{mathflags} %} %}
+// RUN:  %if any-device-is-cuda %{ %if preview-breaking-changes-supported %{  %clangxx -fsycl -fpreview-breaking-changes %{sycl_target_opts}  -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75  %s -o %t3.out %{mathflags} %} %}
 // RUN:  %if any-device-is-cuda %{ %if preview-breaking-changes-supported %{  %{run} %t3.out  %} %}
 
 #include "bfloat16_builtins.hpp"
diff --git a/sycl/test-e2e/Basic/built-ins.cpp b/sycl/test-e2e/Basic/built-ins.cpp
index e10cf7ba8a08a..5967e7837d505 100644
--- a/sycl/test-e2e/Basic/built-ins.cpp
+++ b/sycl/test-e2e/Basic/built-ins.cpp
@@ -5,7 +5,7 @@
 // RUN: %{run} %t_var.out | FileCheck %s
 
 // Hits an assertion and kernel page fault with AMD:
-// UNSUPPORTED: hip_amd
+// UNSUPPORTED: hip
 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/14404
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/Basic/host-task-dependency.cpp b/sycl/test-e2e/Basic/host-task-dependency.cpp
index 817b6c46b087b..3b015051377c9 100644
--- a/sycl/test-e2e/Basic/host-task-dependency.cpp
+++ b/sycl/test-e2e/Basic/host-task-dependency.cpp
@@ -2,8 +2,7 @@
 // RUN: env SYCL_UR_TRACE=2 %{run} %t.out 2>&1 | FileCheck %s
 //
 // TODO: Behaviour is unstable for level zero on Windows. Enable when fixed.
-// TODO: The test is sporadically fails on CUDA. Enable when fixed.
-// UNSUPPORTED: (windows && level_zero) || hip_nvidia
+// UNSUPPORTED: (windows && level_zero)
 
 #define SYCL2020_DISABLE_DEPRECATION_WARNINGS
 
diff --git a/sycl/test-e2e/Basic/max_linear_work_group_size_props.cpp b/sycl/test-e2e/Basic/max_linear_work_group_size_props.cpp
index 7e30406189e29..afe3ebd0d2557 100644
--- a/sycl/test-e2e/Basic/max_linear_work_group_size_props.cpp
+++ b/sycl/test-e2e/Basic/max_linear_work_group_size_props.cpp
@@ -58,17 +58,15 @@ template <size_t I> struct KernelFunctorWithMaxWGSizeProp {
   }
 };
 
-template <Variant KernelVariant, size_t I, typename PropertiesT,
-          typename KernelType>
-int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
+template <Variant KernelVariant, size_t I, typename KernelType>
+int test(queue &Q, KernelType KernelFunc) {
   constexpr size_t Dims = 1;
 
   // Positive test case: Specify local size that matches required size.
   try {
     Q.submit([&](handler &CGH) {
       CGH.parallel_for<MaxLinearWGSizePositive<KernelVariant, false, I>>(
-          nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(I)), Props,
-          KernelFunc);
+          nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(I)), KernelFunc);
     });
     Q.wait_and_throw();
   } catch (exception &E) {
@@ -81,8 +79,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
   // Same as above but using the queue shortcuts.
   try {
     Q.parallel_for<MaxLinearWGSizePositive<KernelVariant, true, I>>(
-        nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(I)), Props,
-        KernelFunc);
+        nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(I)), KernelFunc);
     Q.wait_and_throw();
   } catch (exception &E) {
     std::cerr
@@ -97,7 +94,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
   try {
     Q.submit([&](handler &CGH) {
       CGH.parallel_for<MaxLinearWGSizeNoLocalPositive<KernelVariant, false, I>>(
-          repeatRange<Dims>(16), Props, KernelFunc);
+          repeatRange<Dims>(16), KernelFunc);
     });
     Q.wait_and_throw();
   } catch (exception &E) {
@@ -109,7 +106,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
 
   try {
     Q.parallel_for<MaxLinearWGSizeNoLocalPositive<KernelVariant, true, I>>(
-        repeatRange<Dims>(16), Props, KernelFunc);
+        repeatRange<Dims>(16), KernelFunc);
     Q.wait_and_throw();
   } catch (exception &E) {
     std::cerr << "Test case MaxLinearWGSizeNoLocalPositive shortcut failed: "
@@ -122,7 +119,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
   try {
     Q.submit([&](handler &CGH) {
       CGH.parallel_for<MaxLinearWGSizeNegative<KernelVariant, false, I>>(
-          nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)), Props,
+          nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)),
           KernelFunc);
     });
     Q.wait_and_throw();
@@ -147,7 +144,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
   // Same as above but using the queue shortcuts.
   try {
     Q.parallel_for<MaxLinearWGSizeNegative<KernelVariant, true, I>>(
-        nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)), Props,
+        nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)),
         KernelFunc);
     Q.wait_and_throw();
     std::cerr
@@ -174,17 +171,10 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
 }
 
 template <size_t I> int test_max(queue &Q) {
-  auto Props = ext::oneapi::experimental::properties{
-      ext::oneapi::experimental::max_linear_work_group_size<I>};
-  auto KernelFunction = [](auto) {};
-
-  auto EmptyProps = ext::oneapi::experimental::properties{};
   KernelFunctorWithMaxWGSizeProp<I> KernelFunctor;
 
   int Res = 0;
-  Res += test<Variant::Function, I>(Q, Props, KernelFunction);
-  Res += test<Variant::Functor, I>(Q, EmptyProps, KernelFunctor);
-  Res += test<Variant::FunctorAndProperty, I>(Q, Props, KernelFunctor);
+  Res += test<Variant::Functor, I>(Q, KernelFunctor);
   return Res;
 }
 
diff --git a/sycl/test-e2e/Basic/max_work_group_size_props.cpp b/sycl/test-e2e/Basic/max_work_group_size_props.cpp
index 6376aa0a10392..6694cb1d35d3f 100644
--- a/sycl/test-e2e/Basic/max_work_group_size_props.cpp
+++ b/sycl/test-e2e/Basic/max_work_group_size_props.cpp
@@ -49,17 +49,15 @@ template <size_t... Is> struct KernelFunctorWithMaxWGSizeProp {
   }
 };
 
-template <Variant KernelVariant, size_t... Is, typename PropertiesT,
-          typename KernelType>
-int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
+template <Variant KernelVariant, size_t... Is, typename KernelType>
+int test(queue &Q, KernelType KernelFunc) {
   constexpr size_t Dims = sizeof...(Is);
 
   // Positive test case: Specify local size that matches required size.
   try {
     Q.submit([&](handler &CGH) {
       CGH.parallel_for<MaxWGSizePositive<KernelVariant, false, Is...>>(
-          nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(Is...)), Props,
-          KernelFunc);
+          nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(Is...)), KernelFunc);
     });
     Q.wait_and_throw();
   } catch (exception &E) {
@@ -71,8 +69,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
   // Same as above but using the queue shortcuts.
   try {
     Q.parallel_for<MaxWGSizePositive<KernelVariant, true, Is...>>(
-        nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(Is...)), Props,
-        KernelFunc);
+        nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(Is...)), KernelFunc);
     Q.wait_and_throw();
   } catch (exception &E) {
     std::cerr << "Test case MaxWGSizePositive shortcut failed: unexpected "
@@ -86,7 +83,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
   try {
     Q.submit([&](handler &CGH) {
       CGH.parallel_for<MaxWGSizeNoLocalPositive<KernelVariant, false, Is...>>(
-          repeatRange<Dims>(16), Props, KernelFunc);
+          repeatRange<Dims>(16), KernelFunc);
     });
     Q.wait_and_throw();
   } catch (exception &E) {
@@ -98,7 +95,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
 
   try {
     Q.parallel_for<MaxWGSizeNoLocalPositive<KernelVariant, true, Is...>>(
-        repeatRange<Dims>(16), Props, KernelFunc);
+        repeatRange<Dims>(16), KernelFunc);
     Q.wait_and_throw();
   } catch (exception &E) {
     std::cerr << "Test case MaxWGSizeNoLocalPositive shortcut failed: "
@@ -111,7 +108,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
   try {
     Q.submit([&](handler &CGH) {
       CGH.parallel_for<MaxWGSizeNegative<KernelVariant, false, Is...>>(
-          nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)), Props,
+          nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)),
           KernelFunc);
     });
     Q.wait_and_throw();
@@ -134,7 +131,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
   // Same as above but using the queue shortcuts.
   try {
     Q.parallel_for<MaxWGSizeNegative<KernelVariant, true, Is...>>(
-        nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)), Props,
+        nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)),
         KernelFunc);
     Q.wait_and_throw();
     std::cerr << "Test case MaxWGSizeNegative shortcut failed: no exception "
@@ -159,17 +156,10 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
 }
 
 template <size_t... Is> int test_max(queue &Q) {
-  auto Props = ext::oneapi::experimental::properties{
-      ext::oneapi::experimental::max_work_group_size<Is...>};
-  auto KernelFunction = [](auto) {};
-
-  auto EmptyProps = ext::oneapi::experimental::properties{};
   KernelFunctorWithMaxWGSizeProp<Is...> KernelFunctor;
 
   int Res = 0;
-  Res += test<Variant::Function, Is...>(Q, Props, KernelFunction);
-  Res += test<Variant::Functor, Is...>(Q, EmptyProps, KernelFunctor);
-  Res += test<Variant::FunctorAndProperty, Is...>(Q, Props, KernelFunctor);
+  Res += test<Variant::Functor, Is...>(Q, KernelFunctor);
   return Res;
 }
 
diff --git a/sycl/test-e2e/Basic/multisource.cpp b/sycl/test-e2e/Basic/multisource.cpp
index 23c95ce2eddd3..db7b4dbf39e74 100644
--- a/sycl/test-e2e/Basic/multisource.cpp
+++ b/sycl/test-e2e/Basic/multisource.cpp
@@ -9,14 +9,14 @@
 // Separate kernel sources and host code sources
 // RUN: %{build} -c -o %t.kernel.o -DINIT_KERNEL -DCALC_KERNEL
 // RUN: %{build} -c -o %t.main.o -DMAIN_APP
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.kernel.o %t.main.o -Wno-unused-command-line-argument -o %t1.fat
+// RUN: %clangxx -fsycl %{sycl_target_opts} %t.kernel.o %t.main.o -Wno-unused-command-line-argument -o %t1.fat
 // RUN: %{run} %t1.fat
 
 // Multiple sources with kernel code
 // RUN: %{build} -c -o %t.init.o -DINIT_KERNEL
 // RUN: %{build} -c -o %t.calc.o -DCALC_KERNEL
 // RUN: %{build} -c -o %t.main.o -DMAIN_APP
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -Wno-unused-command-line-argument -o %t2.fat
+// RUN: %clangxx -fsycl %{sycl_target_opts} %t.init.o %t.calc.o %t.main.o -Wno-unused-command-line-argument -o %t2.fat
 // RUN: %{run} %t2.fat
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/Basic/multisource_spv_obj.cpp b/sycl/test-e2e/Basic/multisource_spv_obj.cpp
index 0f097ce3cd5db..25ff92eda2c77 100644
--- a/sycl/test-e2e/Basic/multisource_spv_obj.cpp
+++ b/sycl/test-e2e/Basic/multisource_spv_obj.cpp
@@ -11,21 +11,21 @@
 // Separate kernel sources and host code sources
 // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.kernel.o -DINIT_KERNEL -DCALC_KERNEL
 // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.main.o -DMAIN_APP
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.kernel.o %t.main.o -Wno-unused-command-line-argument -o %t1.fat
+// RUN: %clangxx -fsycl %{sycl_target_opts} %t.kernel.o %t.main.o -Wno-unused-command-line-argument -o %t1.fat
 // RUN: %{run} %t1.fat
 
 // Multiple sources with kernel code
 // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.init.o -DINIT_KERNEL
 // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.calc.o -DCALC_KERNEL
 // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.main.o -DMAIN_APP
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -Wno-unused-command-line-argument -o %t2.fat
+// RUN: %clangxx -fsycl %{sycl_target_opts} %t.init.o %t.calc.o %t.main.o -Wno-unused-command-line-argument -o %t2.fat
 // RUN: %{run} %t2.fat
 
 // Multiple sources with kernel code, mixed SPIR-V and LLVM-IR objects
 // RUN: %{build} -fsycl-device-obj=spirv -c -o %t.init.o -DINIT_KERNEL
 // RUN: %{build} -fsycl-device-obj=llvmir -c -o %t.calc.o -DCALC_KERNEL
 // RUN: %{build} -c -o %t.main.o -DMAIN_APP
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.init.o %t.calc.o %t.main.o -Wno-unused-command-line-argument -o %t3.fat
+// RUN: %clangxx -fsycl %{sycl_target_opts} %t.init.o %t.calc.o %t.main.o -Wno-unused-command-line-argument -o %t3.fat
 // RUN: %{run} %t3.fat
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/Basic/work_group_size_prop.cpp b/sycl/test-e2e/Basic/work_group_size_prop.cpp
index ac8400dcc31b8..9cf04c4d2ea66 100644
--- a/sycl/test-e2e/Basic/work_group_size_prop.cpp
+++ b/sycl/test-e2e/Basic/work_group_size_prop.cpp
@@ -45,9 +45,8 @@ template <size_t... Is> struct KernelFunctorWithWGSizeProp {
   }
 };
 
-template <Variant KernelVariant, size_t... Is, typename PropertiesT,
-          typename KernelType>
-int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
+template <Variant KernelVariant, size_t... Is, typename KernelType>
+int test(queue &Q, KernelType KernelFunc) {
   constexpr size_t Dims = sizeof...(Is);
 
   bool IsOpenCL = (Q.get_backend() == backend::opencl);
@@ -56,8 +55,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
   try {
     Q.submit([&](handler &CGH) {
       CGH.parallel_for<ReqdWGSizePositiveA<KernelVariant, false, Is...>>(
-          nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(Is...)), Props,
-          KernelFunc);
+          nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(Is...)), KernelFunc);
     });
     Q.wait_and_throw();
   } catch (exception &E) {
@@ -69,8 +67,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
   // Same as above but using the queue shortcuts.
   try {
     Q.parallel_for<ReqdWGSizePositiveA<KernelVariant, true, Is...>>(
-        nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(Is...)), Props,
-        KernelFunc);
+        nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(Is...)), KernelFunc);
     Q.wait_and_throw();
   } catch (exception &E) {
     std::cerr << "Test case ReqdWGSizePositiveA shortcut failed: unexpected "
@@ -87,7 +84,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
       Q.submit([&](handler &CGH) {
         CGH.parallel_for<
             ReqdWGSizeNoLocalPositive<KernelVariant, false, Is...>>(
-            repeatRange<Dims>(16), Props, KernelFunc);
+            repeatRange<Dims>(16), KernelFunc);
       });
       Q.wait_and_throw();
     } catch (exception &E) {
@@ -99,7 +96,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
 
     try {
       Q.parallel_for<ReqdWGSizeNoLocalPositive<KernelVariant, true, Is...>>(
-          repeatRange<Dims>(16), Props, KernelFunc);
+          repeatRange<Dims>(16), KernelFunc);
       Q.wait_and_throw();
     } catch (exception &E) {
       std::cerr << "Test case ReqdWGSizeNoLocalPositive shortcut failed: "
@@ -113,7 +110,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
   try {
     Q.submit([&](handler &CGH) {
       CGH.parallel_for<ReqdWGSizeNegativeA<KernelVariant, false, Is...>>(
-          nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)), Props,
+          nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)),
           KernelFunc);
     });
     Q.wait_and_throw();
@@ -137,7 +134,7 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
   // Same as above but using the queue shortcuts.
   try {
     Q.parallel_for<ReqdWGSizeNegativeA<KernelVariant, true, Is...>>(
-        nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)), Props,
+        nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)),
         KernelFunc);
     Q.wait_and_throw();
     std::cerr << "Test case ReqdWGSizeNegativeA shortcut failed: no exception "
@@ -162,17 +159,10 @@ int test(queue &Q, PropertiesT Props, KernelType KernelFunc) {
 }
 
 template <size_t... Is> int test(queue &Q) {
-  auto Props = ext::oneapi::experimental::properties{
-      ext::oneapi::experimental::work_group_size<Is...>};
-  auto KernelFunction = [](auto) {};
-
-  auto EmptyProps = ext::oneapi::experimental::properties{};
   KernelFunctorWithWGSizeProp<Is...> KernelFunctor;
 
   int Res = 0;
-  Res += test<Variant::Function, Is...>(Q, Props, KernelFunction);
-  Res += test<Variant::Functor, Is...>(Q, EmptyProps, KernelFunctor);
-  Res += test<Variant::FunctorAndProperty, Is...>(Q, Props, KernelFunctor);
+  Res += test<Variant::Functor, Is...>(Q, KernelFunctor);
   return Res;
 }
 
diff --git a/sycl/test-e2e/DeviceArchitecture/device_architecture_comparison_on_device_aot.cpp b/sycl/test-e2e/DeviceArchitecture/device_architecture_comparison_on_device_aot.cpp
index 88f55d00aa903..6076834aac650 100644
--- a/sycl/test-e2e/DeviceArchitecture/device_architecture_comparison_on_device_aot.cpp
+++ b/sycl/test-e2e/DeviceArchitecture/device_architecture_comparison_on_device_aot.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: arch-intel_gpu_pvc, ocloc
 
-// XFAIL: arch-intel_gpu_pvc
+// XFAIL: arch-intel_gpu_pvc && opencl && igc-dev
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/16401
 
 // RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc %s -o %t.out
diff --git a/sycl/test-e2e/DeviceCodeSplit/grf.cpp b/sycl/test-e2e/DeviceCodeSplit/grf.cpp
index 1e5b085d207d6..3642483e52566 100644
--- a/sycl/test-e2e/DeviceCodeSplit/grf.cpp
+++ b/sycl/test-e2e/DeviceCodeSplit/grf.cpp
@@ -14,7 +14,7 @@
 //   compiler option
 
 // REQUIRES: arch-intel_gpu_pvc
-// XFAIL: arch-intel_gpu_pvc
+// XFAIL: arch-intel_gpu_pvc && opencl
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/16401
 
 // RUN: %{build} -Wno-error=deprecated-declarations -o %t1.out
diff --git a/sycl/test-e2e/DeviceGlobal/device_global_static.cpp b/sycl/test-e2e/DeviceGlobal/device_global_static.cpp
index 363c716b9d98a..75c7fc165016d 100644
--- a/sycl/test-e2e/DeviceGlobal/device_global_static.cpp
+++ b/sycl/test-e2e/DeviceGlobal/device_global_static.cpp
@@ -4,7 +4,7 @@
 // UNSUPPORTED: opencl && gpu
 // UNSUPPORTED-TRACKER: GSD-4287
 //
-// UNSUPPORTED: hip_amd
+// UNSUPPORTED: hip
 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15329
 //
 // Tests static device_global access through device kernels.
diff --git a/sycl/test-e2e/DeviceImageDependencies/NewOffloadDriver/free_function_kernels.cpp b/sycl/test-e2e/DeviceImageDependencies/NewOffloadDriver/free_function_kernels.cpp
index 74758a837cd46..f0de42b7f13f9 100644
--- a/sycl/test-e2e/DeviceImageDependencies/NewOffloadDriver/free_function_kernels.cpp
+++ b/sycl/test-e2e/DeviceImageDependencies/NewOffloadDriver/free_function_kernels.cpp
@@ -8,7 +8,7 @@
 // UNSUPPORTED: cuda
 // UNSUPPORTED-INTENDED: Not implemented yet for Nvidia/AMD backends.
 
-// XFAIL: hip_amd
+// XFAIL: hip
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/15742
 
 #include <iostream>
diff --git a/sycl/test-e2e/DeviceImageDependencies/dynamic.cpp b/sycl/test-e2e/DeviceImageDependencies/dynamic.cpp
index 5952e4e418935..1bdaf3b1d6270 100644
--- a/sycl/test-e2e/DeviceImageDependencies/dynamic.cpp
+++ b/sycl/test-e2e/DeviceImageDependencies/dynamic.cpp
@@ -10,7 +10,7 @@
 // RUN: %clangxx %{dynamic_lib_options} %S/Inputs/b.cpp %if windows %{%T/libdevice_c.lib%} -o %T/libdevice_b.%{dynamic_lib_suffix}
 // RUN: %clangxx %{dynamic_lib_options} %S/Inputs/a.cpp %if windows %{%T/libdevice_b.lib%} -o %T/libdevice_a.%{dynamic_lib_suffix}
 
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} -fsycl-allow-device-image-dependencies -fsycl-device-code-split=per_kernel %S/Inputs/basic.cpp -o %t.out            \
+// RUN: %clangxx -fsycl %{sycl_target_opts} -fsycl-allow-device-image-dependencies -fsycl-device-code-split=per_kernel %S/Inputs/basic.cpp -o %t.out            \
 // RUN: %if windows                                                                       \
 // RUN:   %{%T/libdevice_a.lib%}                                                          \
 // RUN: %else                                                                             \
diff --git a/sycl/test-e2e/DeviceImageDependencies/free_function_kernels.cpp b/sycl/test-e2e/DeviceImageDependencies/free_function_kernels.cpp
index 5c50f8430ad78..40862c5dc6ad9 100644
--- a/sycl/test-e2e/DeviceImageDependencies/free_function_kernels.cpp
+++ b/sycl/test-e2e/DeviceImageDependencies/free_function_kernels.cpp
@@ -7,7 +7,7 @@
 // The name mangling for free function kernels currently does not work with PTX.
 // UNSUPPORTED: cuda
 
-// XFAIL: hip_amd
+// XFAIL: hip
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/15742
 
 #include <iostream>
diff --git a/sycl/test-e2e/DeviceImageDependencies/objects.cpp b/sycl/test-e2e/DeviceImageDependencies/objects.cpp
index 17409b209781c..eea085dc9b905 100644
--- a/sycl/test-e2e/DeviceImageDependencies/objects.cpp
+++ b/sycl/test-e2e/DeviceImageDependencies/objects.cpp
@@ -6,5 +6,5 @@
 // RUN: %clangxx -fsycl %S/Inputs/b.cpp -I %S/Inputs -c -o %t_b.o
 // RUN: %clangxx -fsycl %S/Inputs/c.cpp -I %S/Inputs -c -o %t_c.o
 // RUN: %clangxx -fsycl %S/Inputs/d.cpp -I %S/Inputs -c -o %t_d.o
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} -fsycl-device-code-split=per_kernel -fsycl-allow-device-image-dependencies %t_a.o %t_b.o %t_c.o %t_d.o %S/Inputs/basic.cpp -o %t.out
+// RUN: %clangxx -fsycl %{sycl_target_opts} -fsycl-device-code-split=per_kernel -fsycl-allow-device-image-dependencies %t_a.o %t_b.o %t_c.o %t_d.o %S/Inputs/basic.cpp -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/ESIMD/named_barriers/loop_extended.cpp b/sycl/test-e2e/ESIMD/named_barriers/loop_extended.cpp
index b42c58181ca4c..285c62c185e12 100644
--- a/sycl/test-e2e/ESIMD/named_barriers/loop_extended.cpp
+++ b/sycl/test-e2e/ESIMD/named_barriers/loop_extended.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // REQUIRES: arch-intel_gpu_pvc
+// UNSUPPORTED: arch-intel_gpu_pvc
+// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16598
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/Graph/Inputs/work_group_size_prop.cpp b/sycl/test-e2e/Graph/Inputs/work_group_size_prop.cpp
index e052ab5acb3bf..7fd3d8eef1856 100644
--- a/sycl/test-e2e/Graph/Inputs/work_group_size_prop.cpp
+++ b/sycl/test-e2e/Graph/Inputs/work_group_size_prop.cpp
@@ -40,9 +40,8 @@ template <size_t... Is> struct KernelFunctorWithWGSizeProp {
   }
 };
 
-template <Variant KernelVariant, size_t... Is, typename PropertiesT,
-          typename KernelType>
-int test(queue &Queue, PropertiesT Props, KernelType KernelFunc) {
+template <Variant KernelVariant, size_t... Is, typename KernelType>
+int test(queue &Queue, KernelType KernelFunc) {
   constexpr size_t Dims = sizeof...(Is);
 
   // Positive test case: Specify local size that matches required size.
@@ -52,15 +51,13 @@ int test(queue &Queue, PropertiesT Props, KernelType KernelFunc) {
 
     add_node(Graph, Queue, [&](handler &CGH) {
       CGH.parallel_for<ReqdWGSizePositiveA<KernelVariant, false, Is...>>(
-          nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(Is...)), Props,
-          KernelFunc);
+          nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(Is...)), KernelFunc);
     });
 
 #ifdef GRAPH_E2E_RECORD_REPLAY
     Graph.begin_recording(Queue);
     Queue.parallel_for<ReqdWGSizePositiveA<KernelVariant, true, Is...>>(
-        nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(Is...)), Props,
-        KernelFunc);
+        nd_range<Dims>(repeatRange<Dims>(8), range<Dims>(Is...)), KernelFunc);
     Graph.end_recording(Queue);
 #endif
 
@@ -83,7 +80,7 @@ int test(queue &Queue, PropertiesT Props, KernelType KernelFunc) {
   try {
     add_node(GraphN, Queue, [&](handler &CGH) {
       CGH.parallel_for<ReqdWGSizeNegativeA<KernelVariant, false, Is...>>(
-          nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)), Props,
+          nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)),
           KernelFunc);
     });
     auto ExecGraph = GraphN.finalize();
@@ -119,7 +116,7 @@ int test(queue &Queue, PropertiesT Props, KernelType KernelFunc) {
     GraphN.begin_recording(Queue);
 
     Queue.parallel_for<ReqdWGSizeNegativeA<KernelVariant, true, Is...>>(
-        nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)), Props,
+        nd_range<Dims>(repeatRange<Dims>(16), repeatRange<Dims>(8)),
         KernelFunc);
 
     GraphN.end_recording(Queue);
@@ -156,17 +153,10 @@ int test(queue &Queue, PropertiesT Props, KernelType KernelFunc) {
 }
 
 template <size_t... Is> int test(queue &Queue) {
-  auto Props = sycl::ext::oneapi::experimental::properties{
-      sycl::ext::oneapi::experimental::work_group_size<Is...>};
-  auto KernelFunction = [](auto) {};
-
-  auto EmptyProps = sycl::ext::oneapi::experimental::properties{};
   KernelFunctorWithWGSizeProp<Is...> KernelFunctor;
 
   int Res = 0;
-  Res += test<Variant::Function, Is...>(Queue, Props, KernelFunction);
-  Res += test<Variant::Functor, Is...>(Queue, EmptyProps, KernelFunctor);
-  Res += test<Variant::FunctorAndProperty, Is...>(Queue, Props, KernelFunctor);
+  Res += test<Variant::Functor, Is...>(Queue, KernelFunctor);
   return Res;
 }
 
diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ordering.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ordering.cpp
index 4d6aa6445cd0e..194d098aec8c8 100644
--- a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ordering.cpp
+++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ordering.cpp
@@ -20,9 +20,11 @@ int main() {
 
   // Use a large N to try and make the kernel slow
   const size_t N = 1 << 16;
-  // Loop inside kernel to make even slower (too large N runs out of memory)
-  const size_t NumKernelLoops = 4;
-  const size_t NumSubmitLoops = 8;
+
+  // Reduce amount of work compared to version of test without free functions
+  // due to CMPLRLLVM-64841
+  const size_t NumKernelLoops = 1;
+  const size_t NumSubmitLoops = 1;
 
   exp_ext::command_graph Graph{Ctxt, Queue.get_device()};
 
diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_ordering.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ordering.cpp
index fdd5ffa52fe9b..5a9de103053eb 100644
--- a/sycl/test-e2e/Graph/Update/update_with_indices_ordering.cpp
+++ b/sycl/test-e2e/Graph/Update/update_with_indices_ordering.cpp
@@ -18,7 +18,7 @@ int main() {
   const size_t N = 1 << 16;
   // Loop inside kernel to make even slower (too large N runs out of memory)
   const size_t NumKernelLoops = 4;
-  const size_t NumSubmitLoops = 8;
+  const size_t NumSubmitLoops = 2;
 
   exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
 
diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
index 2e50634fd21c8..257b5a4e4457f 100644
--- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -2,7 +2,10 @@
 // XFAIL: (opencl && !cpu && !accelerator)
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/14641
 
-// RUN: %{build} -I . -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
+// TODO: Currently using the -Wno-deprecated-declarations flag due to issue
+// https://github.com/intel/llvm/issues/16451. Rewrite testRootGroup() and
+// remove the flag once the issue is resolved.
+// RUN: %{build} -I . -o %t.out -Wno-deprecated-declarations %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
 // RUN: %{run} %t.out
 
 // Disabled temporarily while investigation into the failure is ongoing.
@@ -42,9 +45,14 @@ void testQueriesAndProperties() {
           .ext_oneapi_get_info<sycl::ext::oneapi::experimental::info::
                                    kernel_queue_specific::max_num_work_groups>(
               q, wgRange, wgRange.size() * sizeof(int));
-  const auto props = sycl::ext::oneapi::experimental::properties{
-      sycl::ext::oneapi::experimental::use_root_sync};
-  q.single_task<class QueryKernel>(props, []() {});
+  struct TestKernel0 {
+    void operator()() const {}
+    auto get(sycl::ext::oneapi::experimental::properties_tag) {
+      return sycl::ext::oneapi::experimental::properties{
+          sycl::ext::oneapi::experimental::use_root_sync};
+    }
+  };
+  q.single_task<class QueryKernel>(TestKernel0{});
 
   static auto check_max_num_work_group_sync = [](auto Result) {
     static_assert(std::is_same_v<std::remove_cv_t<decltype(Result)>, size_t>,
@@ -99,6 +107,32 @@ void testRootGroup() {
   }
 }
 
+template <typename T> struct TestKernel2 {
+  T m_testResults;
+  TestKernel2(T &testResults_) : m_testResults(testResults_) {}
+  void operator()(sycl::nd_item<1> it) const {
+    const auto root = it.ext_oneapi_get_root_group();
+    if (root.leader() || root.get_local_id() == 3) {
+      m_testResults[0] = root.get_group_id() == sycl::id<1>(0);
+      m_testResults[1] = root.leader() ? root.get_local_id() == sycl::id<1>(0)
+                                       : root.get_local_id() == sycl::id<1>(3);
+      m_testResults[2] = root.get_group_range() == sycl::range<1>(1);
+      m_testResults[3] = root.get_local_range() == it.get_global_range();
+      m_testResults[4] = root.get_max_local_range() == root.get_local_range();
+      m_testResults[5] = root.get_group_linear_id() == 0;
+      m_testResults[6] =
+          root.get_local_linear_id() == root.get_local_id().get(0);
+      m_testResults[7] = root.get_group_linear_range() == 1;
+      m_testResults[8] =
+          root.get_local_linear_range() == root.get_local_range().size();
+    }
+  }
+  auto get(sycl::ext::oneapi::experimental::properties_tag) {
+    return sycl::ext::oneapi::experimental::properties{
+        sycl::ext::oneapi::experimental::use_root_sync};
+  }
+};
+
 void testRootGroupFunctions() {
   sycl::queue q;
   const auto bundle =
@@ -109,34 +143,13 @@ void testRootGroupFunctions() {
           .ext_oneapi_get_info<sycl::ext::oneapi::experimental::info::
                                    kernel_queue_specific::max_num_work_groups>(
               q, WorkGroupSize, 0);
-  const auto props = sycl::ext::oneapi::experimental::properties{
-      sycl::ext::oneapi::experimental::use_root_sync};
-
   constexpr int testCount = 9;
   sycl::buffer<bool> testResultsBuf{sycl::range{testCount}};
   const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
   q.submit([&](sycl::handler &h) {
     sycl::accessor testResults{testResultsBuf, h};
-    h.parallel_for<class RootGroupFunctionsKernel>(
-        range, props, [=](sycl::nd_item<1> it) {
-          const auto root = it.ext_oneapi_get_root_group();
-          if (root.leader() || root.get_local_id() == 3) {
-            testResults[0] = root.get_group_id() == sycl::id<1>(0);
-            testResults[1] = root.leader()
-                                 ? root.get_local_id() == sycl::id<1>(0)
-                                 : root.get_local_id() == sycl::id<1>(3);
-            testResults[2] = root.get_group_range() == sycl::range<1>(1);
-            testResults[3] = root.get_local_range() == it.get_global_range();
-            testResults[4] =
-                root.get_max_local_range() == root.get_local_range();
-            testResults[5] = root.get_group_linear_id() == 0;
-            testResults[6] =
-                root.get_local_linear_id() == root.get_local_id().get(0);
-            testResults[7] = root.get_group_linear_range() == 1;
-            testResults[8] =
-                root.get_local_linear_range() == root.get_local_range().size();
-          }
-        });
+    h.parallel_for<class RootGroupFunctionsKernel>(range,
+                                                   TestKernel2(testResults));
   });
   sycl::host_accessor testResults{testResultsBuf};
   for (int i = 0; i < testCount; i++) {
diff --git a/sycl/test-e2e/HierPar/hier_par_wgscope.cpp b/sycl/test-e2e/HierPar/hier_par_wgscope.cpp
index e950055641770..dc709664e53e0 100644
--- a/sycl/test-e2e/HierPar/hier_par_wgscope.cpp
+++ b/sycl/test-e2e/HierPar/hier_par_wgscope.cpp
@@ -3,7 +3,7 @@
 // RUN: %{run} %t.out
 //
 // Test hangs on AMD
-// UNSUPPORTED: hip_amd
+// UNSUPPORTED: hip
 
 //==- hier_par_wgscope.cpp --- hierarchical parallelism test for WG scope---==//
 //
diff --git a/sycl/test-e2e/InlineAsm/asm_16_empty.cpp b/sycl/test-e2e/InlineAsm/asm_16_empty.cpp
index a1d2ec9220763..0bc3d9624f749 100644
--- a/sycl/test-e2e/InlineAsm/asm_16_empty.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_16_empty.cpp
@@ -1,4 +1,4 @@
-// UNSUPPORTED: cuda || hip_nvidia
+// UNSUPPORTED: cuda
 // REQUIRES: gpu,linux,sg-16
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/InlineAsm/asm_8_empty.cpp b/sycl/test-e2e/InlineAsm/asm_8_empty.cpp
index 4a690b3088b51..a87704672680b 100644
--- a/sycl/test-e2e/InlineAsm/asm_8_empty.cpp
+++ b/sycl/test-e2e/InlineAsm/asm_8_empty.cpp
@@ -1,4 +1,4 @@
-// UNSUPPORTED: cuda || hip_nvidia
+// UNSUPPORTED: cuda
 // REQUIRES: gpu,linux,sg-8
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
index bd30efe1b217c..a0b4945c769b5 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
@@ -14,7 +14,7 @@
 
 // Waiting for the commit in IGC to be pulled into the driver to resolve the
 // test.
-// XFAIL: (!igc-dev || gpu-intel-dg2) && run-mode
+// XFAIL: gpu-intel-dg2 && run-mode
 // XFAIL-TRACKER: GSD-10510
 
 #include "common.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
index ff30d4c40f6a7..8643bc8286280 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
@@ -14,7 +14,7 @@
 
 // Waiting for the commit in IGC to be pulled into the driver to resolve the
 // test.
-// XFAIL: (!igc-dev || gpu-intel-dg2) && run-mode
+// XFAIL: gpu-intel-dg2 && run-mode
 // XFAIL-TRACKER: GSD-10510
 
 #include "common.hpp"
diff --git a/sycl/test-e2e/MemorySanitizer/check_buffer_host_ptr.cpp b/sycl/test-e2e/MemorySanitizer/check_buffer_host_ptr.cpp
deleted file mode 100644
index 4b287a8bb0063..0000000000000
--- a/sycl/test-e2e/MemorySanitizer/check_buffer_host_ptr.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// REQUIRES: linux, cpu || (gpu && level_zero)
-// RUN: %{build} %device_msan_flags -O1 -g -o %t2.out
-// RUN: %{run} not %t2.out 2>&1 | FileCheck %s
-// RUN: %{build} %device_msan_flags -O2 -g -o %t3.out
-// RUN: %{run} not %t3.out 2>&1 | FileCheck %s
-
-#include <sycl/detail/core.hpp>
-
-__attribute__((noinline)) long long foo(int data1, long long data2) {
-  return data1 + data2;
-}
-
-int main() {
-  sycl::queue q;
-  int data1[1];
-  long long data2[1];
-
-  {
-    sycl::buffer<int, 1> buf1(data1, sycl::range<1>(1));
-    sycl::buffer<long long, 1> buf2(data2, sycl::range<1>(1));
-    q.submit([&](sycl::handler &h) {
-       auto array1 = buf1.get_access<sycl::access::mode::read_write>(h);
-       auto array2 = buf2.get_access<sycl::access::mode::read_write>(h);
-       h.single_task<class MyKernel>(
-           [=]() { array1[0] = foo(array1[0], array2[0]); });
-     }).wait();
-    // CHECK: use-of-uninitialized-value
-    // CHECK: kernel <{{.*MyKernel}}>
-    // CHECK: #0 {{.*}} {{.*check_buffer_host_ptr.cpp}}:[[@LINE-4]]
-  }
-
-  return 0;
-}
diff --git a/sycl/test-e2e/MemorySanitizer/check_buffer_memset_memcpy.cpp b/sycl/test-e2e/MemorySanitizer/check_buffer_memset_memcpy.cpp
new file mode 100644
index 0000000000000..a1f676a1933ef
--- /dev/null
+++ b/sycl/test-e2e/MemorySanitizer/check_buffer_memset_memcpy.cpp
@@ -0,0 +1,64 @@
+// REQUIRES: linux, cpu || (gpu && level_zero)
+// RUN: %{build} %device_msan_flags -O0 -g -o %t1.out
+// RUN: %{run} %t1.out 2>&1 | FileCheck %s
+// RUN: %{build} %device_msan_flags -O2 -g -o %t2.out
+// RUN: %{run} %t2.out 2>&1 | FileCheck %s
+
+#include <sycl/detail/core.hpp>
+
+__attribute__((noinline)) int foo(int data1, int data2) {
+  return data1 + data2;
+}
+
+void check_memset(sycl::queue &q) {
+  std::cout << "check_memset" << std::endl;
+  sycl::buffer<int, 1> buf(sycl::range<1>(2));
+  const int Pattern = 0;
+
+  q.submit([&](sycl::handler &h) {
+     auto array = buf.get_access<sycl::access::mode::read_write>(h);
+     h.fill(array, Pattern);
+   }).wait();
+
+  q.submit([&](sycl::handler &h) {
+     auto array = buf.get_access<sycl::access::mode::read_write>(h);
+     h.single_task<class MyKernel1>(
+         [=]() { array[0] = foo(array[0], array[1]); });
+   }).wait();
+  std::cout << "PASS" << std::endl;
+  // CHECK-LABEL: check_memset
+  // CHECK-NOT: use-of-uninitialized-value
+  // CHECK: PASS
+}
+
+void check_memcpy(sycl::queue &q) {
+  std::cout << "check_memcpy" << std::endl;
+  int host[2] = {1, 2};
+  sycl::buffer<int, 1> buf1(sycl::range<1>(2));
+  sycl::buffer<int, 1> buf2(host, sycl::range<1>(2));
+
+  q.submit([&](sycl::handler &h) {
+     auto array1 = buf1.get_access<sycl::access::mode::read_write>(h);
+     auto array2 = buf2.get_access<sycl::access::mode::read_write>(h);
+     h.copy(array2, array1);
+   }).wait();
+
+  q.submit([&](sycl::handler &h) {
+     auto array = buf1.get_access<sycl::access::mode::read_write>(h);
+     h.single_task<class MyKernel2>(
+         [=]() { array[0] = foo(array[0], array[1]); });
+   }).wait();
+  std::cout << "PASS" << std::endl;
+  // CHECK-LABEL: check_memcpy
+  // CHECK-NOT: use-of-uninitialized-value
+  // CHECK: PASS
+}
+
+int main() {
+  sycl::queue q;
+
+  check_memset(q);
+  check_memcpy(q);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/MemorySanitizer/check_device_global.cpp b/sycl/test-e2e/MemorySanitizer/check_device_global.cpp
new file mode 100644
index 0000000000000..f8b47569deb9b
--- /dev/null
+++ b/sycl/test-e2e/MemorySanitizer/check_device_global.cpp
@@ -0,0 +1,58 @@
+// REQUIRES: linux, cpu || (gpu && level_zero)
+// RUN: %{build} %device_msan_flags -O0 -g -o %t1.out
+// RUN: %{run} not %t1.out 2>&1 | FileCheck %s
+// RUN: %{build} %device_msan_flags -O1 -g -o %t2.out
+// RUN: %{run} not %t2.out 2>&1 | FileCheck %s
+// RUN: %{build} %device_msan_flags -O2 -g -o %t3.out
+// RUN: %{run} not %t3.out 2>&1 | FileCheck %s
+
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/device_global/device_global.hpp>
+#include <sycl/usm.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi;
+using namespace sycl::ext::oneapi::experimental;
+
+sycl::ext::oneapi::experimental::device_global<
+    int[4], decltype(properties(device_image_scope, host_access_read_write))>
+    dev_global;
+
+__attribute__((noinline)) int check(int data) { return data + 1; }
+
+int main() {
+  sycl::queue Q;
+  int *array = sycl::malloc_device<int>(4, Q);
+
+  Q.submit([&](sycl::handler &h) {
+     h.single_task<class Test1>([=]() {
+       dev_global[0] = 42;
+       array[0] = check(dev_global[1]);
+       array[1] = dev_global[1];
+     });
+   }).wait();
+
+  int val[4];
+  Q.copy(dev_global, val).wait();
+  assert(val[0] == 42);
+
+  Q.submit([&](sycl::handler &h) {
+     h.single_task<class Test2>([=]() {
+       array[0] = check(array[1]);
+       dev_global[1] = array[2]; // uninitialized value
+     });
+   }).wait();
+
+  Q.submit([&](sycl::handler &h) {
+     h.single_task<class Test3>([=]() {
+       array[0] = dev_global[1];
+       check(array[0]);
+     });
+   }).wait();
+  // CHECK: use-of-uninitialized-value
+  // CHECK-NEXT: kernel <{{.*Test3}}>
+
+  sycl::free(array, Q);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/MemorySanitizer/lit.local.cfg b/sycl/test-e2e/MemorySanitizer/lit.local.cfg
index f9437ee4a9048..dcc385637d410 100644
--- a/sycl/test-e2e/MemorySanitizer/lit.local.cfg
+++ b/sycl/test-e2e/MemorySanitizer/lit.local.cfg
@@ -1,8 +1,10 @@
 # TRACKER: https://github.com/intel/llvm/issues/16184
-# TRACKER for PVC: https://github.com/intel/llvm/issues/16401
-#has_arch_gpu_intel_pvc = any('arch-intel_gpu_pvc' in T for T in config.sycl_dev_features.values())
-#if not has_arch_gpu_intel_pvc:
-config.unsupported_features += ['gpu']
+has_arch_gpu_intel_pvc = any('arch-intel_gpu_pvc' in T for T in config.sycl_dev_features.values())
+if not has_arch_gpu_intel_pvc:
+    config.unsupported_features += ['gpu']
+else:
+    # TRACKER for PVC + igc-dev: https://github.com/intel/llvm/issues/16401
+    config.unsupported_features += ['igc-dev']
 
 config.substitutions.append(
     ("%device_msan_flags", "-Xarch_device -fsanitize=memory")
@@ -10,3 +12,6 @@ config.substitutions.append(
 config.substitutions.append(
     ("%force_device_msan_rt", "env UR_ENABLE_LAYERS=UR_LAYER_MSAN")
 )
+
+if "-fsanitize=address" in config.cxx_flags:
+    config.unsupported=True
diff --git a/sycl/test-e2e/NewOffloadDriver/multisource.cpp b/sycl/test-e2e/NewOffloadDriver/multisource.cpp
index cf9f518c89995..0612b54bfc23b 100644
--- a/sycl/test-e2e/NewOffloadDriver/multisource.cpp
+++ b/sycl/test-e2e/NewOffloadDriver/multisource.cpp
@@ -11,7 +11,7 @@
 // Test with `--offload-new-driver`
 // RUN: %{build} --offload-new-driver -c -o %t.kernel.o -DINIT_KERNEL -DCALC_KERNEL
 // RUN: %{build} --offload-new-driver -c -o %t.main.o -DMAIN_APP
-// RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl -fsycl-targets=%{sycl_triple} --offload-new-driver %t.kernel.o %t.main.o -o %t1.fat
+// RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl %{sycl_target_opts} --offload-new-driver %t.kernel.o %t.main.o -o %t1.fat
 // RUN: %{run} %t1.fat
 
 // Multiple sources with kernel code
@@ -19,7 +19,7 @@
 // RUN: %{build} --offload-new-driver -c -o %t.init.o -DINIT_KERNEL
 // RUN: %{build} --offload-new-driver -c -o %t.calc.o -DCALC_KERNEL
 // RUN: %{build} --offload-new-driver -c -o %t.main.o -DMAIN_APP
-// RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl -fsycl-targets=%{sycl_triple} --offload-new-driver %t.init.o %t.calc.o %t.main.o -o %t2.fat
+// RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl %{sycl_target_opts} --offload-new-driver %t.init.o %t.calc.o %t.main.o -o %t2.fat
 // RUN: %{run} %t2.fat
 
 // Multiple sources with kernel code with old-style objects
@@ -27,7 +27,7 @@
 // RUN: %{build} --no-offload-new-driver -c -o %t.init.o -DINIT_KERNEL
 // RUN: %{build} --no-offload-new-driver -c -o %t.calc.o -DCALC_KERNEL
 // RUN: %{build} --no-offload-new-driver -c -o %t.main.o -DMAIN_APP
-// RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl -fsycl-targets=%{sycl_triple} --offload-new-driver %t.init.o %t.calc.o %t.main.o -o %t3.fat
+// RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl %{sycl_target_opts} --offload-new-driver %t.init.o %t.calc.o %t.main.o -o %t3.fat
 // RUN: %{run} %t3.fat
 
 // Multiple sources with kernel code with old-style objects in a static archive
@@ -36,7 +36,7 @@
 // RUN: %{build} --no-offload-new-driver -c -o %t.calc.o -DCALC_KERNEL
 // RUN: %{build} --no-offload-new-driver -c -o %t.main.o -DMAIN_APP
 // RUN: llvm-ar r %t.a %t.init.o %t.calc.o
-// RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl -fsycl-targets=%{sycl_triple} --offload-new-driver %t.main.o %t.a -o %t4.fat
+// RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl %{sycl_target_opts} --offload-new-driver %t.main.o %t.a -o %t4.fat
 // RUN: %{run} %t4.fat
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/NewOffloadDriver/sycl-external-with-optional-features.cpp b/sycl/test-e2e/NewOffloadDriver/sycl-external-with-optional-features.cpp
index 144466f673bba..b2659744d0338 100644
--- a/sycl/test-e2e/NewOffloadDriver/sycl-external-with-optional-features.cpp
+++ b/sycl/test-e2e/NewOffloadDriver/sycl-external-with-optional-features.cpp
@@ -1,7 +1,7 @@
 // Test with `--offload-new-driver`
 // RUN: %{build} -DSOURCE1 --offload-new-driver -c -o %t1.o
 // RUN: %{build} -DSOURCE2 --offload-new-driver -c -o %t2.o
-// RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl -fsycl-targets=%{sycl_triple} --offload-new-driver %t1.o %t2.o -o %t.exe
+// RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl %{sycl_target_opts} --offload-new-driver %t1.o %t2.o -o %t.exe
 // RUN: %{run} %t.exe
 // XFAIL: cuda
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/16413
diff --git a/sycl/test-e2e/OneapiDeviceSelector/illegal_input.cpp b/sycl/test-e2e/OneapiDeviceSelector/illegal_input.cpp
index 35430c7b12ff6..c929e9261623d 100644
--- a/sycl/test-e2e/OneapiDeviceSelector/illegal_input.cpp
+++ b/sycl/test-e2e/OneapiDeviceSelector/illegal_input.cpp
@@ -1,5 +1,5 @@
 
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %S/Inputs/trivial.cpp -o %t.out
+// RUN: %clangxx -fsycl %{sycl_target_opts} %S/Inputs/trivial.cpp -o %t.out
 // RUN: not --crash env ONEAPI_DEVICE_SELECTOR="macaroni:*" %{run-unfiltered-devices} %t.out
 // RUN: not --crash env ONEAPI_DEVICE_SELECTOR=":" %{run-unfiltered-devices} %t.out
 // RUN: not --crash env ONEAPI_DEVICE_SELECTOR="level_zero:." %{run-unfiltered-devices} %t.out
diff --git a/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp b/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp
index 0d80e37e7d9fc..4de91a66941aa 100644
--- a/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp
+++ b/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp
@@ -1,5 +1,5 @@
 // REQUIRES: level_zero, level_zero_dev_kit, cm-compiler
-// XFAIL: gpu && !(arch-intel_gpu_pvc && igc-dev)
+// XFAIL: gpu
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/16406
 // RUN: %{build} -Wno-error=deprecated-declarations -DRUN_KERNELS %level_zero_options -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp b/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp
index 360592289a969..b0023426f0631 100644
--- a/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp
+++ b/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp
@@ -1,5 +1,5 @@
 // REQUIRES: opencl, opencl_icd, cm-compiler
-// XFAIL: (gpu && !(arch-intel_gpu_pvc && igc-dev)) || cpu || accelerator
+// XFAIL: gpu || cpu || accelerator
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/16406
 // RUN: %{build} -Wno-error=deprecated-declarations -DRUN_KERNELS %opencl_lib -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_amdgcn.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_amdgcn.cpp
index e1616ec80dec0..c0a1cb07db1e1 100644
--- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_amdgcn.cpp
+++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_amdgcn.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: hip_amd, opencl, gpu, cpu
+// REQUIRES: hip, opencl, gpu, cpu
 // REQUIRES: build-and-run-mode
 
 // RUN: %clangxx -fsycl -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx906 -fsycl-targets=amdgcn-amd-amdhsa %S/Inputs/is_compatible_with_env.cpp -o %t.out
diff --git a/sycl/test-e2e/OptionalKernelFeatures/sycl-external-with-optional-features.cpp b/sycl/test-e2e/OptionalKernelFeatures/sycl-external-with-optional-features.cpp
index 5a04ea4ed55df..ec713c0ab2718 100644
--- a/sycl/test-e2e/OptionalKernelFeatures/sycl-external-with-optional-features.cpp
+++ b/sycl/test-e2e/OptionalKernelFeatures/sycl-external-with-optional-features.cpp
@@ -1,6 +1,6 @@
 // RUN: %{build} -DSOURCE1 -c -o %t1.o
 // RUN: %{build} -DSOURCE2 -c -o %t2.o
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t1.o %t2.o -Wno-unused-command-line-argument -o %t.exe
+// RUN: %clangxx -fsycl %{sycl_target_opts} %t1.o %t2.o -Wno-unused-command-line-argument -o %t.exe
 // RUN: %{run} %t.exe
 
 #ifdef SOURCE1
diff --git a/sycl/test-e2e/Printf/char.cpp b/sycl/test-e2e/Printf/char.cpp
index 550186eefea27..f409a5f8150d1 100644
--- a/sycl/test-e2e/Printf/char.cpp
+++ b/sycl/test-e2e/Printf/char.cpp
@@ -4,7 +4,7 @@
 // The test is written using conversion specifiers table from cppreference [1]
 // [1]: https://en.cppreference.com/w/cpp/io/c/fprintf
 //
-// UNSUPPORTED: hip_amd
+// UNSUPPORTED: hip
 //
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out | FileCheck %s
diff --git a/sycl/test-e2e/Printf/double.cpp b/sycl/test-e2e/Printf/double.cpp
index f7c0292fc7a21..ab756e6e83372 100644
--- a/sycl/test-e2e/Printf/double.cpp
+++ b/sycl/test-e2e/Printf/double.cpp
@@ -5,7 +5,7 @@
 // [1]: https://en.cppreference.com/w/cpp/io/c/fprintf
 //
 // REQUIRES: aspect-fp64
-// UNSUPPORTED: hip_amd
+// UNSUPPORTED: hip
 //
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out | FileCheck %s
diff --git a/sycl/test-e2e/Printf/float.cpp b/sycl/test-e2e/Printf/float.cpp
index 0643b06684860..070649f99d849 100644
--- a/sycl/test-e2e/Printf/float.cpp
+++ b/sycl/test-e2e/Printf/float.cpp
@@ -4,7 +4,7 @@
 // The test is written using conversion specifiers table from cppreference [1]
 // [1]: https://en.cppreference.com/w/cpp/io/c/fprintf
 //
-// UNSUPPORTED: hip_amd
+// UNSUPPORTED: hip
 //
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out | FileCheck %s
diff --git a/sycl/test-e2e/Printf/int.cpp b/sycl/test-e2e/Printf/int.cpp
index 17b3e212c5988..d87d35bbf5186 100644
--- a/sycl/test-e2e/Printf/int.cpp
+++ b/sycl/test-e2e/Printf/int.cpp
@@ -4,7 +4,7 @@
 // The test is written using conversion specifiers table from cppreference [1]
 // [1]: https://en.cppreference.com/w/cpp/io/c/fprintf
 //
-// UNSUPPORTED: hip_amd
+// UNSUPPORTED: hip
 // FIXME: The 'short' type gets overflown with sporadic values on CUDA.
 // XFAIL: cuda
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/14734
diff --git a/sycl/test-e2e/Printf/mixed-address-space.cpp b/sycl/test-e2e/Printf/mixed-address-space.cpp
index d79013007ca03..72d7c009569bd 100644
--- a/sycl/test-e2e/Printf/mixed-address-space.cpp
+++ b/sycl/test-e2e/Printf/mixed-address-space.cpp
@@ -1,7 +1,7 @@
 // This test is written with an aim to check that experimental::printf versions
 // for constant and generic address space can be used in the same module.
 //
-// UNSUPPORTED: hip_amd
+// UNSUPPORTED: hip
 // XFAIL: cuda && windows
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/14733
 // FIXME: Drop the test once generic AS support is considered stable and the
diff --git a/sycl/test-e2e/Printf/percent-symbol.cpp b/sycl/test-e2e/Printf/percent-symbol.cpp
index f08cd3e085d0d..ea3e1ea40a925 100644
--- a/sycl/test-e2e/Printf/percent-symbol.cpp
+++ b/sycl/test-e2e/Printf/percent-symbol.cpp
@@ -4,7 +4,7 @@
 // The test is written using conversion specifiers table from cppreference [1]
 // [1]: https://en.cppreference.com/w/cpp/io/c/fprintf
 //
-// UNSUPPORTED: hip_amd
+// UNSUPPORTED: hip
 // XFAIL: cuda && windows
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/14733
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/Properties/cache_config.cpp b/sycl/test-e2e/Properties/cache_config.cpp
index 0cda3e97a5d1f..666f85631c9b1 100644
--- a/sycl/test-e2e/Properties/cache_config.cpp
+++ b/sycl/test-e2e/Properties/cache_config.cpp
@@ -1,6 +1,9 @@
 // REQUIRES: gpu, level_zero
 
-// RUN: %{build} -o %t.out
+// TODO: Currently using the -Wno-deprecated-declarations flag due to issue
+// https://github.com/intel/llvm/issues/16320. Remove the flag once the issue is
+// resolved.
+// RUN: %{build} -o %t.out -Wno-deprecated-declarations
 // RUN: env UR_L0_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s
 
 #include <numeric>
@@ -36,6 +39,14 @@ struct NegativeKernelFunctor {
   auto get(properties_tag) const { return properties{}; }
 };
 
+struct RangeKernelFunctor {
+
+  RangeKernelFunctor() {}
+
+  void operator()(id<2> i) const {}
+  auto get(properties_tag) const { return properties{cache_config(large_slm)}; }
+};
+
 int main() {
   sycl::property_list q_prop{sycl::property::queue::in_order()};
   queue q{q_prop};
@@ -43,22 +54,10 @@ int main() {
   sycl::ext::oneapi::experimental::properties properties{
       cache_config(large_slm)};
 
-  // CHECK: single_task
-  // CHECK: ZE ---> zeKernelSetCacheConfig
-  std::cout << "single_task" << std::endl;
-  q.single_task(properties, [=]() {}).wait();
-
   // CHECK: parallel_for with sycl::range
   // CHECK: ZE ---> zeKernelSetCacheConfig
   std::cout << "parallel_for with sycl::range" << std::endl;
-  q.parallel_for(range<2>{16, 16}, properties, [=](id<2> i) {}).wait();
-
-  // CHECK: parallel_for with sycl::nd_range
-  // CHECK: ZE ---> zeKernelSetCacheConfig
-  std::cout << "parallel_for with sycl::nd_range" << std::endl;
-  q.parallel_for(nd_range<2>{range<2>(4, 4), range<2>(2, 2)}, properties,
-                 [=](nd_item<2> i) {})
-      .wait();
+  q.parallel_for(range<2>{16, 16}, RangeKernelFunctor{}).wait();
 
   // CHECK: parallel_for_work_group(range, func)
   // CHECK: ZE ---> zeKernelSetCacheConfig
diff --git a/sycl/test-e2e/README.md b/sycl/test-e2e/README.md
index 396aa9ef7341a..5d19795212305 100644
--- a/sycl/test-e2e/README.md
+++ b/sycl/test-e2e/README.md
@@ -67,7 +67,7 @@ is substituted with just `[Optional run_launcher if that is configured]`.
 Another little nuance is `%{sycl_triple}` substitution. It is constructed by
 concatenating triples for all the devices from `sycl_devices` supported by a
 given test. After that there is also a convenient `%{build}` substitution that
-is equivalent to `%clangxx -fsycl -fsycl-targets=%{sycl_triple} %s`.
+is equivalent to `%clangxx -fsycl %{sycl_target_opts} %s`.
 
 ## Prerequisites
 
@@ -184,12 +184,6 @@ at the full path specified by this variable.
 
 ***CUDA_LIBS_DIR*** - path to CUDA libraries.
 
-***HIP_PLATFORM*** - platform selection for HIP targeted devices.
-Defaults to AMD if no value is given. Supported values are:
-
-* **AMD**    - for HIP to target AMD GPUs
-* **NVIDIA** - for HIP to target NVIDIA GPUs
-
 ***AMD_ARCH*** - flag may be set for when using HIP AMD triple. For example it
 may be set to "gfx906". Otherwise must be provided via the ***amd_arch*** LIT
 parameter (e.g., ***--param amd_arch=gfx906***) at runtime via the command line
diff --git a/sycl/test-e2e/Regression/DAE-separate-compile.cpp b/sycl/test-e2e/Regression/DAE-separate-compile.cpp
index 64e19ec8f90ad..d787c288d64df 100644
--- a/sycl/test-e2e/Regression/DAE-separate-compile.cpp
+++ b/sycl/test-e2e/Regression/DAE-separate-compile.cpp
@@ -5,12 +5,11 @@
 // The test checks that the scenario works correctly.
 //
 // RUN: %{build} -O2 -c -o %t.o
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t.o %O0 -Wno-unused-command-line-argument -o %t.out
+// RUN: %clangxx -fsycl %{sycl_target_opts} %t.o %O0 -Wno-unused-command-line-argument -o %t.out
 // RUN: %{run} %t.out
 
 // Failing on HIP AMD, enable after fixed
-// UNSUPPORTED: hip_amd
-
+// UNSUPPORTED: hip
 
 #include <iostream>
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/Regression/commandlist/gpu.cpp b/sycl/test-e2e/Regression/commandlist/gpu.cpp
index 552f0f1109b1e..02db522a23943 100644
--- a/sycl/test-e2e/Regression/commandlist/gpu.cpp
+++ b/sycl/test-e2e/Regression/commandlist/gpu.cpp
@@ -1,4 +1,4 @@
 // REQUIRES: gpu
 
-// RUN: %clangxx -Wno-error=vla-cxx-extension -fsycl -fsycl-targets=%{sycl_triple} %S/Inputs/FindPrimesSYCL.cpp %S/Inputs/main.cpp -o %t.out %threads_lib
+// RUN: %clangxx -Wno-error=vla-cxx-extension -fsycl %{sycl_target_opts} %S/Inputs/FindPrimesSYCL.cpp %S/Inputs/main.cpp -o %t.out %threads_lib
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/Regression/local-arg-align.cpp b/sycl/test-e2e/Regression/local-arg-align.cpp
index 4eca3aeff7f84..3959570a3e44f 100644
--- a/sycl/test-e2e/Regression/local-arg-align.cpp
+++ b/sycl/test-e2e/Regression/local-arg-align.cpp
@@ -2,7 +2,7 @@
 //
 // RUN: %{run} %t.out
 
-// UNSUPPORTED: true
+// UNSUPPORTED: system-windows
 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/10682
 
 //==-- local-arg-align.cpp - Test for local argument alignmnent ------------==//
diff --git a/sycl/test-e2e/Regression/multiple-targets.cpp b/sycl/test-e2e/Regression/multiple-targets.cpp
index a2498c3301b99..aa8c125d90738 100644
--- a/sycl/test-e2e/Regression/multiple-targets.cpp
+++ b/sycl/test-e2e/Regression/multiple-targets.cpp
@@ -4,16 +4,16 @@
 //
 // REQUIRES: cuda || hip || native_cpu
 // REQUIRES: build-and-run-mode
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple},spir64 -o %t1.out %s
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple},spir64 %if any-device-is-hip %{ %{hip_arch_opts} %} -o %t1.out %s
 // RUN: %{run} %t1.out
 //
-// RUN: %clangxx -fsycl -fsycl-targets=spir64,%{sycl_triple} -o %t2.out %s
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,%{sycl_triple} %if any-device-is-hip %{ %{hip_arch_opts} %} -o %t2.out %s
 // RUN: %{run} %t2.out
 //
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple},spir64 -fsycl-device-code-split=per_kernel -o %t3.out %s
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple},spir64 %if any-device-is-hip %{ %{hip_arch_opts} %} -fsycl-device-code-split=per_kernel -o %t3.out %s
 // RUN: %{run} %t3.out
 //
-// RUN: %clangxx -fsycl -fsycl-targets=spir64,%{sycl_triple} -fsycl-device-code-split=per_kernel -o %t4.out %s
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,%{sycl_triple} %if any-device-is-hip %{ %{hip_arch_opts} %} -fsycl-device-code-split=per_kernel -o %t4.out %s
 // RUN: %{run} %t4.out
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/Regression/multithread_write_accessor.cpp b/sycl/test-e2e/Regression/multithread_write_accessor.cpp
index 87299ed3e4d5c..b1d927517079c 100644
--- a/sycl/test-e2e/Regression/multithread_write_accessor.cpp
+++ b/sycl/test-e2e/Regression/multithread_write_accessor.cpp
@@ -1,7 +1,7 @@
 // RUN: %{build} -o %t.out %threads_lib
 // RUN: %{run} %t.out
 
-// XFAIL: arch-intel_gpu_pvc
+// XFAIL: arch-intel_gpu_pvc && opencl
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/16401
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/Regression/static-buffer-dtor.cpp b/sycl/test-e2e/Regression/static-buffer-dtor.cpp
index 8ff9328d6535d..e84d3a062978b 100644
--- a/sycl/test-e2e/Regression/static-buffer-dtor.cpp
+++ b/sycl/test-e2e/Regression/static-buffer-dtor.cpp
@@ -13,7 +13,7 @@
 // RUN: %{run} %t.out
 
 // Failing on HIP AMD
-// UNSUPPORTED: hip_amd
+// UNSUPPORTED: hip
 
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/Sampler/normalized-clampedge-nearest.cpp b/sycl/test-e2e/Sampler/normalized-clampedge-nearest.cpp
index 6f349254dda55..b1f13e0b63140 100644
--- a/sycl/test-e2e/Sampler/normalized-clampedge-nearest.cpp
+++ b/sycl/test-e2e/Sampler/normalized-clampedge-nearest.cpp
@@ -4,7 +4,7 @@
 //
 // Missing __spirv_ImageWrite, __spirv_SampledImage,
 // __spirv_ImageSampleExplicitLod on AMD
-// XFAIL: hip_amd
+// XFAIL: hip
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/14732
 
 /*
diff --git a/sycl/test-e2e/SeparateCompile/same-kernel.cpp b/sycl/test-e2e/SeparateCompile/same-kernel.cpp
index 8bdadbe7ad62b..27b701a0b1550 100644
--- a/sycl/test-e2e/SeparateCompile/same-kernel.cpp
+++ b/sycl/test-e2e/SeparateCompile/same-kernel.cpp
@@ -12,7 +12,7 @@
 // RUN: %{build} -DB_CPP=1 -c -o %t-same-kernel-b.o
 //
 // >> ---- link the full hetero app
-// RUN: %clangxx %t-same-kernel-a.o %t-same-kernel-b.o -Wno-unused-command-line-argument -o %t-same-kernel.exe -fsycl -fsycl-targets=%{sycl_triple}
+// RUN: %clangxx -fsycl %{sycl_target_opts} %t-same-kernel-a.o %t-same-kernel-b.o -Wno-unused-command-line-argument -o %t-same-kernel.exe
 // RUN: %{run} %t-same-kernel.exe
 
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/SeparateCompile/sycl-external-within-staticlib.cpp b/sycl/test-e2e/SeparateCompile/sycl-external-within-staticlib.cpp
index 219634f47646c..d62becf4d5567 100644
--- a/sycl/test-e2e/SeparateCompile/sycl-external-within-staticlib.cpp
+++ b/sycl/test-e2e/SeparateCompile/sycl-external-within-staticlib.cpp
@@ -5,7 +5,7 @@
 // RUN: %{build} -O3 -DSOURCE3 -c -o %t3.o
 // RUN: rm -f %t.a
 // RUN: llvm-ar crv %t.a %t1.o %t2.o
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} -O3 %t3.o %t.a -Wno-unused-command-line-argument -o %t1.exe
+// RUN: %clangxx -fsycl %{sycl_target_opts} -O3 %t3.o %t.a -Wno-unused-command-line-argument -o %t1.exe
 // RUN: %{run} %t1.exe
 
 // Check the repacked case as it can behave differently.
@@ -13,7 +13,7 @@
 // RUN: echo addlib %t.a >> %t.txt
 // RUN: echo save >> %t.txt
 // RUN: cat %t.txt | llvm-ar -M
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} -O3 %t3.o %t_repacked.a -Wno-unused-command-line-argument -o %t2.exe
+// RUN: %clangxx -fsycl %{sycl_target_opts} -O3 %t3.o %t_repacked.a -Wno-unused-command-line-argument -o %t2.exe
 // RUN: %{run} %t2.exe
 
 #include <iostream>
diff --git a/sycl/test-e2e/SeparateCompile/sycl-external.cpp b/sycl/test-e2e/SeparateCompile/sycl-external.cpp
index 37facb7ecfc57..85e2c97d6512a 100644
--- a/sycl/test-e2e/SeparateCompile/sycl-external.cpp
+++ b/sycl/test-e2e/SeparateCompile/sycl-external.cpp
@@ -2,14 +2,14 @@
 // different object file.
 // RUN: %{build} -DSOURCE1 -c -o %t1.o
 // RUN: %{build} -DSOURCE2 -c -o %t2.o
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t1.o %t2.o -Wno-unused-command-line-argument -o %t1.exe
+// RUN: %clangxx -fsycl %{sycl_target_opts} %t1.o %t2.o -Wno-unused-command-line-argument -o %t1.exe
 // RUN: %{run} %t1.exe
 //
 // Test2 - check that kernel can call a SYCL_EXTERNAL function defined in a
 // static library.
 // RUN: rm -f %t.a
 // RUN: llvm-ar crv %t.a %t1.o
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %t2.o %t.a -Wno-unused-command-line-argument -o %t2.exe
+// RUN: %clangxx -fsycl %{sycl_target_opts} %t2.o %t.a -Wno-unused-command-line-argument -o %t2.exe
 // RUN: %{run} %t2.exe
 
 #include <iostream>
diff --git a/sycl/test-e2e/Tracing/usm/queue_copy_released_pointer.cpp b/sycl/test-e2e/Tracing/usm/queue_copy_released_pointer.cpp
index a37e5b8f4238c..86992e63e57fb 100644
--- a/sycl/test-e2e/Tracing/usm/queue_copy_released_pointer.cpp
+++ b/sycl/test-e2e/Tracing/usm/queue_copy_released_pointer.cpp
@@ -1,4 +1,4 @@
-// UNSUPPORTED: windows || hip_amd
+// UNSUPPORTED: windows || hip
 // RUN: %{build} -o %t.out
 // RUN: not --crash env SYCL_TRACE_TERMINATE_ON_WARNING=1 %{run} sycl-trace --verify %t.out | FileCheck %s
 
diff --git a/sycl/test-e2e/Tracing/usm/queue_single_task_nullptr.cpp b/sycl/test-e2e/Tracing/usm/queue_single_task_nullptr.cpp
index 4c4299dd93d8e..d7407bbeeab97 100644
--- a/sycl/test-e2e/Tracing/usm/queue_single_task_nullptr.cpp
+++ b/sycl/test-e2e/Tracing/usm/queue_single_task_nullptr.cpp
@@ -1,4 +1,4 @@
-// UNSUPPORTED: windows || hip_amd
+// UNSUPPORTED: windows || hip
 // RUN: %{build} -o %t.out
 // RUN: not --crash env SYCL_TRACE_TERMINATE_ON_WARNING=1 %{run} sycl-trace --verify %t.out | FileCheck %s
 
diff --git a/sycl/test-e2e/Tracing/usm/queue_single_task_released_pointer.cpp b/sycl/test-e2e/Tracing/usm/queue_single_task_released_pointer.cpp
index 61e27b7927f7b..4444ee1b7b903 100644
--- a/sycl/test-e2e/Tracing/usm/queue_single_task_released_pointer.cpp
+++ b/sycl/test-e2e/Tracing/usm/queue_single_task_released_pointer.cpp
@@ -1,4 +1,4 @@
-// UNSUPPORTED: windows || hip_amd
+// UNSUPPORTED: windows || hip
 // RUN: %{build} -o %t.out
 // RUN: not --crash env SYCL_TRACE_TERMINATE_ON_WARNING=1 %{run} sycl-trace --verify %t.out | FileCheck %s
 
diff --git a/sycl/test-e2e/USM/memadvise_flags.cpp b/sycl/test-e2e/USM/memadvise_flags.cpp
index 7de2a8a931f30..df2a1b31532ad 100644
--- a/sycl/test-e2e/USM/memadvise_flags.cpp
+++ b/sycl/test-e2e/USM/memadvise_flags.cpp
@@ -1,5 +1,5 @@
 // RUN: %{build} -o %t1.out
-// REQUIRES: cuda || hip_amd
+// REQUIRES: cuda || hip
 // RUN: %{run} %t1.out
 
 //==---------------- memadvise_flags.cpp -----------------------------------==//
diff --git a/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_shared.cpp b/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_shared.cpp
index 317447d645b67..90eaa14189ae1 100644
--- a/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_shared.cpp
+++ b/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_shared.cpp
@@ -13,7 +13,7 @@
 // Temporarily disabled until the failure is addressed.
 // UNSUPPORTED: (level_zero && windows)
 
-// UNSUPPORTED: (gpu-intel-dg2 || hip_amd) && linux
+// UNSUPPORTED: (gpu-intel-dg2 || hip) && linux
 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15648
 
 #include "copy2d_common.hpp"
diff --git a/sycl/test-e2e/USM/memops2d/copy2d_host_to_shared.cpp b/sycl/test-e2e/USM/memops2d/copy2d_host_to_shared.cpp
index 48bf7fe13abb6..fa39cfdcaa6f4 100644
--- a/sycl/test-e2e/USM/memops2d/copy2d_host_to_shared.cpp
+++ b/sycl/test-e2e/USM/memops2d/copy2d_host_to_shared.cpp
@@ -13,7 +13,7 @@
 // Temporarily disabled until the failure is addressed.
 // UNSUPPORTED: (level_zero && windows)
 
-// UNSUPPORTED: (gpu-intel-dg2 || hip_amd) && linux
+// UNSUPPORTED: (gpu-intel-dg2 || hip) && linux
 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15648
 
 #include "copy2d_common.hpp"
diff --git a/sycl/test-e2e/USM/memops2d/copy2d_shared_to_dhost.cpp b/sycl/test-e2e/USM/memops2d/copy2d_shared_to_dhost.cpp
index ab766f59c3d10..85498b3a9f993 100644
--- a/sycl/test-e2e/USM/memops2d/copy2d_shared_to_dhost.cpp
+++ b/sycl/test-e2e/USM/memops2d/copy2d_shared_to_dhost.cpp
@@ -13,7 +13,7 @@
 // Temporarily disabled until the failure is addressed.
 // UNSUPPORTED: (level_zero && windows)
 
-// UNSUPPORTED: (gpu-intel-dg2 || hip_amd) && linux
+// UNSUPPORTED: (gpu-intel-dg2 || hip) && linux
 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15648
 
 #include "copy2d_common.hpp"
diff --git a/sycl/test-e2e/USM/memops2d/copy2d_shared_to_host.cpp b/sycl/test-e2e/USM/memops2d/copy2d_shared_to_host.cpp
index de99f08d24096..26497825c4f51 100644
--- a/sycl/test-e2e/USM/memops2d/copy2d_shared_to_host.cpp
+++ b/sycl/test-e2e/USM/memops2d/copy2d_shared_to_host.cpp
@@ -13,7 +13,7 @@
 // Temporarily disabled until the failure is addressed.
 // UNSUPPORTED: (level_zero && windows)
 
-// UNSUPPORTED: (gpu-intel-dg2 || hip_amd) && linux
+// UNSUPPORTED: (gpu-intel-dg2 || hip) && linux
 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15648
 
 #include "copy2d_common.hpp"
diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_shared.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_shared.cpp
index f01317710c35d..ca57afcc1e206 100644
--- a/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_shared.cpp
+++ b/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_shared.cpp
@@ -13,7 +13,7 @@
 // Temporarily disabled until the failure is addressed.
 // UNSUPPORTED: (level_zero && windows)
 
-// UNSUPPORTED: (gpu-intel-dg2 || hip_amd) && linux
+// UNSUPPORTED: (gpu-intel-dg2 || hip) && linux
 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15648
 
 #include "memcpy2d_common.hpp"
diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_shared.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_shared.cpp
index 0418678424dfc..51916927f3bed 100644
--- a/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_shared.cpp
+++ b/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_shared.cpp
@@ -13,7 +13,7 @@
 // Temporarily disabled until the failure is addressed.
 // UNSUPPORTED: (level_zero && windows)
 
-// UNSUPPORTED: (gpu-intel-dg2 || hip_amd) && linux
+// UNSUPPORTED: (gpu-intel-dg2 || hip) && linux
 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15648
 
 #include "memcpy2d_common.hpp"
diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_dhost.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_dhost.cpp
index 7e2dcb8a9bd4d..279d62c51cb87 100644
--- a/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_dhost.cpp
+++ b/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_dhost.cpp
@@ -10,7 +10,7 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-// UNSUPPORTED: (gpu-intel-dg2 || hip_amd) && linux
+// UNSUPPORTED: (gpu-intel-dg2 || hip) && linux
 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15648
 
 // Temporarily disabled until the failure is addressed.
diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_host.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_host.cpp
index 645adac407f90..f30e80c39b4dd 100644
--- a/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_host.cpp
+++ b/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_host.cpp
@@ -13,7 +13,7 @@
 // Temporarily disabled until the failure is addressed.
 // UNSUPPORTED: (level_zero && windows)
 
-// UNSUPPORTED: (gpu-intel-dg2 || hip_amd) && linux
+// UNSUPPORTED: (gpu-intel-dg2 || hip) && linux
 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15648
 
 #include "memcpy2d_common.hpp"
diff --git a/sycl/test-e2e/USM/memory_coherency_hip.cpp b/sycl/test-e2e/USM/memory_coherency_hip.cpp
index a6cca6620deb7..e060b018025d7 100644
--- a/sycl/test-e2e/USM/memory_coherency_hip.cpp
+++ b/sycl/test-e2e/USM/memory_coherency_hip.cpp
@@ -1,5 +1,5 @@
 // RUN: %{build} -o %t1.out
-// REQUIRES: hip_amd
+// REQUIRES: hip
 // RUN: %{run} %t1.out
 
 //==---- memory_coherency_hip.cpp  -----------------------------------------==//
diff --git a/sycl/test-e2e/VirtualFunctions/2/1/1/missing-overrides.cpp b/sycl/test-e2e/VirtualFunctions/2/1/1/missing-overrides.cpp
index f198bc94f855f..b008effc626c0 100644
--- a/sycl/test-e2e/VirtualFunctions/2/1/1/missing-overrides.cpp
+++ b/sycl/test-e2e/VirtualFunctions/2/1/1/missing-overrides.cpp
@@ -66,6 +66,25 @@ void applyOp(int *DataPtr, Base *ObjPtr) {
   ObjPtr->multiply(DataPtr);
 }
 
+template <typename T1, typename T2> struct KernelFunctor {
+  T1 mStorageAcc;
+  T2 mDataAcc;
+  unsigned mTestCase;
+  KernelFunctor(T1 &StorageAcc, T2 &DataAcc, unsigned TestCase)
+      : mStorageAcc(StorageAcc), mDataAcc(DataAcc), mTestCase(TestCase) {}
+
+  void operator()() const {
+    auto *Ptr =
+        mStorageAcc[0].template construct</* ret type = */ Base>(mTestCase);
+    applyOp(
+        mDataAcc.template get_multi_ptr<sycl::access::decorated::no>().get(),
+        Ptr);
+  }
+  auto get(oneapi::properties_tag) const {
+    return oneapi::properties{oneapi::assume_indirect_calls};
+  }
+};
+
 int main() try {
   using storage_t = obj_storage_t<IncrementBy1, IncrementBy1AndSubstractBy2,
                                   MultiplyBy2, MultiplyBy2AndIncrementBy8,
@@ -80,7 +99,6 @@ int main() try {
 
   sycl::queue q(asyncHandler);
 
-  constexpr oneapi::properties props{oneapi::assume_indirect_calls};
   for (unsigned TestCase = 0; TestCase < 6; ++TestCase) {
     int HostData = 42;
     int Data = HostData;
@@ -89,11 +107,7 @@ int main() try {
     q.submit([&](sycl::handler &CGH) {
       sycl::accessor StorageAcc(DeviceStorage, CGH, sycl::write_only);
       sycl::accessor DataAcc(DataStorage, CGH, sycl::write_only);
-      CGH.single_task(props, [=]() {
-        auto *Ptr = StorageAcc[0].construct</* ret type = */ Base>(TestCase);
-        applyOp(DataAcc.get_multi_ptr<sycl::access::decorated::no>().get(),
-                Ptr);
-      });
+      CGH.single_task(KernelFunctor(StorageAcc, DataAcc, TestCase));
     });
 
     Base *Ptr = HostStorage.construct</* ret type = */ Base>(TestCase);
diff --git a/sycl/test-e2e/VirtualFunctions/2/1/1/more-complex-hierarchy.cpp b/sycl/test-e2e/VirtualFunctions/2/1/1/more-complex-hierarchy.cpp
index bb334972c3f77..d1c2c5fd092a9 100644
--- a/sycl/test-e2e/VirtualFunctions/2/1/1/more-complex-hierarchy.cpp
+++ b/sycl/test-e2e/VirtualFunctions/2/1/1/more-complex-hierarchy.cpp
@@ -45,6 +45,25 @@ class IncrementBy8 : public IncrementOp {
 
 void applyOp(int *Data, AbstractOp *Obj) { Obj->applyOp(Data); }
 
+template <typename T1, typename T2> struct KernelFunctor {
+  T1 mStorageAcc;
+  T2 mDataAcc;
+  unsigned mTestCase;
+  KernelFunctor(T1 &StorageAcc, T2 &DataAcc, unsigned TestCase)
+      : mStorageAcc(StorageAcc), mDataAcc(DataAcc), mTestCase(TestCase) {}
+
+  void operator()() const {
+    auto *Ptr = mStorageAcc[0].template construct</* ret type = */ AbstractOp>(
+        mTestCase);
+    applyOp(
+        mDataAcc.template get_multi_ptr<sycl::access::decorated::no>().get(),
+        Ptr);
+  }
+  auto get(oneapi::properties_tag) const {
+    return oneapi::properties{oneapi::assume_indirect_calls};
+  }
+};
+
 int main() try {
   using storage_t =
       obj_storage_t<IncrementBy1, IncrementBy2, IncrementBy4, IncrementBy8>;
@@ -59,7 +78,6 @@ int main() try {
 
   sycl::queue q(asyncHandler);
 
-  constexpr oneapi::properties props{oneapi::assume_indirect_calls};
   for (unsigned TestCase = 0; TestCase < 4; ++TestCase) {
     int HostData = 42;
     int Data = HostData;
@@ -68,12 +86,7 @@ int main() try {
     q.submit([&](sycl::handler &CGH) {
       sycl::accessor StorageAcc(DeviceStorage, CGH, sycl::write_only);
       sycl::accessor DataAcc(DataStorage, CGH, sycl::write_only);
-      CGH.single_task(props, [=]() {
-        auto *Ptr =
-            StorageAcc[0].construct</* ret type = */ AbstractOp>(TestCase);
-        applyOp(DataAcc.get_multi_ptr<sycl::access::decorated::no>().get(),
-                Ptr);
-      });
+      CGH.single_task(KernelFunctor(StorageAcc, DataAcc, TestCase));
     });
 
     auto *Ptr = HostStorage.construct</* ret type = */ AbstractOp>(TestCase);
diff --git a/sycl/test-e2e/VirtualFunctions/2/1/1/simple-hierarchy.cpp b/sycl/test-e2e/VirtualFunctions/2/1/1/simple-hierarchy.cpp
index 2bfb3dd0f010d..aad1e1ccecffa 100644
--- a/sycl/test-e2e/VirtualFunctions/2/1/1/simple-hierarchy.cpp
+++ b/sycl/test-e2e/VirtualFunctions/2/1/1/simple-hierarchy.cpp
@@ -30,6 +30,24 @@ class IncrementBy8 : public BaseIncrement {
   void increment(int *Data) override { *Data += 8; }
 };
 
+template <typename T1, typename T2> struct KernelFunctor {
+  T1 mStorageAcc;     // accessor whose element 0 is the object storage
+  T2 mDataAcc;        // accessor to the value passed to increment()
+  unsigned mTestCase; // index forwarded to construct<BaseIncrement>()
+  KernelFunctor(const T1 &StorageAcc, const T2 &DataAcc, unsigned TestCase)
+      : mStorageAcc(StorageAcc), mDataAcc(DataAcc), mTestCase(TestCase) {}
+  void operator()() const { // kernel body: construct, then virtual call
+    auto *Ptr =
+        mStorageAcc[0].template construct</* ret type = */ BaseIncrement>(
+            mTestCase);
+    Ptr->increment(
+        mDataAcc.template get_multi_ptr<sycl::access::decorated::no>().get());
+  }
+  auto get(oneapi::properties_tag) const { // properties for this kernel
+    return oneapi::properties{oneapi::assume_indirect_calls};
+  }
+};
+
 int main() try {
   using storage_t =
       obj_storage_t<BaseIncrement, IncrementBy2, IncrementBy4, IncrementBy8>;
@@ -44,7 +62,6 @@ int main() try {
 
   sycl::queue q(asyncHandler);
 
-  constexpr oneapi::properties props{oneapi::assume_indirect_calls};
   for (unsigned TestCase = 0; TestCase < 4; ++TestCase) {
     int HostData = 42;
     int Data = HostData;
@@ -53,12 +70,7 @@ int main() try {
     q.submit([&](sycl::handler &CGH) {
       sycl::accessor StorageAcc(DeviceStorage, CGH, sycl::write_only);
       sycl::accessor DataAcc(DataStorage, CGH, sycl::write_only);
-      CGH.single_task(props, [=]() {
-        auto *Ptr =
-            StorageAcc[0].construct</* ret type = */ BaseIncrement>(TestCase);
-        Ptr->increment(
-            DataAcc.get_multi_ptr<sycl::access::decorated::no>().get());
-      });
+      CGH.single_task(KernelFunctor(StorageAcc, DataAcc, TestCase));
     });
 
     auto *Ptr = HostStorage.construct</* ret type = */ BaseIncrement>(TestCase);
diff --git a/sycl/test-e2e/VirtualFunctions/2/2/single-construct-single-use.cpp b/sycl/test-e2e/VirtualFunctions/2/2/single-construct-single-use.cpp
index ccf0c77036085..467d4e5b006c1 100644
--- a/sycl/test-e2e/VirtualFunctions/2/2/single-construct-single-use.cpp
+++ b/sycl/test-e2e/VirtualFunctions/2/2/single-construct-single-use.cpp
@@ -57,6 +57,22 @@ class IncrementBy16 : public BaseIncrement {
   void increment(int *Data) override { *Data += 16 + Mod; }
 };
 
+template <typename T1, typename T2> struct KernelFunctor {
+  T1 mStorageAcc; // accessor whose element 0 is queried via getAs<>()
+  T2 mDataAcc;    // accessor to the value passed to increment()
+  KernelFunctor(const T1 &StorageAcc, const T2 &DataAcc)
+      : mStorageAcc(StorageAcc), mDataAcc(DataAcc) {}
+  void operator()() const { // kernel body: fetch object, then virtual call
+    auto *Ptr = mStorageAcc[0].template getAs<BaseIncrement>();
+    Ptr->increment(
+        mDataAcc.template get_multi_ptr<sycl::access::decorated::no>().get());
+  }
+  auto get(oneapi::properties_tag) const { // properties for this kernel
+    return oneapi::properties{
+        oneapi::assume_indirect_calls_to<void, SetIncBy16>};
+  }
+};
+
 int main() try {
   using storage_t = obj_storage_t<BaseIncrement, IncrementBy2, IncrementBy4,
                                   IncrementBy8, IncrementBy16>;
@@ -72,8 +88,6 @@ int main() try {
   sycl::queue q(asyncHandler);
 
   // TODO: cover uses case when objects are passed through USM
-  constexpr oneapi::properties props{
-      oneapi::assume_indirect_calls_to<void, SetIncBy16>};
   for (unsigned TestCase = 0; TestCase < 5; ++TestCase) {
     int HostData = 42;
     int Data = HostData;
@@ -90,11 +104,7 @@ int main() try {
     q.submit([&](sycl::handler &CGH) {
       sycl::accessor StorageAcc(DeviceStorage, CGH, sycl::read_write);
       sycl::accessor DataAcc(DataStorage, CGH, sycl::write_only);
-      CGH.single_task(props, [=]() {
-        auto *Ptr = StorageAcc[0].getAs<BaseIncrement>();
-        Ptr->increment(
-            DataAcc.get_multi_ptr<sycl::access::decorated::no>().get());
-      });
+      CGH.single_task(KernelFunctor(StorageAcc, DataAcc));
     });
 
     auto *Ptr =
diff --git a/sycl/test-e2e/VirtualFunctions/misc/math.cpp b/sycl/test-e2e/VirtualFunctions/misc/math.cpp
index 71b34c23cef1f..da0570ce0291e 100644
--- a/sycl/test-e2e/VirtualFunctions/misc/math.cpp
+++ b/sycl/test-e2e/VirtualFunctions/misc/math.cpp
@@ -40,6 +40,21 @@ class RoundOp : public BaseOp {
   virtual float apply(float V) { return sycl::round(V); }
 };
 
+template <typename T1, typename T2> struct KernelFunctor {
+  T1 mDataAcc;       // accessor whose element 0 is read and overwritten
+  T2 mDeviceStorage; // pointer-like handle used through ->getAs<BaseOp>()
+  KernelFunctor(const T1 &DataAcc, const T2 &DeviceStorage)
+      : mDataAcc(DataAcc), mDeviceStorage(DeviceStorage) {}
+  // Kernel body: run the stored op's apply() on element 0 in place.
+  void operator()() const {
+    auto *Ptr = mDeviceStorage->template getAs<BaseOp>();
+    mDataAcc[0] = Ptr->apply(mDataAcc[0]);
+  }
+  auto get(oneapi::properties_tag) const { // properties for this kernel
+    return oneapi::properties{oneapi::assume_indirect_calls};
+  }
+};
+
 int main() try {
   using storage_t = obj_storage_t<FloorOp, CeilOp, RoundOp>;
 
@@ -49,7 +64,6 @@ int main() try {
 
   auto *DeviceStorage = sycl::malloc_shared<storage_t>(1, q);
 
-  constexpr oneapi::properties props{oneapi::assume_indirect_calls};
   for (unsigned TestCase = 0; TestCase < 3; ++TestCase) {
     float HostData = 3.56;
     float Data = HostData;
@@ -63,10 +77,7 @@ int main() try {
 
     q.submit([&](sycl::handler &CGH) {
       sycl::accessor DataAcc(DataStorage, CGH, sycl::read_write);
-      CGH.single_task(props, [=]() {
-        auto *Ptr = DeviceStorage->getAs<BaseOp>();
-        DataAcc[0] = Ptr->apply(DataAcc[0]);
-      });
+      CGH.single_task(KernelFunctor(DataAcc, DeviceStorage));
     });
 
     auto *Ptr = HostStorage.construct</* ret type = */ BaseOp>(TestCase);
diff --git a/sycl/test-e2e/format.py b/sycl/test-e2e/format.py
index a69810145507d..29e89e759bb96 100644
--- a/sycl/test-e2e/format.py
+++ b/sycl/test-e2e/format.py
@@ -13,14 +13,11 @@
 import re
 
 
-def get_triple(test, backend):
+def get_triple(backend):  # backend name -> device target triple
     if backend == "cuda":
         return "nvptx64-nvidia-cuda"
     if backend == "hip":
-        if test.config.hip_platform == "NVIDIA":
-            return "nvptx64-nvidia-cuda"
-        else:
-            return "amdgcn-amd-amdhsa"
+        return "amdgcn-amd-amdhsa"  # HIP always maps to the AMD triple
     if backend == "native_cpu":
         return "native_cpu"
     return "spir64"
@@ -171,17 +168,27 @@ def execute(self, test, litConfig):
 
             for sycl_device in devices_for_test:
                 (backend, _) = sycl_device.split(":")
-                triples.add(get_triple(test, backend))
+                triples.add(get_triple(backend))
 
         substitutions = lit.TestRunner.getDefaultSubstitutions(test, tmpDir, tmpBase)
+
         substitutions.append(("%{sycl_triple}", format(",".join(triples))))
-        # -fsycl-targets is needed for CUDA/HIP, so just use it be default so
-        # -that new tests by default would runnable there (unless they have
-        # -other restrictions).
+
+        sycl_target_opts = "-fsycl-targets=%{sycl_triple}"
+        if get_triple("hip") in triples:
+            hip_arch_opts = (
+                " -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch={}".format(
+                    test.config.amd_arch
+                )
+            )
+            sycl_target_opts += hip_arch_opts
+            substitutions.append(("%{hip_arch_opts}", hip_arch_opts))
+        substitutions.append(("%{sycl_target_opts}", sycl_target_opts))
+
         substitutions.append(
             (
                 "%{build}",
-                "%clangxx -fsycl -fsycl-targets=%{sycl_triple} %verbose_print %s",
+                "%clangxx -fsycl %{sycl_target_opts} %verbose_print %s",
             )
         )
         if platform.system() == "Windows":
diff --git a/sycl/test-e2e/forward_progress/forward_progress_kernel_param_L0_gpu.cpp b/sycl/test-e2e/forward_progress/forward_progress_kernel_param_L0_gpu.cpp
index 003840a8c1299..b8b80b9541569 100644
--- a/sycl/test-e2e/forward_progress/forward_progress_kernel_param_L0_gpu.cpp
+++ b/sycl/test-e2e/forward_progress/forward_progress_kernel_param_L0_gpu.cpp
@@ -23,41 +23,42 @@ void check_props(sycl::queue &q) {}
 
 // Full specializations for each progress guarantee
 
+template <typename T> struct KernelFunctor {
+  T props; // property list handed back by get()
+  KernelFunctor(const T &Props) : props(Props) {}
+  auto get(properties_tag) const { return props; }
+  void operator()() const {} // empty kernel body
+};
+
 template <>
 void check_props<forward_progress_guarantee::parallel>(sycl::queue &q) {
   constexpr auto guarantee = forward_progress_guarantee::parallel;
   // Check properties at execution_scope::root_group coordination level
-  q.single_task(
-      properties{work_group_progress<guarantee, execution_scope::root_group>},
-      [=]() {});
-  q.single_task(
-      properties{sub_group_progress<guarantee, execution_scope::root_group>},
-      [=]() {});
+  q.single_task(KernelFunctor(
+      properties{work_group_progress<guarantee, execution_scope::root_group>}));
+  q.single_task(KernelFunctor(
+      properties{sub_group_progress<guarantee, execution_scope::root_group>}));
   try {
-    q.single_task(
-        properties{work_item_progress<guarantee, execution_scope::root_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(properties{
+        work_item_progress<guarantee, execution_scope::root_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
 
   // Check properties at execution_scope::work_group coordination level
-  q.single_task(
-      properties{sub_group_progress<guarantee, execution_scope::work_group>},
-      [=]() {});
+  q.single_task(KernelFunctor(
+      properties{sub_group_progress<guarantee, execution_scope::work_group>}));
   try {
-    q.single_task(
-        properties{work_item_progress<guarantee, execution_scope::work_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(properties{
+        work_item_progress<guarantee, execution_scope::work_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
 
   // Check properties at execution_scope::sub_group coordination level
   try {
-    q.single_task(
-        properties{work_item_progress<guarantee, execution_scope::sub_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(
+        properties{work_item_progress<guarantee, execution_scope::sub_group>}));
   } catch (sycl::exception &ex) {
   }
 }
@@ -66,66 +67,54 @@ template <>
 void check_props<forward_progress_guarantee::weakly_parallel>(sycl::queue &q) {
   constexpr auto guarantee = forward_progress_guarantee::weakly_parallel;
   // Check properties at execution_scope::root_group coordination level
-  q.single_task(
-      properties{work_group_progress<guarantee, execution_scope::root_group>},
-      [=]() {});
-  q.single_task(
-      properties{sub_group_progress<guarantee, execution_scope::root_group>},
-      [=]() {});
+  q.single_task(KernelFunctor(
+      properties{work_group_progress<guarantee, execution_scope::root_group>}));
+  q.single_task(KernelFunctor(
+      properties{sub_group_progress<guarantee, execution_scope::root_group>}));
 
-  q.single_task(
-      properties{work_item_progress<guarantee, execution_scope::root_group>},
-      [=]() {});
+  q.single_task(KernelFunctor(
+      properties{work_item_progress<guarantee, execution_scope::root_group>}));
 
   // Check properties at execution_scope::work_group coordination level
-  q.single_task(
-      properties{sub_group_progress<guarantee, execution_scope::work_group>},
-      [=]() {});
-  q.single_task(
-      properties{work_item_progress<guarantee, execution_scope::work_group>},
-      [=]() {});
+  q.single_task(KernelFunctor(
+      properties{sub_group_progress<guarantee, execution_scope::work_group>}));
+  q.single_task(KernelFunctor(
+      properties{work_item_progress<guarantee, execution_scope::work_group>}));
 
   // Check properties at execution_scope::sub_group coordination level
-  q.single_task(
-      properties{work_item_progress<guarantee, execution_scope::sub_group>},
-      [=]() {});
+  q.single_task(KernelFunctor(
+      properties{work_item_progress<guarantee, execution_scope::sub_group>}));
 }
 
 template <>
 void check_props<forward_progress_guarantee::concurrent>(sycl::queue &q) {
   constexpr auto guarantee = forward_progress_guarantee::concurrent;
   // Check properties at execution_scope::root_group coordination level
-  q.single_task(
-      properties{work_group_progress<guarantee, execution_scope::root_group>},
-      [=]() {});
-  q.single_task(
-      properties{sub_group_progress<guarantee, execution_scope::root_group>},
-      [=]() {});
+  q.single_task(KernelFunctor(
+      properties{work_group_progress<guarantee, execution_scope::root_group>}));
+  q.single_task(KernelFunctor(
+      properties{sub_group_progress<guarantee, execution_scope::root_group>}));
   try {
-    q.single_task(
-        properties{work_item_progress<guarantee, execution_scope::root_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(properties{
+        work_item_progress<guarantee, execution_scope::root_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
 
   // Check properties at execution_scope::work_group coordination level
-  q.single_task(
-      properties{sub_group_progress<guarantee, execution_scope::work_group>},
-      [=]() {});
+  q.single_task(KernelFunctor(
+      properties{sub_group_progress<guarantee, execution_scope::work_group>}));
   try {
-    q.single_task(
-        properties{work_item_progress<guarantee, execution_scope::work_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(properties{
+        work_item_progress<guarantee, execution_scope::work_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
 
   // Check properties at execution_scope::sub_group coordination level
   try {
-    q.single_task(
-        properties{work_item_progress<guarantee, execution_scope::sub_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(
+        properties{work_item_progress<guarantee, execution_scope::sub_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
diff --git a/sycl/test-e2e/forward_progress/forward_progress_kernel_param_ocl_cpu.cpp b/sycl/test-e2e/forward_progress/forward_progress_kernel_param_ocl_cpu.cpp
index ffdd99184d233..8647d42ee24c3 100644
--- a/sycl/test-e2e/forward_progress/forward_progress_kernel_param_ocl_cpu.cpp
+++ b/sycl/test-e2e/forward_progress/forward_progress_kernel_param_ocl_cpu.cpp
@@ -22,49 +22,50 @@ void check_props(sycl::queue &q) {}
 
 // Full specializations for each progress guarantee
 
+template <typename T> struct KernelFunctor {
+  T props; // property list handed back by get()
+  KernelFunctor(const T &Props) : props(Props) {}
+  auto get(properties_tag) const { return props; }
+  void operator()() const {} // empty kernel body
+};
+
 template <>
 void check_props<forward_progress_guarantee::parallel>(sycl::queue &q) {
   constexpr auto guarantee = forward_progress_guarantee::parallel;
   // Check properties at execution_scope::root_group coordination level
-  q.single_task(
-      properties{work_group_progress<guarantee, execution_scope::root_group>},
-      [=]() {});
+  q.single_task(KernelFunctor(
+      properties{work_group_progress<guarantee, execution_scope::root_group>}));
   try {
-    q.single_task(
-        properties{sub_group_progress<guarantee, execution_scope::root_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(properties{
+        sub_group_progress<guarantee, execution_scope::root_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
   try {
-    q.single_task(
-        properties{work_item_progress<guarantee, execution_scope::root_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(properties{
+        work_item_progress<guarantee, execution_scope::root_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
 
   // Check properties at execution_scope::work_group coordination level
   try {
-    q.single_task(
-        properties{sub_group_progress<guarantee, execution_scope::work_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(properties{
+        sub_group_progress<guarantee, execution_scope::work_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
   try {
-    q.single_task(
-        properties{work_item_progress<guarantee, execution_scope::work_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(properties{
+        work_item_progress<guarantee, execution_scope::work_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
 
   // Check properties at execution_scope::sub_group coordination level
   try {
-    q.single_task(
-        properties{work_item_progress<guarantee, execution_scope::sub_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(
+        properties{work_item_progress<guarantee, execution_scope::sub_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
@@ -74,29 +75,23 @@ template <>
 void check_props<forward_progress_guarantee::weakly_parallel>(sycl::queue &q) {
   constexpr auto guarantee = forward_progress_guarantee::weakly_parallel;
   // Check properties at execution_scope::root_group coordination level
-  q.single_task(
-      properties{work_group_progress<guarantee, execution_scope::root_group>},
-      [=]() {});
-  q.single_task(
-      properties{sub_group_progress<guarantee, execution_scope::root_group>},
-      [=]() {});
+  q.single_task(KernelFunctor(
+      properties{work_group_progress<guarantee, execution_scope::root_group>}));
+  q.single_task(KernelFunctor(
+      properties{sub_group_progress<guarantee, execution_scope::root_group>}));
 
-  q.single_task(
-      properties{work_item_progress<guarantee, execution_scope::root_group>},
-      [=]() {});
+  q.single_task(KernelFunctor(
+      properties{work_item_progress<guarantee, execution_scope::root_group>}));
 
   // Check properties at execution_scope::work_group coordination level
-  q.single_task(
-      properties{sub_group_progress<guarantee, execution_scope::work_group>},
-      [=]() {});
-  q.single_task(
-      properties{work_item_progress<guarantee, execution_scope::work_group>},
-      [=]() {});
+  q.single_task(KernelFunctor(
+      properties{sub_group_progress<guarantee, execution_scope::work_group>}));
+  q.single_task(KernelFunctor(
+      properties{work_item_progress<guarantee, execution_scope::work_group>}));
 
   // Check properties at execution_scope::sub_group coordination level
-  q.single_task(
-      properties{work_item_progress<guarantee, execution_scope::sub_group>},
-      [=]() {});
+  q.single_task(KernelFunctor(
+      properties{work_item_progress<guarantee, execution_scope::sub_group>}));
 }
 
 template <>
@@ -104,48 +99,42 @@ void check_props<forward_progress_guarantee::concurrent>(sycl::queue &q) {
   constexpr auto guarantee = forward_progress_guarantee::concurrent;
   // Check properties at execution_scope::root_group coordination level
   try {
-    q.single_task(
-        properties{work_group_progress<guarantee, execution_scope::root_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(properties{
+        work_group_progress<guarantee, execution_scope::root_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
   try {
-    q.single_task(
-        properties{sub_group_progress<guarantee, execution_scope::root_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(properties{
+        sub_group_progress<guarantee, execution_scope::root_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
   try {
-    q.single_task(
-        properties{work_item_progress<guarantee, execution_scope::root_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(properties{
+        work_item_progress<guarantee, execution_scope::root_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
 
   // Check properties at execution_scope::work_group coordination level
   try {
-    q.single_task(
-        properties{sub_group_progress<guarantee, execution_scope::work_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(properties{
+        sub_group_progress<guarantee, execution_scope::work_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
   try {
-    q.single_task(
-        properties{work_item_progress<guarantee, execution_scope::work_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(properties{
+        work_item_progress<guarantee, execution_scope::work_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
 
   // Check properties at execution_scope::sub_group coordination level
   try {
-    q.single_task(
-        properties{work_item_progress<guarantee, execution_scope::sub_group>},
-        [=]() {});
+    q.single_task(KernelFunctor(
+        properties{work_item_progress<guarantee, execution_scope::sub_group>}));
     assert(false && "Expected exception not seen!");
   } catch (sycl::exception &ex) {
   }
diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py
index a374cfaee402f..9ffb648c6bc7c 100644
--- a/sycl/test-e2e/lit.cfg.py
+++ b/sycl/test-e2e/lit.cfg.py
@@ -524,18 +524,6 @@ def open_check_file(file_name):
     if be not in available_devices or dev not in available_devices[be]:
         lit_config.error("Unsupported device {}".format(d))
 
-# If HIP_PLATFORM flag is not set, default to AMD, and check if HIP platform is supported
-supported_hip_platforms = ["AMD", "NVIDIA"]
-if config.hip_platform == "":
-    config.hip_platform = "AMD"
-if config.hip_platform not in supported_hip_platforms:
-    lit_config.error(
-        "Unknown HIP platform '"
-        + config.hip_platform
-        + "' supported platforms are "
-        + ", ".join(supported_hip_platforms)
-    )
-
 if "cuda:gpu" in config.sycl_devices:
     if "CUDA_PATH" not in os.environ:
         if platform.system() == "Windows":
@@ -697,8 +685,6 @@ def open_check_file(file_name):
 # discovered already.
 config.sycl_dev_features = {}
 
-# Architecture flag for compiling for AMD HIP devices. Empty otherwise.
-arch_flag = ""
 # Version of the driver for a given device. Empty for non-Intel devices.
 config.intel_driver_ver = {}
 for sycl_device in config.sycl_devices:
@@ -839,7 +825,7 @@ def open_check_file(file_name):
     # Use short names for LIT rules.
     features.add(be)
 
-    if be == "hip" and config.hip_platform == "AMD":
+    if be == "hip":
         if not config.amd_arch:
             # Guaranteed to be a single element in the set
             arch = [x for x in architecture_feature][0]
@@ -850,15 +836,9 @@ def open_check_file(file_name):
                 )
             config.amd_arch = arch.replace(amd_arch_prefix, "")
         llvm_config.with_system_environment("ROCM_PATH")
-        config.available_features.add("hip_amd")
-        arch_flag = (
-            "-Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=" + config.amd_arch
-        )
         config.substitutions.append(
             ("%rocm_path", os.environ.get("ROCM_PATH", "/opt/rocm"))
         )
-    elif be == "hip" and config.hip_platform == "NVIDIA":
-        config.available_features.add("hip_nvidia")
 
     config.sycl_dev_features[sycl_device] = features.union(config.available_features)
     if is_intel_driver:
@@ -871,10 +851,7 @@ def open_check_file(file_name):
     config.substitutions.append(("%clang", " true "))
 else:
     config.substitutions.append(
-        (
-            "%clangxx",
-            " " + config.dpcpp_compiler + " " + config.cxx_flags + " " + arch_flag,
-        )
+        ("%clangxx", " " + config.dpcpp_compiler + " " + config.cxx_flags)
     )
     config.substitutions.append(
         ("%clang", " " + config.dpcpp_compiler + " " + config.c_flags)
diff --git a/sycl/test-e2e/lit.site.cfg.py.in b/sycl/test-e2e/lit.site.cfg.py.in
index a6b86cb73d505..00928dd9141fc 100644
--- a/sycl/test-e2e/lit.site.cfg.py.in
+++ b/sycl/test-e2e/lit.site.cfg.py.in
@@ -30,7 +30,6 @@ config.igc_tag_file = os.path.join("/usr/local/lib/igc/", 'IGCTAG.txt')
 
 config.sycl_devices = lit_config.params.get("sycl_devices", "@SYCL_TEST_E2E_TARGETS@").split(';')
 
-config.hip_platform = "@HIP_PLATFORM@"
 config.amd_arch = lit_config.params.get("amd_arch", "@AMD_ARCH@")
 config.sycl_threads_lib = '@SYCL_THREADS_LIB@'
 config.extra_environment = lit_config.params.get("extra_environment", "@LIT_EXTRA_ENVIRONMENT@")
diff --git a/sycl/test-e2e/syclcompat/kernel/kernel_lin.cpp b/sycl/test-e2e/syclcompat/kernel/kernel_lin.cpp
index eca55f738d83a..d93a7880d404e 100644
--- a/sycl/test-e2e/syclcompat/kernel/kernel_lin.cpp
+++ b/sycl/test-e2e/syclcompat/kernel/kernel_lin.cpp
@@ -2,6 +2,6 @@
 // TODO: Supported for ROCM 5. Further development required to support AMDGPU.
 // UNSUPPORTED: hip
 
-// RUN: %clangxx -fPIC -shared -fsycl -fsycl-targets=%{sycl_triple} %S/Inputs/kernel_module.cpp -o %t.so
-// RUN: %clangxx -DTEST_SHARED_LIB='"%t.so"' -ldl -fsycl -fsycl-targets=%{sycl_triple} %S/Inputs/kernel_function.cpp -o %t.out
+// RUN: %clangxx -fPIC -shared -fsycl %{sycl_target_opts} %S/Inputs/kernel_module.cpp -o %t.so
+// RUN: %clangxx -DTEST_SHARED_LIB='"%t.so"' -ldl -fsycl %{sycl_target_opts} %S/Inputs/kernel_function.cpp -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/syclcompat/kernel/kernel_win.cpp b/sycl/test-e2e/syclcompat/kernel/kernel_win.cpp
index 02ec26ab78a48..85ecf5687ca63 100644
--- a/sycl/test-e2e/syclcompat/kernel/kernel_win.cpp
+++ b/sycl/test-e2e/syclcompat/kernel/kernel_win.cpp
@@ -3,6 +3,6 @@
 
 // DEFINE: %{sharedflag} = %if cl_options %{/clang:-shared%} %else %{-shared%}
 
-// RUN: %clangxx %{sharedflag} -fsycl -fsycl-targets=%{sycl_triple} %S\Inputs\kernel_module.cpp -o %t.dll
-// RUN: %clangxx -DTEST_SHARED_LIB='"%/t.dll"' -fsycl -fsycl-targets=%{sycl_triple} %S\Inputs\kernel_function.cpp -o %t.out
+// RUN: %clangxx %{sharedflag} -fsycl %{sycl_target_opts} %S\Inputs\kernel_module.cpp -o %t.dll
+// RUN: %clangxx -DTEST_SHARED_LIB='"%/t.dll"' -fsycl %{sycl_target_opts} %S\Inputs\kernel_function.cpp -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp b/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp
index 033f5c99d74e1..fa253b8f1666f 100644
--- a/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp
+++ b/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp
@@ -23,12 +23,9 @@
 // RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
 // RUN: %{run} %t.out
 
-// UNSUPPORTED: linux && opencl && (gpu-intel-gen12 || gpu-intel-dg2)
+// UNSUPPORTED: linux && opencl && (gpu-intel-gen12 || gpu-intel-dg2 || arch-intel_gpu_pvc)
 // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15275
 
-// XFAIL: arch-intel_gpu_pvc
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16401
-
 #include <sycl/detail/core.hpp>
 #include <sycl/ext/oneapi/properties/properties.hpp>
 #include <sycl/group_barrier.hpp>
diff --git a/sycl/test-e2e/syclcompat/math/math_emu_simd_from_syclomatic.cpp b/sycl/test-e2e/syclcompat/math/math_emu_simd_from_syclomatic.cpp
index b9b274aa2442b..6a850a887eb18 100644
--- a/sycl/test-e2e/syclcompat/math/math_emu_simd_from_syclomatic.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_emu_simd_from_syclomatic.cpp
@@ -29,7 +29,7 @@ void checkResult(const string &FuncName, const vector<unsigned int> &Inputs,
   for (size_t i = 1; i < Inputs.size(); ++i) {
     cout << ", " << Inputs[i];
   }
-  cout << ") = " << DeviceResult << " (expect " << Expect << ")";
+  cout << ") = " << DeviceResult << " (expect " << Expect << ")" << endl;
   assert(DeviceResult == Expect);
 }
 
@@ -43,19 +43,24 @@ void testVabs2Cases(const vector<pair<unsigned int, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_ct1 = TestCase.first;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_ct1 = TestCase.first;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vabs2(DeviceResult, TestCase_first_ct1);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vabs2", {TestCase.first}, TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vabs2(DeviceResult, TestCase_first_ct1);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vabs2", {TestCase.first}, TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vabs4(unsigned int *const DeviceResult, unsigned int Input1) {
@@ -68,19 +73,24 @@ void testVabs4Cases(const vector<pair<unsigned int, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_ct1 = TestCase.first;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_ct1 = TestCase.first;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vabs4(DeviceResult, TestCase_first_ct1);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vabs4", {TestCase.first}, TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vabs4(DeviceResult, TestCase_first_ct1);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vabs4", {TestCase.first}, TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vabsdiffs2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -95,22 +105,27 @@ void testVabsdiffs2Cases(
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vabsdiffs2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vabsdiffs2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vabsdiffs2(DeviceResult, TestCase_first_first_ct1,
+                         TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vabsdiffs2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vabsdiffs4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -125,22 +140,27 @@ void testVabsdiffs4Cases(
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vabsdiffs4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vabsdiffs4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vabsdiffs4(DeviceResult, TestCase_first_first_ct1,
+                         TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vabsdiffs4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vabsdiffu2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -155,22 +175,27 @@ void testVabsdiffu2Cases(
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vabsdiffu2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vabsdiffu2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vabsdiffu2(DeviceResult, TestCase_first_first_ct1,
+                         TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vabsdiffu2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vabsdiffu4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -185,22 +210,27 @@ void testVabsdiffu4Cases(
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vabsdiffu4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vabsdiffu4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vabsdiffu4(DeviceResult, TestCase_first_first_ct1,
+                         TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vabsdiffu4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vabsss2(unsigned int *const DeviceResult, unsigned int Input1) {
@@ -214,19 +244,25 @@ void testVabsss2Cases(
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_ct1 = TestCase.first;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_ct1 = TestCase.first;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vabsss2(DeviceResult, TestCase_first_ct1);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vabsss2", {TestCase.first}, TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vabsss2(DeviceResult, TestCase_first_ct1);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vabsss2", {TestCase.first}, TestCase.second,
+                  *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vabsss4(unsigned int *const DeviceResult, unsigned int Input1) {
@@ -240,19 +276,25 @@ void testVabsss4Cases(
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_ct1 = TestCase.first;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_ct1 = TestCase.first;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vabsss4(DeviceResult, TestCase_first_ct1);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vabsss4", {TestCase.first}, TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vabsss4(DeviceResult, TestCase_first_ct1);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vabsss4", {TestCase.first}, TestCase.second,
+                  *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vadd2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -266,22 +308,27 @@ void testVadd2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vadd2(DeviceResult, TestCase_first_first_ct1,
-                  TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vadd2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vadd2(DeviceResult, TestCase_first_first_ct1,
+                    TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vadd2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vadd4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -295,22 +342,27 @@ void testVadd4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vadd4(DeviceResult, TestCase_first_first_ct1,
-                  TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vadd4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vadd4(DeviceResult, TestCase_first_first_ct1,
+                    TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vadd4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vaddss2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -324,22 +376,27 @@ void testVaddss2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vaddss2(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vaddss2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vaddss2(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vaddss2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vaddss4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -353,22 +410,27 @@ void testVaddss4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vaddss4(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vaddss4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vaddss4(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vaddss4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vaddus2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -382,22 +444,27 @@ void testVaddus2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vaddus2(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vaddus2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vaddus2(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vaddus2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vaddus4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -411,22 +478,27 @@ void testVaddus4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vaddus4(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vaddus4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vaddus4(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vaddus4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vavgs2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -440,22 +512,27 @@ void testVavgs2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vavgs2(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vavgs2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vavgs2(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vavgs2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vavgs4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -469,22 +546,27 @@ void testVavgs4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vavgs4(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vavgs4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vavgs4(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vavgs4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vavgu2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -498,22 +580,27 @@ void testVavgu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vavgu2(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vavgu2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vavgu2(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vavgu2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vavgu4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -527,22 +614,27 @@ void testVavgu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vavgu4(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vavgu4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vavgu4(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vavgu4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpeq2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -556,22 +648,27 @@ void testVcmpeq2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpeq2(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpeq2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpeq2(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpeq2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpeq4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -585,22 +682,27 @@ void testVcmpeq4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpeq4(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpeq4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpeq4(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpeq4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpges2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -614,22 +716,27 @@ void testVcmpges2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpges2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpges2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpges2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpges2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpges4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -643,22 +750,27 @@ void testVcmpges4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpges4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpges4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpges4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpges4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpgeu2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -672,22 +784,27 @@ void testVcmpgeu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpgeu2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpgeu2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpgeu2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpgeu2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpgeu4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -701,22 +818,27 @@ void testVcmpgeu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpgeu4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpgeu4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpgeu4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpgeu4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpgts2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -730,22 +852,27 @@ void testVcmpgts2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpgts2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpgts2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpgts2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpgts2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpgts4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -759,22 +886,27 @@ void testVcmpgts4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpgts4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpgts4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpgts4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpgts4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpgtu2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -788,22 +920,27 @@ void testVcmpgtu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpgtu2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpgtu2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpgtu2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpgtu2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpgtu4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -817,22 +954,27 @@ void testVcmpgtu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpgtu4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpgtu4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpgtu4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpgtu4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmples2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -846,22 +988,27 @@ void testVcmples2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmples2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmples2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmples2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmples2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmples4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -875,22 +1022,27 @@ void testVcmples4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmples4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmples4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmples4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmples4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpleu2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -904,22 +1056,27 @@ void testVcmpleu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpleu2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpleu2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpleu2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpleu2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpleu4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -933,22 +1090,27 @@ void testVcmpleu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpleu4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpleu4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpleu4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpleu4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmplts2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -962,22 +1124,27 @@ void testVcmplts2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmplts2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmplts2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmplts2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmplts2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmplts4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -991,22 +1158,27 @@ void testVcmplts4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmplts4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmplts4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmplts4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmplts4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpltu2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1020,22 +1192,27 @@ void testVcmpltu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpltu2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpltu2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpltu2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpltu2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpltu4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1049,22 +1226,27 @@ void testVcmpltu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpltu4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpltu4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpltu4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpltu4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpne2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1078,22 +1260,27 @@ void testVcmpne2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpne2(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpne2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpne2(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpne2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vcmpne4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1107,22 +1294,27 @@ void testVcmpne4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vcmpne4(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vcmpne4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vcmpne4(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vcmpne4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vhaddu2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1136,22 +1328,27 @@ void testVhaddu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vhaddu2(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vhaddu2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vhaddu2(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vhaddu2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow any exception so the sycl::free below still runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vhaddu4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1165,22 +1362,27 @@ void testVhaddu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vhaddu4(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vhaddu4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vhaddu4(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vhaddu4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vmaxs2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1194,22 +1396,27 @@ void testVmaxs2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vmaxs2(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vmaxs2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vmaxs2(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vmaxs2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vmaxs4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1223,22 +1430,27 @@ void testVmaxs4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vmaxs4(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vmaxs4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vmaxs4(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vmaxs4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vmaxu2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1252,22 +1464,27 @@ void testVmaxu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vmaxu2(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vmaxu2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vmaxu2(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vmaxu2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vmaxu4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1281,22 +1498,27 @@ void testVmaxu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vmaxu4(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vmaxu4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vmaxu4(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vmaxu4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vmins2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1310,22 +1532,27 @@ void testVmins2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vmins2(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vmins2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vmins2(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vmins2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vmins4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1339,22 +1566,27 @@ void testVmins4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vmins4(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vmins4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vmins4(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vmins4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vminu2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1368,22 +1600,27 @@ void testVminu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vminu2(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vminu2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vminu2(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vminu2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vminu4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1397,22 +1634,27 @@ void testVminu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vminu4(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vminu4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vminu4(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vminu4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vneg2(unsigned int *const DeviceResult, unsigned int Input1) {
@@ -1425,19 +1667,24 @@ void testVneg2Cases(const vector<pair<unsigned int, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_ct1 = TestCase.first;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_ct1 = TestCase.first;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vneg2(DeviceResult, TestCase_first_ct1);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vneg2", {TestCase.first}, TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vneg2(DeviceResult, TestCase_first_ct1);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vneg2", {TestCase.first}, TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vneg4(unsigned int *const DeviceResult, unsigned int Input1) {
@@ -1450,19 +1697,24 @@ void testVneg4Cases(const vector<pair<unsigned int, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_ct1 = TestCase.first;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_ct1 = TestCase.first;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vneg4(DeviceResult, TestCase_first_ct1);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vneg4", {TestCase.first}, TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vneg4(DeviceResult, TestCase_first_ct1);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vneg4", {TestCase.first}, TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vnegss2(unsigned int *const DeviceResult, unsigned int Input1) {
@@ -1476,19 +1728,25 @@ void testVnegss2Cases(
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_ct1 = TestCase.first;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_ct1 = TestCase.first;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vnegss2(DeviceResult, TestCase_first_ct1);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vnegss2", {TestCase.first}, TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vnegss2(DeviceResult, TestCase_first_ct1);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vnegss2", {TestCase.first}, TestCase.second,
+                  *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vnegss4(unsigned int *const DeviceResult, unsigned int Input1) {
@@ -1502,19 +1760,25 @@ void testVnegss4Cases(
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_ct1 = TestCase.first;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_ct1 = TestCase.first;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vnegss4(DeviceResult, TestCase_first_ct1);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vnegss4", {TestCase.first}, TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vnegss4(DeviceResult, TestCase_first_ct1);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vnegss4", {TestCase.first}, TestCase.second,
+                  *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsads2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1528,22 +1792,27 @@ void testVsads2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsads2(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsads2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsads2(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsads2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsads4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1557,22 +1826,27 @@ void testVsads4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsads4(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsads4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsads4(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsads4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsadu2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1586,22 +1860,27 @@ void testVsadu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsadu2(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsadu2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsadu2(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsadu2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsadu4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1615,22 +1894,27 @@ void testVsadu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsadu4(DeviceResult, TestCase_first_first_ct1,
-                   TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsadu4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsadu4(DeviceResult, TestCase_first_first_ct1,
+                     TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsadu4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vseteq2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1644,22 +1928,27 @@ void testVseteq2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vseteq2(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vseteq2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vseteq2(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vseteq2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vseteq4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1673,22 +1962,27 @@ void testVseteq4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vseteq4(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vseteq4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vseteq4(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vseteq4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetges2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1702,22 +1996,27 @@ void testVsetges2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetges2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetges2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetges2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetges2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetges4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1731,22 +2030,27 @@ void testVsetges4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetges4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetges4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetges4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetges4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetgeu2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1760,22 +2064,27 @@ void testVsetgeu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetgeu2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetgeu2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetgeu2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetgeu2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetgeu4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1789,22 +2098,27 @@ void testVsetgeu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetgeu4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetgeu4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetgeu4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetgeu4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetgts2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1818,22 +2132,27 @@ void testVsetgts2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetgts2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetgts2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetgts2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetgts2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetgts4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1847,22 +2166,27 @@ void testVsetgts4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetgts4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetgts4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetgts4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetgts4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetgtu2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1876,22 +2200,27 @@ void testVsetgtu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetgtu2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetgtu2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetgtu2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetgtu2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetgtu4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1905,22 +2234,27 @@ void testVsetgtu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetgtu4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetgtu4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetgtu4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetgtu4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetles2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1934,22 +2268,27 @@ void testVsetles2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetles2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetles2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetles2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetles2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetles4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1963,22 +2302,27 @@ void testVsetles4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetles4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetles4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetles4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetles4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetleu2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -1992,22 +2336,27 @@ void testVsetleu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetleu2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetleu2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetleu2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetleu2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetleu4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -2021,22 +2370,27 @@ void testVsetleu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetleu4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetleu4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetleu4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetleu4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetlts2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -2050,22 +2404,27 @@ void testVsetlts2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetlts2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetlts2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetlts2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetlts2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetlts4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -2079,22 +2438,27 @@ void testVsetlts4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetlts4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetlts4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetlts4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetlts4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetltu2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -2108,22 +2472,27 @@ void testVsetltu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetltu2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetltu2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetltu2(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetltu2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetltu4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -2137,22 +2506,27 @@ void testVsetltu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetltu4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetltu4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetltu4(DeviceResult, TestCase_first_first_ct1,
+                       TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetltu4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetne2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -2166,22 +2540,27 @@ void testVsetne2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetne2(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetne2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetne2(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetne2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsetne4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -2195,22 +2574,27 @@ void testVsetne4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try {
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsetne4(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsetne4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsetne4(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw();
+      checkResult("__vsetne4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Intentionally left empty to make sure allocated memory can be freed.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsub2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -2224,22 +2608,27 @@ void testVsub2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try { // Any exception below is caught so the final sycl::free still runs.
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsub2(DeviceResult, TestCase_first_first_ct1,
-                  TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsub2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsub2(DeviceResult, TestCase_first_first_ct1,
+                    TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw(); // Wait for the kernel before reading the result.
+      checkResult("__vsub2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow all exceptions so the sycl::free below always runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsub4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -2253,22 +2642,27 @@ void testVsub4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try { // Any exception below is caught so the final sycl::free still runs.
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsub4(DeviceResult, TestCase_first_first_ct1,
-                  TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsub4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsub4(DeviceResult, TestCase_first_first_ct1,
+                    TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw(); // Wait for the kernel before reading the result.
+      checkResult("__vsub4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow all exceptions so the sycl::free below always runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsubss2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -2282,22 +2676,27 @@ void testVsubss2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try { // Any exception below is caught so the final sycl::free still runs.
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsubss2(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsubss2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsubss2(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw(); // Wait for the kernel before reading the result.
+      checkResult("__vsubss2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow all exceptions so the sycl::free below always runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsubss4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -2311,22 +2710,27 @@ void testVsubss4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try { // Any exception below is caught so the final sycl::free still runs.
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsubss4(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsubss4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsubss4(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw(); // Wait for the kernel before reading the result.
+      checkResult("__vsubss4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow all exceptions so the sycl::free below always runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsubus2(unsigned int *const DeviceResult, unsigned int Input1,
@@ -2340,22 +2744,27 @@ void testVsubus2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try { // Any exception below is caught so the final sycl::free still runs.
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsubus2(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsubus2", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsubus2(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw(); // Wait for the kernel before reading the result.
+      checkResult("__vsubus2", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow all exceptions so the sycl::free below always runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 void vsubus4(unsigned int *const DeviceResult, unsigned int Input1,
@@ -2369,22 +2778,27 @@ void testVsubus4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
   unsigned int *DeviceResult;
   DeviceResult =
       (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  for (const auto &TestCase : TestCases) {
-    q_ct1.submit([&](sycl::handler &cgh) {
-      auto TestCase_first_first_ct1 = TestCase.first.first;
-      auto TestCase_first_second_ct2 = TestCase.first.second;
+  try { // Any exception below is caught so the final sycl::free still runs.
+    for (const auto &TestCase : TestCases) {
+      q_ct1.submit([&](sycl::handler &cgh) {
+        auto TestCase_first_first_ct1 = TestCase.first.first;
+        auto TestCase_first_second_ct2 = TestCase.first.second;
 
-      cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item_ct1) {
-            vsubus4(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-          });
-    });
-    q_ct1.wait();
-    checkResult("__vsubus4", {TestCase.first.first, TestCase.first.second},
-                TestCase.second, *DeviceResult);
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
+            [=](sycl::nd_item<3> item_ct1) {
+              vsubus4(DeviceResult, TestCase_first_first_ct1,
+                      TestCase_first_second_ct2);
+            });
+      });
+      q_ct1.wait_and_throw(); // Wait for the kernel before reading the result.
+      checkResult("__vsubus4", {TestCase.first.first, TestCase.first.second},
+                  TestCase.second, *DeviceResult);
+    }
+  } catch (...) {
+    // Deliberately swallow all exceptions so the sycl::free below always runs.
   }
+  sycl::free(DeviceResult, q_ct1);
 }
 
 int main() {
diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt
index b0b0629cffbd6..c0a02a874b962 100644
--- a/sycl/test/CMakeLists.txt
+++ b/sycl/test/CMakeLists.txt
@@ -68,10 +68,10 @@ add_lit_testsuite(check-sycl-deploy "Running the SYCL regression tests"
 set_target_properties(check-sycl-deploy PROPERTIES FOLDER "SYCL tests")
 
 set(TRIPLES "spir64-unknown-unknown")
-if (SYCL_BUILD_BACKEND_CUDA OR (SYCL_BUILD_BACKEND_HIP AND "${SYCL_BUILD_PI_HIP_PLATFORM}" STREQUAL "NVIDIA"))
+if (SYCL_BUILD_BACKEND_CUDA)
   set(TRIPLES "${TRIPLES},nvptx64-nvidia-cuda")
 endif()
-if ((SYCL_BUILD_BACKEND_HIP AND "${SYCL_BUILD_PI_HIP_PLATFORM}" STREQUAL "AMD"))
+if (SYCL_BUILD_BACKEND_HIP)
   set(TRIPLES "${TRIPLES},amdgcn-amd-amdhsa")
 endif()
 
diff --git a/sycl/test/basic_tests/macros.cpp b/sycl/test/basic_tests/macros.cpp
index 9eb09d27b1b9e..e9163d4677606 100644
--- a/sycl/test/basic_tests/macros.cpp
+++ b/sycl/test/basic_tests/macros.cpp
@@ -1,12 +1,12 @@
 // RUN: %clangxx %fsycl-host-only -dM -E %s -o %t.host
 // RUN:                %clangxx -fsycl -fsycl-targets=spir64-unknown-unknown -fsycl-device-only -dM -E %s -o %t.device.spirv
 // RUN: %if cuda    %{ %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda    -fsycl-device-only -dM -E %s -o %t.device.cuda %}
-// RUN: %if hip_amd %{ %clangxx -fsycl -fsycl-targets=amdgcn-amd-amdhsa      -fsycl-device-only -dM -E %s -o %t.device.hip %}
+// RUN: %if hip     %{ %clangxx -fsycl -fsycl-targets=amdgcn-amd-amdhsa      -fsycl-device-only -dM -E %s -o %t.device.hip %}
 //
 // RUN: FileCheck %s < %t.host --check-prefixes=COMMON --implicit-check-not=__SPIRV
 // RUN:                FileCheck %s < %t.device.spirv --check-prefixes=DEVICE,COMMON --implicit-check-not=__SPIRV
 // RUN: %if cuda    %{ FileCheck %s < %t.device.cuda  --check-prefixes=DEVICE,COMMON --implicit-check-not=__SPIRV %}
-// RUN: %if hip_amd %{ FileCheck %s < %t.device.hip   --check-prefixes=DEVICE,COMMON --implicit-check-not=__SPIRV %}
+// RUN: %if hip     %{ FileCheck %s < %t.device.hip   --check-prefixes=DEVICE,COMMON --implicit-check-not=__SPIRV %}
 //
 // FIXME: we should also check that we don't leak __SYCL* and SYCL* macro from
 //        our header files.
diff --git a/sycl/test/basic_tests/macros_no_rdc.cpp b/sycl/test/basic_tests/macros_no_rdc.cpp
index aed25568e6a6e..22a48012a9006 100644
--- a/sycl/test/basic_tests/macros_no_rdc.cpp
+++ b/sycl/test/basic_tests/macros_no_rdc.cpp
@@ -2,17 +2,17 @@
 // RUN: %clangxx %fsycl-host-only -fno-sycl-rdc -E -dD %s -o %t.host
 // RUN:                %clangxx -fsycl -fsycl-targets=spir64-unknown-unknown -fsycl-device-only -E -dD -fno-sycl-rdc %s -o %t.device.spirv
 // RUN: %if cuda    %{ %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda    -fsycl-device-only -E -dD -fno-sycl-rdc %s -o %t.device.cuda %}
-// RUN: %if hip_amd %{ %clangxx -fsycl -fsycl-targets=amdgcn-amd-amdhsa      -fsycl-device-only -E -dD -fno-sycl-rdc %s -o %t.device.hip  %}
+// RUN: %if hip     %{ %clangxx -fsycl -fsycl-targets=amdgcn-amd-amdhsa      -fsycl-device-only -E -dD -fno-sycl-rdc %s -o %t.device.hip  %}
 //
 // RUN: FileCheck --match-full-lines %s < %t.host --check-prefixes=HOST
 // RUN:                FileCheck --match-full-lines %s < %t.device.spirv --check-prefixes=DEVICE-FULL-LINE --implicit-check-not="#define SYCL_EXTERNAL"
 // RUN: %if cuda    %{ FileCheck --match-full-lines %s < %t.device.cuda  --check-prefixes=DEVICE-FULL-LINE --implicit-check-not="#define SYCL_EXTERNAL" %}
-// RUN: %if hip_amd %{ FileCheck --match-full-lines %s < %t.device.hip   --check-prefixes=DEVICE-FULL-LINE --implicit-check-not="#define SYCL_EXTERNAL" %}
+// RUN: %if hip     %{ FileCheck --match-full-lines %s < %t.device.hip   --check-prefixes=DEVICE-FULL-LINE --implicit-check-not="#define SYCL_EXTERNAL" %}
 //
 // Remove __DPCPP_SYCL_EXTERNAL to simplify regex for DEVICE prefix
 // RUN:                sed 's|__DPCPP_SYCL_EXTERNAL||g' %t.device.spirv | FileCheck %s --check-prefixes=DEVICE
 // RUN: %if cuda    %{ sed 's|__DPCPP_SYCL_EXTERNAL||g' %t.device.cuda  | FileCheck %s --check-prefixes=DEVICE %}
-// RUN: %if hip_amd %{ sed 's|__DPCPP_SYCL_EXTERNAL||g' %t.device.hip   | FileCheck %s --check-prefixes=DEVICE %}
+// RUN: %if hip     %{ sed 's|__DPCPP_SYCL_EXTERNAL||g' %t.device.hip   | FileCheck %s --check-prefixes=DEVICE %}
 // RUN:
 //
 // With -fno-sycl-rdc, device code should not define or use SYCL_EXTERNAL
diff --git a/sycl/test/check_device_code/atomic_ref.cpp b/sycl/test/check_device_code/atomic_ref.cpp
index 79b12590e1a5d..648ae01ddb192 100644
--- a/sycl/test/check_device_code/atomic_ref.cpp
+++ b/sycl/test/check_device_code/atomic_ref.cpp
@@ -6,8 +6,8 @@
 // CHECK-LABEL: define dso_local spir_func noundef i32 @_Z17atomic_ref_globalRi(
 // CHECK-SAME: ptr addrspace(4) noundef align 4 dereferenceable(4) [[I:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !srcloc [[META6:![0-9]+]] !sycl_fixed_targets [[META7:![0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[CALL_I_I_I_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z33__spirv_GenericCastToPtr_ToGlobalPvi(ptr addrspace(4) noundef align 4 dereferenceable(4) [[I]], i32 noundef 5) #[[ATTR3:[0-9]+]]
-// CHECK-NEXT:    [[CALL3_I_I:%.*]] = tail call spir_func noundef i32 @_Z18__spirv_AtomicLoadPU3AS1KiN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagE(ptr addrspace(1) noundef [[CALL_I_I_I_I_I_I]], i32 noundef 1, i32 noundef 898) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT:    [[TMP:%.*]] = addrspacecast ptr addrspace(4) [[I]] to ptr addrspace(1)
+// CHECK-NEXT:    [[CALL3_I_I:%.*]] = tail call spir_func noundef i32 @_Z18__spirv_AtomicLoadPU3AS1KiN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagE(ptr addrspace(1) noundef [[TMP]], i32 noundef 1, i32 noundef 898) #[[ATTR4:[0-9]+]]
 // CHECK-NEXT:    ret i32 [[CALL3_I_I]]
 //
 SYCL_EXTERNAL auto atomic_ref_global(int &i) {
diff --git a/sycl/test/check_device_code/extensions/address_cast.cpp b/sycl/test/check_device_code/extensions/address_cast.cpp
index bc401e3c72fce..6b3bacc2c424d 100644
--- a/sycl/test/check_device_code/extensions/address_cast.cpp
+++ b/sycl/test/check_device_code/extensions/address_cast.cpp
@@ -13,29 +13,29 @@ using namespace sycl::ext::oneapi::experimental;
 
 namespace static_as_cast {
 // CHECK-LABEL: define dso_local spir_func void @_ZN14static_as_cast19to_global_decoratedEN4sycl3_V19multi_ptrIiLNS1_6access13address_spaceE6ELNS3_9decoratedE1EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::multi_ptr.0") align 8 [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !srcloc [[META6:![0-9]+]] !sycl_fixed_targets [[META7:![0-9]+]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::multi_ptr.0") align 8 [[P:%.*]])
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[P]], align 8, !tbaa [[TBAA8:![0-9]+]]
 // CHECK-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
-// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z33__spirv_GenericCastToPtr_ToGlobalPvi(ptr addrspace(4) noundef [[TMP1]], i32 noundef 5) #[[ATTR5:[0-9]+]]
-// CHECK-NEXT:    store ptr addrspace(1) [[CALL_I_I_I_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA12:![0-9]+]], !alias.scope [[META14:![0-9]+]]
+// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(1)
+// CHECK-NEXT:    store ptr addrspace(1) [[TMP2]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA12:![0-9]+]], !alias.scope [[META14:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto to_global_decorated(decorated_generic_ptr<int> p) {
   return static_address_cast<access::address_space::global_space>(p);
 }
 // CHECK-LABEL: define dso_local spir_func void @_ZN14static_as_cast23to_global_not_decoratedEPi(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.1") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr addrspace(4) noundef [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] !srcloc [[META19:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.1") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr addrspace(4) noundef [[P:%.*]])
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[CALL_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z33__spirv_GenericCastToPtr_ToGlobalPvi(ptr addrspace(4) noundef [[P]], i32 noundef 5) #[[ATTR5]]
-// CHECK-NEXT:    store ptr addrspace(1) [[CALL_I_I_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA20:![0-9]+]], !alias.scope [[META22:![0-9]+]]
+// CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(4) [[P]] to ptr addrspace(1)
+// CHECK-NEXT:    store ptr addrspace(1) [[TMP0]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA20:![0-9]+]], !alias.scope [[META22:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto to_global_not_decorated(int *p) {
   return static_address_cast<access::address_space::global_space>(p);
 }
 // CHECK-LABEL: define dso_local spir_func void @_ZN14static_as_cast20to_generic_decoratedEN4sycl3_V19multi_ptrIiLNS1_6access13address_spaceE6ELNS3_9decoratedE1EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.0") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::multi_ptr.0") align 8 [[P:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] !srcloc [[META25:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.0") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::multi_ptr.0")
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[P]], align 8, !tbaa [[TBAA8]]
 // CHECK-NEXT:    store i64 [[TMP0]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA8]], !alias.scope [[META26:![0-9]+]]
@@ -45,7 +45,7 @@ SYCL_EXTERNAL auto to_generic_decorated(decorated_generic_ptr<int> p) {
   return static_address_cast<access::address_space::generic_space>(p);
 }
 // CHECK-LABEL: define dso_local spir_func void @_ZN14static_as_cast24to_generic_not_decoratedEPi(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.2") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr addrspace(4) noundef [[P:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] !srcloc [[META29:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.2") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr addrspace(4) noundef [[P:%.*]])
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    store ptr addrspace(4) [[P]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA30:![0-9]+]], !alias.scope [[META32:![0-9]+]]
 // CHECK-NEXT:    ret void
@@ -55,7 +55,7 @@ SYCL_EXTERNAL auto to_generic_not_decorated(int *p) {
 }
 
 // CHECK-LABEL: define dso_local spir_func void @_ZN14static_as_cast16to_global_deviceEPi(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.3") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr addrspace(4) noundef [[P:%.*]]) local_unnamed_addr #[[ATTR4]] !srcloc [[META35:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.3") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr addrspace(4) noundef [[P:%.*]])
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(4) [[P]] to ptr addrspace(5)
 // CHECK-NEXT:    store ptr addrspace(5) [[TMP0]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA36:![0-9]+]], !alias.scope [[META38:![0-9]+]]
@@ -66,7 +66,7 @@ SYCL_EXTERNAL auto to_global_device(int *p) {
 }
 
 // CHECK-LABEL: define dso_local spir_func void @_ZN14static_as_cast14to_global_hostEPi(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.4") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr addrspace(4) noundef [[P:%.*]]) local_unnamed_addr #[[ATTR4]] !srcloc [[META41:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.4") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr addrspace(4) noundef [[P:%.*]])
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(4) [[P]] to ptr addrspace(6)
 // CHECK-NEXT:    store ptr addrspace(6) [[TMP0]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA42:![0-9]+]], !alias.scope [[META44:![0-9]+]]
@@ -79,11 +79,11 @@ SYCL_EXTERNAL auto to_global_host(int *p) {
 
 namespace dynamic_as_cast {
 // CHECK-LABEL: define dso_local spir_func void @_ZN15dynamic_as_cast19to_global_decoratedEN4sycl3_V19multi_ptrIiLNS1_6access13address_spaceE6ELNS3_9decoratedE1EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::multi_ptr.0") align 8 [[P:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META47:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::multi_ptr.0") align 8 [[P:%.*]])
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[P]], align 8, !tbaa [[TBAA8]]
 // CHECK-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
-// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef [[TMP1]], i32 noundef 5) #[[ATTR5]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef [[TMP1]], i32 noundef 5)
 // CHECK-NEXT:    store ptr addrspace(1) [[CALL_I_I_I_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA12]], !alias.scope [[META48:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
@@ -91,9 +91,9 @@ SYCL_EXTERNAL auto to_global_decorated(decorated_generic_ptr<int> p) {
   return dynamic_address_cast<access::address_space::global_space>(p);
 }
 // CHECK-LABEL: define dso_local spir_func void @_ZN15dynamic_as_cast23to_global_not_decoratedEPi(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.1") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr addrspace(4) noundef [[P:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META53:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.1") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr addrspace(4) noundef [[P:%.*]])
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[CALL_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef [[P]], i32 noundef 5) #[[ATTR5]]
+// CHECK-NEXT:    [[CALL_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef [[P]], i32 noundef 5)
 // CHECK-NEXT:    store ptr addrspace(1) [[CALL_I_I_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA20]], !alias.scope [[META54:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
@@ -101,7 +101,7 @@ SYCL_EXTERNAL auto to_global_not_decorated(int *p) {
   return dynamic_address_cast<access::address_space::global_space>(p);
 }
 // CHECK-LABEL: define dso_local spir_func void @_ZN15dynamic_as_cast20to_generic_decoratedEN4sycl3_V19multi_ptrIiLNS1_6access13address_spaceE6ELNS3_9decoratedE1EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.0") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::multi_ptr.0") align 8 [[P:%.*]]) local_unnamed_addr #[[ATTR3]] !srcloc [[META57:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.0") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::multi_ptr.0") align 8 [[P:%.*]])
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[P]], align 8, !tbaa [[TBAA8]]
 // CHECK-NEXT:    store i64 [[TMP0]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA8]], !alias.scope [[META58:![0-9]+]]
@@ -111,7 +111,7 @@ SYCL_EXTERNAL auto to_generic_decorated(decorated_generic_ptr<int> p) {
   return dynamic_address_cast<access::address_space::generic_space>(p);
 }
 // CHECK-LABEL: define dso_local spir_func void @_ZN15dynamic_as_cast24to_generic_not_decoratedEPi(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.2") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr addrspace(4) noundef [[P:%.*]]) local_unnamed_addr #[[ATTR4]] !srcloc [[META61:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::multi_ptr.2") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr addrspace(4) noundef [[P:%.*]])
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    store ptr addrspace(4) [[P]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA30]], !alias.scope [[META62:![0-9]+]]
 // CHECK-NEXT:    ret void
diff --git a/sycl/test/check_device_code/native_cpu/vectorization.cpp b/sycl/test/check_device_code/native_cpu/vectorization.cpp
index 12b8a21cc069e..a9f571c9225db 100644
--- a/sycl/test/check_device_code/native_cpu/vectorization.cpp
+++ b/sycl/test/check_device_code/native_cpu/vectorization.cpp
@@ -18,9 +18,9 @@ int main() {
   sycl::range<1> r(1);
   deviceQueue.submit([&](sycl::handler &h) {
     h.parallel_for<Test1>(r, [=](sycl::id<1> id) { acc[id[0]] = 42; });
-    // CHECK-DEFAULT: store <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
-    // CHECK-16: store <16 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
-    // CHECK-4: store <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+    // CHECK-DEFAULT: store <8 x i32> splat (i32 42)
+    // CHECK-16: store <16 x i32> splat (i32 42)
+    // CHECK-4: store <4 x i32> splat (i32 42)
     // CHECK-O0: store i32 42
     // CHECK-O0-NOT: store <{{.*}}>
     // CHECK-DISABLE: store i32 42
diff --git a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
index 1de08c8f0c495..2d42dd7f1fae9 100644
--- a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
+++ b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
@@ -54,7 +54,7 @@
 // tests to match the required format and in that case you should just update
 // (i.e. reduce) the number and the list below.
 //
-// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 415
+// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 414
 //
 // List of improperly UNSUPPORTED tests.
 // Remove the CHECK once the test has been properly UNSUPPORTED.
@@ -62,7 +62,6 @@
 // CHECK: AOT/early_aot.cpp
 // CHECK-NEXT: AOT/gpu.cpp
 // CHECK-NEXT: AOT/multiple-devices.cpp
-// CHECK-NEXT: Adapters/enqueue-arg-order-buffer.cpp
 // CHECK-NEXT: Adapters/enqueue-arg-order-image.cpp
 // CHECK-NEXT: Adapters/enqueue-arg-order-image.cpp
 // CHECK-NEXT: Adapters/interop-l0-direct.cpp
diff --git a/sycl/test/lit.cfg.py b/sycl/test/lit.cfg.py
index 788d9ab37a9e3..089395d5c1400 100644
--- a/sycl/test/lit.cfg.py
+++ b/sycl/test/lit.cfg.py
@@ -168,7 +168,7 @@
 
 if "amdgcn-amd-amdhsa" in triple:
     llvm_config.with_system_environment("ROCM_PATH")
-    config.available_features.add("hip_amd")
+    config.available_features.add("hip")
     # For AMD the specific GPU has to be specified with --offload-arch
     if not any([f.startswith("--offload-arch") for f in additional_flags]):
         # If the offload arch wasn't specified in SYCL_CLANG_EXTRA_FLAGS,