DPCT1065

Message

Consider replacing sycl::<...>::barrier() with sycl::<...>::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.

Detailed Help

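The function sycl::<...>::barrier() ensures correct memory access ordering in both the global and local address spaces. If the kernel function does not access global memory, it is safe to replace sycl::<...>::barrier() with sycl::<...>::barrier(sycl::access::fence_space::local_space) for better performance. The replacement is also safe when the kernel does access global memory, provided that, among the work-items of the same work-group, no global memory access after the barrier depends on (read-after-write, write-after-read, or write-after-write) an access to the same global memory before the barrier.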

Suggestions to Fix

Replace sycl::<...>::barrier() with sycl::<...>::barrier(sycl::access::fence_space::local_space).

For example, this original CUDA* code:

 1struct Data_t {
 2  float *host_data;
 3  float *device_data;
 4};
 5
 6__global__ void k(Data_t *data) {
 7  auto tid = threadIdx.x + blockDim.x * blockIdx.x;
 8  only_read_data(data[tid].device_data);
 9  __syncthreads();
10  only_read_data(data[tid].device_data);
11}

results in the following migrated SYCL* code:

 1struct Data_t {
 2  float *host_data;
 3  float *device_data;
 4};
 5
 6void k(Data_t *data, const sycl::nd_item<3> &item_ct1) {
 7  auto tid = item_ct1.get_local_id(2) +
 8             item_ct1.get_local_range(2) * item_ct1.get_group(2);
 9  only_read_data(data[tid].device_data);
10  /*
11  DPCT1065:0: Consider replacing sycl::nd_item::barrier() with
12  sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
13  performance if there is no access to global memory.
14  */
15  item_ct1.barrier();
16  only_read_data(data[tid].device_data);
17}

which is rewritten to:

 1struct Data_t {
 2  float *host_data;
 3  float *device_data;
 4};
 5
 6void k(Data_t *data, const sycl::nd_item<3> &item_ct1) {
 7  auto tid = item_ct1.get_local_id(2) +
 8             item_ct1.get_local_range(2) * item_ct1.get_group(2);
 9  only_read_data(data[tid].device_data);
10  // global_local_space can be replaced with local_space if, among the
11  // work-items of the current work-group, no global memory access after the
12  // barrier depends on (read-after-write, write-after-read, or write-after-write)
13  // an access to the same global memory before the barrier.
14  item_ct1.barrier(sycl::access::fence_space::local_space);
15  only_read_data(data[tid].device_data);
16}