DPCT1113#

Message#

Consider replacing sycl::nd_item::barrier(sycl::access::fence_space::local_space) with sycl::nd_item::barrier() if function <function name> is called in a multidimensional kernel.

Detailed Help#

If there is no overlap global device memory access among each work-item in a work-group, the __syncthreads() API calls in kernel with 1D index space of a kernel execution that can be migrated to sycl::nd_item::barrier(sycl::access::fence_space::local_space). If you call the kernel with 2D/3D index space, the global memory access from each work-item in a work-group may overlap and may result in data dependency between work-items across the barrier. If this is the case, you may need to update the sycl::nd_item::barrier() call with sycl::access::fence_space::global_and_local.

Suggestions to Fix#

For example, this original CUDA* code:

 1__global__ void kernel(float *mem) {
 2  unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
 3  ...
 4  mem[tid] = ...;
 5  __syncthreads();
 6  ...
 7  ... = mem[tid];
 8}
 9
10void foo(float *mem) {
11  kernel<<<16, 16>>>(mem); //1D index space of a SYCL kernel execution
12}

results in the following migrated SYCL* code:

 1void kernel(float *mem, const sycl::nd_item<3> &item_ct1) {
 2  unsigned int tid = item_ct1.get_local_id(2) +
 3                     item_ct1.get_local_range(2) * item_ct1.get_group(2);
 4  ...
 5  mem[tid] = ...; // global memory access without overlap among each work-item in a work-group
 6  /*
 7  DPCT1113:0: Consider replacing
 8  sycl::nd_item::barrier(sycl::access::fence_space::local_space) with
 9  sycl::nd_item::barrier() if function "kernel" is called in a multidimensional
10  kernel.
11  */
12  item_ct1.barrier(sycl::access::fence_space::local_space);
13  ...
14  ... = mem[tid]; // global memory access without overlap among each work-item in a work-group
15}
16
17void foo(float *mem) {
18  dpct::get_default_queue().parallel_for(
19      sycl::nd_range<3>(sycl::range<3>(1, 1, 16) * sycl::range<3>(1, 1, 16),
20                        sycl::range<3>(1, 1, 16)),
21      [=](sycl::nd_item<3> item_ct1) {
22        kernel(mem, item_ct1);
23      });
24}

After migration, update the migrated SYCL kernel code to a 2D kernel:

 1void kernel(float *mem, const sycl::nd_item<3> &item_ct1) {
 2  unsigned int tidx = item_ct1.get_local_id(2) +
 3                      item_ct1.get_local_range(2) * item_ct1.get_group(2);
 4  unsigned int tidy = item_ct1.get_local_id(1) +
 5                      item_ct1.get_local_range(1) * item_ct1.get_group(1);
 6
 7  ...
 8  mem[tidx] = ...; // global memory access with overlap among each work-item in a work-group
 9  mem[tidy] = ...; // global memory access with overlap among each work-item in a work-group
10  /*
11  DPCT1113:0: Consider replacing
12  sycl::nd_item::barrier(sycl::access::fence_space::local_space) with
13  sycl::nd_item::barrier() if function "kernel" is called in a multidimensional
14  kernel.
15  */
16  item_ct1.barrier(sycl::access::fence_space::local_space);
17  ...
18  ... = mem[tidx]; // global memory access with overlap among each work-item in a work-group
19  ... = mem[tidy]; // global memory access with overlap among each work-item in a work-group
20}
21
22void foo(float *mem) {
23  dpct::get_default_queue().parallel_for(
24      sycl::nd_range<3>(sycl::range<3>(1, 4, 4) * sycl::range<3>(1, 4, 4),
25                        sycl::range<3>(1, 4, 4)), /*2D  index space of a SYCL kernel execution */
26
27      [=](sycl::nd_item<3> item_ct1) {
28        kernel(mem, item_ct1);
29      });
30}

which is rewritten to:

 1void kernel(float *mem, const sycl::nd_item<3> &item_ct1) {
 2  unsigned int tidx = item_ct1.get_local_id(2) +
 3                      item_ct1.get_local_range(2) * item_ct1.get_group(2);
 4  unsigned int tidy = item_ct1.get_local_id(1) +
 5                      item_ct1.get_local_range(1) * item_ct1.get_group(1);
 6
 7  ...
 8  mem[tidx] = ...; // global memory access with overlap among each work-item in a work-group
 9  mem[tidy] = ...; // global memory access with overlap among each work-item in a work-group
10  item_ct1.barrier(sycl::access::fence_space::global_and_local);
11  ...
12  ... = mem[tidx]; // global memory access with overlap among each work-item in a work-group
13  ... = mem[tidy]; // global memory access with overlap among each work-item in a work-group
14}
15
16void foo(float *mem) {
17  dpct::get_default_queue().parallel_for(
18      sycl::nd_range<3>(sycl::range<3>(1, 4, 4) * sycl::range<3>(1, 4, 4),
19                        sycl::range<3>(1, 4, 4)),
20      [=](sycl::nd_item<3> item_ct1) {
21        kernel(mem, item_ct1);
22      });
23}