DPCT1113#
Message#
Consider replacing sycl::nd_item::barrier(sycl::access::fence_space::local_space)
with sycl::nd_item::barrier() if function <function name> is called in a
multidimensional kernel.
Detailed Help#
If there is no overlapping global device memory access among the work-items in a
work-group, __syncthreads() API calls in a kernel launched with a 1D index space
can be migrated to sycl::nd_item::barrier(sycl::access::fence_space::local_space).

If the kernel is launched with a 2D or 3D index space, the global memory accesses
from the work-items in a work-group may overlap and may create a data dependency
between work-items across the barrier. If this is the case, you may need to update
the sycl::nd_item::barrier() call to use
sycl::access::fence_space::global_and_local.
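
For reference, here is a minimal sketch of the barrier variants involved. It is illustrative only, not tool output, and the helper function name is an assumption for the example:

#include <sycl/sycl.hpp>

// Sketch of the barrier forms referenced by this diagnostic.
void barrier_variants(const sycl::nd_item<3> &item) {
  // Fences work-group local memory only; sufficient when work-items in a
  // work-group do not share global memory locations across the barrier
  // (the 1D case described above).
  item.barrier(sycl::access::fence_space::local_space);

  // Fences both global and local memory; needed when global memory accesses
  // from work-items in the same work-group may overlap across the barrier
  // (the 2D/3D case described above).
  item.barrier(sycl::access::fence_space::global_and_local);

  // Equivalent to the previous call: global_and_local is the default
  // fence space of sycl::nd_item::barrier().
  item.barrier();
}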
Suggestions to Fix#
For example, this original CUDA* code:
__global__ void kernel(float *mem) {
  unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
  ...
  mem[tid] = ...;
  __syncthreads();
  ...
  ... = mem[tid];
}

void foo(float *mem) {
  kernel<<<16, 16>>>(mem); // 1D index space of a SYCL kernel execution
}
results in the following migrated SYCL* code:
void kernel(float *mem, const sycl::nd_item<3> &item_ct1) {
  unsigned int tid = item_ct1.get_local_id(2) +
                     item_ct1.get_local_range(2) * item_ct1.get_group(2);
  ...
  mem[tid] = ...; // global memory access without overlap among each work-item in a work-group
  /*
  DPCT1113:0: Consider replacing
  sycl::nd_item::barrier(sycl::access::fence_space::local_space) with
  sycl::nd_item::barrier() if function "kernel" is called in a multidimensional
  kernel.
  */
  item_ct1.barrier(sycl::access::fence_space::local_space);
  ...
  ... = mem[tid]; // global memory access without overlap among each work-item in a work-group
}

void foo(float *mem) {
  dpct::get_default_queue().parallel_for(
      sycl::nd_range<3>(sycl::range<3>(1, 1, 16) * sycl::range<3>(1, 1, 16),
                        sycl::range<3>(1, 1, 16)),
      [=](sycl::nd_item<3> item_ct1) {
        kernel(mem, item_ct1);
      });
}
If, after migration, you update the migrated SYCL kernel code to use a 2D index space:
void kernel(float *mem, const sycl::nd_item<3> &item_ct1) {
  unsigned int tidx = item_ct1.get_local_id(2) +
                      item_ct1.get_local_range(2) * item_ct1.get_group(2);
  unsigned int tidy = item_ct1.get_local_id(1) +
                      item_ct1.get_local_range(1) * item_ct1.get_group(1);

  ...
  mem[tidx] = ...; // global memory access with overlap among each work-item in a work-group
  mem[tidy] = ...; // global memory access with overlap among each work-item in a work-group
  /*
  DPCT1113:0: Consider replacing
  sycl::nd_item::barrier(sycl::access::fence_space::local_space) with
  sycl::nd_item::barrier() if function "kernel" is called in a multidimensional
  kernel.
  */
  item_ct1.barrier(sycl::access::fence_space::local_space);
  ...
  ... = mem[tidx]; // global memory access with overlap among each work-item in a work-group
  ... = mem[tidy]; // global memory access with overlap among each work-item in a work-group
}

void foo(float *mem) {
  dpct::get_default_queue().parallel_for(
      sycl::nd_range<3>(sycl::range<3>(1, 4, 4) * sycl::range<3>(1, 4, 4),
                        sycl::range<3>(1, 4, 4)), /* 2D index space of a SYCL kernel execution */
      [=](sycl::nd_item<3> item_ct1) {
        kernel(mem, item_ct1);
      });
}
then the barrier call should be rewritten to use the global_and_local fence space:
void kernel(float *mem, const sycl::nd_item<3> &item_ct1) {
  unsigned int tidx = item_ct1.get_local_id(2) +
                      item_ct1.get_local_range(2) * item_ct1.get_group(2);
  unsigned int tidy = item_ct1.get_local_id(1) +
                      item_ct1.get_local_range(1) * item_ct1.get_group(1);

  ...
  mem[tidx] = ...; // global memory access with overlap among each work-item in a work-group
  mem[tidy] = ...; // global memory access with overlap among each work-item in a work-group
  item_ct1.barrier(sycl::access::fence_space::global_and_local);
  ...
  ... = mem[tidx]; // global memory access with overlap among each work-item in a work-group
  ... = mem[tidy]; // global memory access with overlap among each work-item in a work-group
}

void foo(float *mem) {
  dpct::get_default_queue().parallel_for(
      sycl::nd_range<3>(sycl::range<3>(1, 4, 4) * sycl::range<3>(1, 4, 4),
                        sycl::range<3>(1, 4, 4)),
      [=](sycl::nd_item<3> item_ct1) {
        kernel(mem, item_ct1);
      });
}
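
Because the default fence space of sycl::nd_item::barrier() is
sycl::access::fence_space::global_and_local, the updated barrier can equivalently
be written the way the diagnostic message suggests:

  item_ct1.barrier(); // default fence space is global_and_local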