DPCT1087

Message

SYCL currently does not support cross group synchronization. You can specify --use-experimental-features=nd_range_barrier to use the dpct helper function nd_range_barrier to migrate <synchronization API call>.

Detailed Help

By default, the dpct helper function nd_range_barrier is not used to migrate CUDA* grid-level synchronization. To have the tool use nd_range_barrier for this migration, specify --use-experimental-features=nd_range_barrier in the migration command.

Suggestions to Fix

Specify --use-experimental-features=nd_range_barrier in the migration command to use the dpct helper function nd_range_barrier to migrate CUDA grid-level synchronization.

For example, this original CUDA* code:

1__global__ void kernel() {
2  namespace cg = cooperative_groups;
3  cg::grid_group grid = cg::this_grid();
4  grid.sync();
5}
6
7void foo() {
8  kernel<<<1, 64>>>();
9}

results in the following migrated SYCL code:

 1void kernel() {
 2
 3  /*
 4  DPCT1087:1: SYCL currently does not support cross group synchronization. You
 5  can specify "--use-experimental-features=nd_range_barrier" to use the dpct
 6  helper function nd_range_barrier to migrate this_grid().
 7  */
 8  cg::grid_group grid = cg::this_grid();
 9  /*
10  DPCT1087:0: SYCL currently does not support cross group synchronization. You
11  can specify "--use-experimental-features=nd_range_barrier" to use the dpct
12  helper function nd_range_barrier to migrate grid.sync().
13  */
14  grid.sync();
15}
16
17void foo() {
18  dpct::get_in_order_queue().parallel_for(
19      sycl::nd_range<3>(sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
20      [=](sycl::nd_item<3> item_ct1) {
21        kernel();
22      });
23}

which, when migrated with --use-experimental-features=nd_range_barrier specified, is rewritten to:

 1void kernel(const sycl::nd_item<3> &item_ct1,
 2            sycl::atomic_ref<unsigned int, sycl::memory_order::seq_cst, sycl::memory_scope::device, sycl::access::address_space::global_space> &sync_ct1) {
 3
 4  dpct::experimental::nd_range_barrier(item_ct1, sync_ct1);
 5}
 6
 7void foo() {
 8  dpct::global_memory<unsigned int, 0> d_sync_ct1(0);
 9  unsigned *sync_ct1 = d_sync_ct1.get_ptr(dpct::get_in_order_queue());
10  dpct::get_in_order_queue().memset(sync_ct1, 0, sizeof(int)).wait();
11  dpct::get_in_order_queue()
12      .parallel_for(
13          sycl::nd_range<3>(sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
14          [=](sycl::nd_item<3> item_ct1) {
15            auto atm_sync_ct1 =
16                sycl::atomic_ref<unsigned int, sycl::memory_order::seq_cst,
17                                 sycl::memory_scope::device,
18                                 sycl::access::address_space::global_space>(
19                    sync_ct1[0]);
20            kernel(item_ct1, atm_sync_ct1);
21          })
22      .wait();
23}