DPCT1087
Message
SYCL currently does not support cross group synchronization. You can specify
"--use-experimental-features=nd_range_barrier" to use the dpct helper function
nd_range_barrier to migrate <synchronization API call>.
Detailed Help
By default, the dpct helper function nd_range_barrier is not used to migrate
CUDA* grid-level synchronization. To use nd_range_barrier to migrate CUDA
grid-level synchronization, specify --use-experimental-features=nd_range_barrier
in the migration command.
Suggestions to Fix
Specify --use-experimental-features=nd_range_barrier in the migration command
to use the dpct helper function nd_range_barrier to migrate CUDA grid-level
synchronization.
For example, this original CUDA* code:
1__global__ void kernel() {
2 namespace cg = cooperative_groups;
3 cg::grid_group grid = cg::this_grid();
4 grid.sync();
5}
6
7void foo() {
8 kernel<<<1, 64>>>();
9}
results in the following migrated SYCL code:
1void kernel() {
2
3 /*
4 DPCT1087:1: SYCL currently does not support cross group synchronization. You
5 can specify "--use-experimental-features=nd_range_barrier" to use the dpct
6 helper function nd_range_barrier to migrate this_grid().
7 */
8 cg::grid_group grid = cg::this_grid();
9 /*
10 DPCT1087:0: SYCL currently does not support cross group synchronization. You
11 can specify "--use-experimental-features=nd_range_barrier" to use the dpct
12 helper function nd_range_barrier to migrate grid.sync().
13 */
14 grid.sync();
15}
16
17void foo() {
18 dpct::get_in_order_queue().parallel_for(
19 sycl::nd_range<3>(sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
20 [=](sycl::nd_item<3> item_ct1) {
21 kernel();
22 });
23}
which is rewritten to the following, using the dpct helper function nd_range_barrier:
1void kernel(const sycl::nd_item<3> &item_ct1,
2 sycl::atomic_ref<unsigned int, sycl::memory_order::seq_cst, sycl::memory_scope::device, sycl::access::address_space::global_space> &sync_ct1) {
3
4 dpct::experimental::nd_range_barrier(item_ct1, sync_ct1);
5}
6
7void foo() {
8 dpct::global_memory<unsigned int, 0> d_sync_ct1(0);
9 unsigned *sync_ct1 = d_sync_ct1.get_ptr(dpct::get_in_order_queue());
10 dpct::get_in_order_queue().memset(sync_ct1, 0, sizeof(int)).wait();
11 dpct::get_in_order_queue()
12 .parallel_for(
13 sycl::nd_range<3>(sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
14 [=](sycl::nd_item<3> item_ct1) {
15 auto atm_sync_ct1 =
16 sycl::atomic_ref<unsigned int, sycl::memory_order::seq_cst,
17 sycl::memory_scope::device,
18 sycl::access::address_space::global_space>(
19 sync_ct1[0]);
20 kernel(item_ct1, atm_sync_ct1);
21 })
22 .wait();
23}