DPCT1115

Message

The sycl::ext::oneapi::group_local_memory_for_overwrite is used to allocate group-local memory at the none kernel functor scope of a work-group data parallel kernel. You may need to adjust the code.

Detailed Help

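The sycl::ext::oneapi::group_local_memory_for_overwrite function can only be used to allocate group-local memory at the kernel functor scope of a work-group data parallel kernel. This warning is emitted when the migrated code calls it outside that scope, for example in a device function that is called from the kernel. The restriction that group-local variables must be defined at kernel functor scope may be lifted in a future version of this extension.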

Refer to sycl_ext_oneapi_local_memory.asciidoc for more details.

Suggestions to Fix

For example, this original CUDA* code:

 1template <int S> __device__ void devfun() {
 2  __shared__ int slm1[32 * S];
 3  ...
 4}
 5
 6template <int S> __global__ void kernel() {
 7  __shared__ int slm2[S];
 8  devfun<S>();
 9}
10
11void hostfun() { kernel<256><<<1, 1>>>(); }

results in the following migrated SYCL* code:

 1template <int S> inline void devfun(const sycl::nd_item<3> &item_ct1) {
 2  /*
 3  DPCT1115:0: The sycl::ext::oneapi::group_local_memory_for_overwrite is used to allocate
 4  group-local memory at the none kernel functor scope of a work-group data
 5  parallel kernel. You may need to adjust the code.
 6  */
 7  auto &slm1 =
 8      *sycl::ext::oneapi::group_local_memory_for_overwrite<int[32 * S]>(item_ct1.get_group());
 9  ...
10}
11
12template <int S> __dpct_inline__ void kernel(const sycl::nd_item<3> &item_ct1) {
13  auto &slm2 =
14      *sycl::ext::oneapi::group_local_memory_for_overwrite<int[S]>(item_ct1.get_group());
15  devfun<S>(item_ct1);
16}
17
18void hostfun() { dpct::get_default_queue().parallel_for(
19  sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
20  [=](sycl::nd_item<3> item_ct1) {
21    kernel<256>(item_ct1);
22  });
23}

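which can be rewritten to allocate the local memory with sycl::local_accessor in the command group and pass pointers to it into the kernel and device function: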

 1template <int S> inline void devfun(int *slm1) {
 2  ...
 3}
 4
 5template <int S> __dpct_inline__ void kernel(int *slm1, int *slm2) {
 6
 7  devfun<S>(slm1);
 8}
 9
10void hostfun() { dpct::get_default_queue().submit(
11  [&](sycl::handler &cgh) {
12    sycl::local_accessor<int, 1> slm1_acc_ct1(sycl::range<1>(32 * 256), cgh);
13    sycl::local_accessor<int, 1> slm2_acc_ct1(sycl::range<1>(256), cgh);
14
15    cgh.parallel_for(
16      sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
17      [=](sycl::nd_item<3> item_ct1) {
18        kernel<256>(slm1_acc_ct1.get_pointer(), slm2_acc_ct1.get_pointer());
19      });
20  });