DPCT1020#

Message#

Migration of <api name>, if it is called from __global__ or __device__ function, is not supported. You may need to redesign the code to use the host-side <api name> instead, which submits this call to the SYCL queue automatically.

Detailed Help#

The warning message is generated in cases where <api name> itself submits a SYCL* kernel to the command queue, while the caller of <api name> is itself a SYCL kernel that has been submitted to the command queue. This results in device-side enqueue of a kernel, which is not supported by SYCL 2020.

Suggestions to Fix#

Redesign the code to use the host-side API, which submits this call to the SYCL queue automatically.

For example, this original CUDA* code:

 1__global__ void kernel(float *d_data) {
 2  int tid = threadIdx.x;
 3  d_data[tid + 1] = tid;
 4
 5  __syncthreads();
 6
 7  if (tid == 0) {
 8    cublasHandle_t handle;
 9    cublasCreate(&handle);
 10    cublasSasum(handle, 128, d_data + 1, 1, d_data);
11    cublasDestroy(handle);
12  }
13}
14
15void foo() {
16  float *d_data;
17  cudaMalloc((void **)&d_data, sizeof(float) * (1 + 128));
18  kernel<<<1, 128>>>(d_data);
19
20  float data;
 21  cudaMemcpy(&data, d_data, sizeof(float), cudaMemcpyDeviceToHost);
22  cudaFree(d_data);
23}

results in the following migrated SYCL code:

 1void kernel(float *d_data, sycl::nd_item<3> item_ct1) {
 2  int tid = item_ct1.get_local_id(2);
 3  d_data[tid + 1] = tid;
 4
 5  item_ct1.barrier();
 6
 7  if (tid == 0) {
 8    /*
 9    DPCT1021:2: Migration of cublasHandle_t in __global__ or __device__ function
10    is not supported. You may need to redesign the code.
11    */
12    cublasHandle_t handle;
13    handle = &dpct::get_default_queue();
14    /*
15    DPCT1020:1: Migration of cublasSasum, if it is called from __global__ or
16    __device__ function, is not supported. You may need to redesign the code to
17    use the host-side oneapi::mkl::blas::column_major::asum instead, which submits
18    this call to the SYCL queue automatically.
19    */
20    cublasSasum(handle, 128, d_data + 1, 1, d_data);
21    handle = nullptr;
22  }
23}
24
25void foo() {
26  dpct::device_ext &dev_ct1 = dpct::get_current_device();
27  sycl::queue &q_ct1 = dev_ct1.default_queue();
28  float *d_data;
29  d_data = sycl::malloc_device<float>((1 + 128), q_ct1);
30  q_ct1.parallel_for(
31      sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
32      [=](sycl::nd_item<3> item_ct1) {
33        kernel(d_data, item_ct1);
34      });
35
36  float data;
37  q_ct1.memcpy(&data, d_data, sizeof(float)).wait();
38  sycl::free(d_data, q_ct1);
39}

which is rewritten to:

 1void kernel(float *d_data, sycl::nd_item<3> item_ct1) {
 2  int tid = item_ct1.get_local_id(2);
 3  d_data[tid + 1] = tid;
 4}
 5
 6void foo() {
 7  dpct::device_ext &dev_ct1 = dpct::get_current_device();
 8  sycl::queue &q_ct1 = dev_ct1.default_queue();
 9  float *d_data;
10  d_data = sycl::malloc_device<float>((1 + 128), q_ct1);
11  q_ct1.parallel_for(
12      sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
13      [=](sycl::nd_item<3> item_ct1) {
14        kernel(d_data, item_ct1);
15      });
16  oneapi::mkl::blas::column_major::asum(q_ct1, 128, d_data + 1, 1, d_data);
17
18  float data;
19  q_ct1.memcpy(&data, d_data, sizeof(float)).wait();
20  sycl::free(d_data, q_ct1);
21}