DPCT1020#
Message#
Migration of <api name>, if it is called from __global__
or __device__
function, is not supported. You may need to redesign the code to use the host-side
<api name> instead, which submits this call to the SYCL queue automatically.
Detailed Help#
The warning message is generated in cases where <api name> itself submits a SYCL* kernel to the command queue, while the caller of <api name> is itself a SYCL kernel that has been submitted to the command queue. This results in a device-side enqueue of the kernel, which is not supported by SYCL 2020.
Suggestions to Fix#
Redesign the code to use the host-side API, which submits this call to the SYCL queue automatically.
For example, this original CUDA* code:
1__global__ void kernel(float *d_data) {
2 int tid = threadIdx.x;
3 d_data[tid + 1] = tid;
4
5 __syncthreads();
6
7 if (tid == 0) {
8 cublasHandle_t handle;
9 cublasCreate(&handle);
10 cublasSasum(handle, 128, d_data + 1, 1, d_data);
11 cublasDestroy(handle);
12 }
13}
14
15void foo() {
16 float *d_data;
17 cudaMalloc((void **)&d_data, sizeof(float) * (1 + 128));
18 kernel<<<1, 128>>>(d_data);
19
20 float data;
21 cudaMemcpy(&data, d_data, sizeof(float), cudaMemcpyDeviceToHost);
22 cudaFree(d_data);
23}
results in the following migrated SYCL code:
1void kernel(float *d_data, sycl::nd_item<3> item_ct1) {
2 int tid = item_ct1.get_local_id(2);
3 d_data[tid + 1] = tid;
4
5 item_ct1.barrier();
6
7 if (tid == 0) {
8 /*
9 DPCT1021:2: Migration of cublasHandle_t in __global__ or __device__ function
10 is not supported. You may need to redesign the code.
11 */
12 cublasHandle_t handle;
13 handle = &dpct::get_default_queue();
14 /*
15 DPCT1020:1: Migration of cublasSasum, if it is called from __global__ or
16 __device__ function, is not supported. You may need to redesign the code to
17 use the host-side oneapi::mkl::blas::column_major::asum instead, which submits
18 this call to the SYCL queue automatically.
19 */
20 cublasSasum(handle, 128, d_data + 1, 1, d_data);
21 handle = nullptr;
22 }
23}
24
25void foo() {
26 dpct::device_ext &dev_ct1 = dpct::get_current_device();
27 sycl::queue &q_ct1 = dev_ct1.default_queue();
28 float *d_data;
29 d_data = sycl::malloc_device<float>((1 + 128), q_ct1);
30 q_ct1.parallel_for(
31 sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
32 [=](sycl::nd_item<3> item_ct1) {
33 kernel(d_data, item_ct1);
34 });
35
36 float data;
37 q_ct1.memcpy(&data, d_data, sizeof(float)).wait();
38 sycl::free(d_data, q_ct1);
39}
which is rewritten to:
1void kernel(float *d_data, sycl::nd_item<3> item_ct1) {
2 int tid = item_ct1.get_local_id(2);
3 d_data[tid + 1] = tid;
4}
5
6void foo() {
7 dpct::device_ext &dev_ct1 = dpct::get_current_device();
8 sycl::queue &q_ct1 = dev_ct1.default_queue();
9 float *d_data;
10 d_data = sycl::malloc_device<float>((1 + 128), q_ct1);
11 q_ct1.parallel_for(
12 sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
13 [=](sycl::nd_item<3> item_ct1) {
14 kernel(d_data, item_ct1);
15 });
16 oneapi::mkl::blas::column_major::asum(q_ct1, 128, d_data + 1, 1, d_data);
17
18 float data;
19 q_ct1.memcpy(&data, d_data, sizeof(float)).wait();
20 sycl::free(d_data, q_ct1);
21}