DPCT1085

Message

The function <function name> requires sub-group size to be <size>, while other sub-group functions in the same SYCL kernel require a different sub-group size. You may need to adjust the code.

Detailed Help

Each kernel can be decorated with only one sub-group size. This warning is emitted when sub-group functions within the same kernel require different sub-group sizes. Check whether the required sub-group sizes can be unified into a single value; if they cannot, redesign the code logic.

For example, this original CUDA* code:

 1 __global__ void kernel(int* data1, int* data2) {
 2   typedef cub::WarpScan<int> WarpScan;
 3   typedef cub::WarpScan<int, 16> WarpScan16;
 4
 5   typename WarpScan::TempStorage temp1;
 6   typename WarpScan16::TempStorage temp2;
 7
 8   int input = data1[threadIdx.x];
 9   int output1 = 0;
10   int output2 = 0;
11   WarpScan(temp1).InclusiveSum(input, output1);
12   data1[threadIdx.x] = output1;
13   WarpScan16(temp2).InclusiveSum(input, output2);
14   data2[threadIdx.x] = output2;
15 }
16
17 void foo(int* data1, int* data2) {
18   kernel<<<1, 32>>>(data1, data2);
19 }

results in the following migrated SYCL* code:

 1 void kernel(int* data1, int* data2, const sycl::nd_item<3> &item_ct1) {
 2
 3   int input = data1[item_ct1.get_local_id(2)];
 4   int output1 = 0;
 5   int output2 = 0;
 6   output1 = sycl::inclusive_scan_over_group(item_ct1.get_sub_group(), input,
 7                                             sycl::plus<>());
 8   data1[item_ct1.get_local_id(2)] = output1;
 9   /*
10   DPCT1085:0: The function inclusive_scan_over_group requires sub-group size to
11   be 16, while other sub-group functions in the same SYCL kernel require a
12   different sub-group size. You may need to adjust the code.
13   */
14   output2 = sycl::inclusive_scan_over_group(item_ct1.get_sub_group(), input,
15                                             sycl::plus<>());
16   data2[item_ct1.get_local_id(2)] = output2;
17 }
18
19 void foo(int* data1, int* data2) {
20   dpct::get_in_order_queue().parallel_for(
21       sycl::nd_range<3>(sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
22       [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
23         kernel(data1, data2, item_ct1);
24       });
25 }
26
27 void foo(int* data) {
28   dpct::get_in_order_queue().parallel_for(
29       sycl::nd_range<3>(sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
30       [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
31         kernel(data, item_ct1);
32       });
33 }

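which is manually adjusted to the following code. The kernel keeps a single sub-group size of 32; the scan over 16 work-items is emulated by subtracting the scan result of work-item 15 from the results of the upper 16 work-items of each sub-group: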

 1 void kernel(int* data1, int* data2, const sycl::nd_item<3> &item_ct1) {
 2
 3   int input = data1[item_ct1.get_local_id(2)];
 4   int output1 = 0;
 5   int output2 = 0;
 6   output1 = sycl::inclusive_scan_over_group(item_ct1.get_sub_group(), input,
 7                                             sycl::plus<>());
 8   data1[item_ct1.get_local_id(2)] = output1;
 9   output2 = sycl::inclusive_scan_over_group(item_ct1.get_sub_group(), input,
10                                             sycl::plus<>());
11   data2[item_ct1.get_local_id(2)] = output2;
12   item_ct1.barrier();
13   if (item_ct1.get_local_id(2) % 32 >= 16) {
14     int warp_id = item_ct1.get_local_id(2) / 32;
15     data2[item_ct1.get_local_id(2)] -= data2[warp_id * 32 + 15];
16   }
17 }
18
19 void foo(int* data1, int* data2) {
20   dpct::get_in_order_queue().parallel_for(
21       sycl::nd_range<3>(sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
22       [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
23         kernel(data1, data2, item_ct1);
24       });
25 }

Suggestions to Fix

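The code requires a manual fix. Unify the sub-group size required by all sub-group functions in the kernel, or, if that is not possible, rewrite the kernel logic so that it produces the intended results with a single sub-group size.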