DPCT1091

Message

The function dpct::segmented_reduce only supports DPC++ native binary operation. Replace "dpct_placeholder" with a DPC++ native binary operation.

Detailed Help

dpct::segmented_reduce supports the following native binary operations:

  • sycl::plus

  • sycl::bit_or

  • sycl::bit_xor

  • sycl::bit_and

  • sycl::maximum

  • sycl::minimum

  • sycl::multiplies

Suggestions to Fix

Review and rewrite the code manually.

For example, this original CUDA* code:

struct UserMin {
  template <typename T>
  __device__ __host__ __forceinline__ T operator()(const T &a,
                                                   const T &b) const {
    return (b < a) ? b : a;
  }
};

void foo(int num_segments, int *device_offsets, int *device_in, int *device_out,
         UserMin min_op, int initial_value) {
  size_t temp_storage_size;
  void *temp_storage = nullptr;

  cub::DeviceSegmentedReduce::Reduce(temp_storage, temp_storage_size, device_in,
                                     device_out, num_segments, device_offsets,
                                     device_offsets + 1, min_op, initial_value);

  cudaMalloc(&temp_storage, temp_storage_size);

  cub::DeviceSegmentedReduce::Reduce(temp_storage, temp_storage_size, device_in,
                                     device_out, num_segments, device_offsets,
                                     device_offsets + 1, min_op, initial_value);

  cudaDeviceSynchronize();
  cudaFree(temp_storage);
}

results in the following migrated SYCL code:

struct UserMin {
  template <typename T>
  __dpct_inline__ T operator()(const T &a, const T &b) const {
    return (b < a) ? b : a;
  }
};

void foo(int num_segments, int *device_offsets, int *device_in, int *device_out,
         UserMin min_op, int initial_value) {
  dpct::device_ext &dev_ct1 = dpct::get_current_device();
  sycl::queue &q_ct1 = dev_ct1.in_order_queue();

  /*
  DPCT1026:0: The call to cub::DeviceSegmentedReduce::Reduce was removed because
  this call is redundant in SYCL.
  */

  /*
  DPCT1092:1: Consider replacing work-group size 128 with different value for
  specific hardware for better performance.
  */
  /*
  DPCT1091:2: The function dpct::segmented_reduce only supports DPC++ native
  binary operation. Replace "dpct_placeholder" with a DPC++ native binary
  operation.
  */
  dpct::device::segmented_reduce<128>(
      q_ct1, device_in, device_out, num_segments, device_offsets,
      device_offsets + 1, dpct_placeholder, initial_value);

  dev_ct1.queues_wait_and_throw();
}

which is rewritten to:

void foo(int num_segments, int *device_offsets, int *device_in, int *device_out,
         UserMin min_op, int initial_value) {
  dpct::device_ext &dev_ct1 = dpct::get_current_device();
  sycl::queue &q_ct1 = dev_ct1.in_order_queue();

  int max_work_group_size = dev_ct1.get_max_work_group_size();
  if (max_work_group_size >= 256)
    dpct::device::segmented_reduce<256>(
        q_ct1, device_in, device_out, num_segments, device_offsets,
        device_offsets + 1, sycl::minimum(), initial_value);
  else
    dpct::device::segmented_reduce<128>(
        q_ct1, device_in, device_out, num_segments, device_offsets,
        device_offsets + 1, sycl::minimum(), initial_value);

  dev_ct1.queues_wait_and_throw();
}