DPCT1091#
Message#
The function dpct::segmented_reduce
only supports DPC++ native binary operation. Replace “dpct_placeholder” with a DPC++ native binary operation.
Detailed Help#
dpct::segmented_reduce
supports the following native binary operations:
sycl::plus
sycl::bit_or
sycl::bit_xor
sycl::bit_and
sycl::maximum
sycl::minimum
sycl::multiplies
Suggestions to Fix#
Review and rewrite the code manually.
For example, this original CUDA* code:
1struct UserMin {
2 template <typename T>
3 __device__ __host__ __forceinline__ T operator()(const T &a,
4 const T &b) const {
5 return (b < a) ? b : a;
6 }
7};
8
9void foo(int num_segments, int *device_offsets, int *device_in, int *device_out,
10 UserMin min_op, int initial_value) {
11 size_t temp_storage_size;
12 void *temp_storage = nullptr;
13
14 cub::DeviceSegmentedReduce::Reduce(temp_storage, temp_storage_size, device_in,
15 device_out, num_segments, device_offsets,
16 device_offsets + 1, min_op, initial_value);
17
18 cudaMalloc(&temp_storage, temp_storage_size);
19
20 cub::DeviceSegmentedReduce::Reduce(temp_storage, temp_storage_size, device_in,
21 device_out, num_segments, device_offsets,
22 device_offsets + 1, min_op, initial_value);
23
24 cudaDeviceSynchronize();
25 cudaFree(temp_storage);
26}
results in the following migrated SYCL code:
1struct UserMin {
2 template <typename T>
3 __dpct_inline__ T operator()(const T &a, const T &b) const {
4 return (b < a) ? b : a;
5 }
6};
7
8void foo(int num_segments, int *device_offsets, int *device_in, int *device_out,
9 UserMin min_op, int initial_value) {
10 dpct::device_ext &dev_ct1 = dpct::get_current_device();
11 sycl::queue &q_ct1 = dev_ct1.in_order_queue();
12
13 /*
14 DPCT1026:0: The call to cub::DeviceSegmentedReduce::Reduce was removed because
15 this call is redundant in SYCL.
16 */
17
18 /*
19 DPCT1092:1: Consider replacing work-group size 128 with different value for
20 specific hardware for better performance.
21 */
22 /*
23 DPCT1091:2: The function dpct::segmented_reduce only supports DPC++ native
24 binary operation. Replace "dpct_placeholder" with a DPC++ native binary
25 operation.
26 */
27 dpct::device::segmented_reduce<128>(
28 q_ct1, device_in, device_out, num_segments, device_offsets,
29 device_offsets + 1, dpct_placeholder, initial_value);
30
31 dev_ct1.queues_wait_and_throw();
32}
which is rewritten to:
1void foo(int num_segments, int *device_offsets, int *device_in, int *device_out,
2 UserMin min_op, int initial_value) {
3 dpct::device_ext &dev_ct1 = dpct::get_current_device();
4 sycl::queue &q_ct1 = dev_ct1.in_order_queue();
5
6 int max_work_group_size = dev_ct1.get_max_work_group_size();
7 if (max_work_group_size >= 256)
8 dpct::device::segmented_reduce<256>(
9 q_ct1, device_in, device_out, num_segments, device_offsets,
10 device_offsets + 1, sycl::minimum(), initial_value);
11 else
12 dpct::device::segmented_reduce<128>(
13 q_ct1, device_in, device_out, num_segments, device_offsets,
14 device_offsets + 1, sycl::minimum(), initial_value);
15
16 dev_ct1.queues_wait_and_throw();
17}