Kernel Programming Model

The device code can specify the amount of parallelism to request through one of several mechanisms:

  • single_task – execute a single instance of the kernel with a single work item.

  • parallel_for – execute a kernel in parallel across a range of processing elements. Typically, this version of parallel_for is employed on “embarrassingly parallel” workloads.

  • parallel_for_work_group – execute a kernel in parallel across a hierarchical range of processing elements using local memory and barriers. (A sketch of this form appears after the code sample below.)

The following code sample shows two combinations of invocation mechanism and kernel form:

  1. single_task and C++ lambda (lines 32-34)

  2. parallel_for and functor (lines 8-16 and line 46)

 1  #include <array>
 2  #include <CL/sycl.hpp>
 3
 4  const int SIZE = 1024;
 5
 6  using namespace sycl;
 7
 8  class Vassign {
 9    accessor<int, 1, access::mode::read_write,
10             access::target::global_buffer> access;
11
12  public:
13    Vassign(accessor<int, 1, access::mode::read_write,
14            access::target::global_buffer> &access_) : access(access_) {}
15    void operator()(id<1> id) const { access[id] = 1; }
16  };
17
18  int main() {
19    std::array<int, SIZE> a;
20
21    for (int i = 0; i < SIZE; ++i) {
22      a[i] = i;
23    }
24
25    {
26      range<1> a_size{SIZE};
27      buffer<int> a_device(a.data(), a_size);
28      queue q;
29
30      q.submit([&](handler &h) {
31        auto a_in = a_device.get_access<access::mode::write>(h);
32        h.single_task([=]() {
33          a_in[0] = 2;
34        });
35      });
36    }
37
38    {
39      range<1> a_size{SIZE};
40      buffer<int> a_device(a.data(), a_size);
41      queue q;
42      q.submit([&](handler &h) {
43        auto a_in = a_device.get_access<access::mode::read_write,
44                                        access::target::global_buffer>(h);
45        Vassign F(a_in);
46        h.parallel_for(range<1>(SIZE), F);
47      });
48    }
49  }
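The sample above exercises single_task and parallel_for but not parallel_for_work_group. The following is a minimal sketch of the hierarchical form written against the same style of API; it is an illustration rather than part of the sample above, and the work-group split (8 groups of 128 work-items), the names GROUPS, GROUP_SIZE, scratch, and data_device, and the doubling computation are assumptions chosen for the example.

  #include <array>
  #include <CL/sycl.hpp>

  using namespace sycl;

  // Illustrative sizes (assumption): 8 work-groups of 128 work-items each.
  constexpr size_t GROUPS = 8;
  constexpr size_t GROUP_SIZE = 128;

  int main() {
    std::array<int, GROUPS * GROUP_SIZE> data;
    for (size_t i = 0; i < data.size(); ++i)
      data[i] = static_cast<int>(i);

    {
      buffer<int> data_device(data.data(), range<1>(data.size()));
      queue q;

      q.submit([&](handler &h) {
        auto acc = data_device.get_access<access::mode::read_write>(h);

        // One outer invocation runs per work-group; each group contains
        // GROUP_SIZE work-items.
        h.parallel_for_work_group(range<1>(GROUPS), range<1>(GROUP_SIZE),
                                  [=](group<1> g) {
          // Variables declared at work-group scope are allocated once per
          // work-group and are intended to live in local memory.
          int scratch[GROUP_SIZE];

          // Phase 1: each work-item stages one element.
          g.parallel_for_work_item([&](h_item<1> item) {
            scratch[item.get_local_id(0)] = acc[item.get_global_id(0)];
          });
          // An implicit work-group barrier separates consecutive
          // parallel_for_work_item calls.

          // Phase 2: each work-item writes back a doubled value.
          g.parallel_for_work_item([&](h_item<1> item) {
            acc[item.get_global_id(0)] = 2 * scratch[item.get_local_id(0)];
          });
        });
      });
    }  // Buffer destruction copies the results back into data.
  }

The work-group-scope scratch array and the two parallel_for_work_item phases are what distinguish this form from the flat parallel_for in the sample: the implicit barrier between the phases is the synchronization point, and it applies to every work-item in the group.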