cross_engine_reorder.cpp
This C++ API example demonstrates the programming flow for reordering memory between CPU and GPU engines. Annotated version: Reorder between CPU and GPU engines.
/******************************************************************************* * Copyright 2019-2020 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ #include <iostream> #include <stdexcept> #include <vector> // [Prologue] #include "example_utils.hpp" #include "oneapi/dnnl/dnnl.hpp" #include "example_utils.hpp" using namespace dnnl; using namespace std; // [Prologue] void fill(memory &mem, const memory::dims &adims) { std::vector<float> array(product(adims)); for (size_t e = 0; e < array.size(); ++e) { array[e] = e % 7 ? 
1.0f : -1.0f; } write_to_dnnl_memory(array.data(), mem); } int find_negative(memory &mem, const memory::dims &adims) { int negs = 0; size_t nelems = product(adims); std::vector<float> array(nelems); read_from_dnnl_memory(array.data(), mem); for (size_t e = 0; e < nelems; ++e) negs += array[e] < 0.0f; return negs; } void cross_engine_reorder_tutorial() { // [Initialize engine] auto cpu_engine = engine(validate_engine_kind(engine::kind::cpu), 0); auto gpu_engine = engine(validate_engine_kind(engine::kind::gpu), 0); // [Initialize engine] // [Initialize stream] auto stream_gpu = stream(gpu_engine, stream::flags::in_order); // [Initialize stream] // [reorder cpu2gpu] const auto tz = memory::dims {2, 16, 1, 1}; auto m_cpu = memory({{tz}, memory::data_type::f32, memory::format_tag::nchw}, cpu_engine); auto m_gpu = memory({{tz}, memory::data_type::f32, memory::format_tag::nchw}, gpu_engine); fill(m_cpu, tz); auto r1 = reorder(m_cpu, m_gpu); // [reorder cpu2gpu] // [Create a ReLU primitive] // ReLU op descriptor (uses a GPU memory as source memory. // no engine- or implementation-specific information) auto relu_d = eltwise_forward::desc(prop_kind::forward, algorithm::eltwise_relu, m_gpu.get_desc(), 0.0f); // ReLU primitive descriptor, which corresponds to a particular // implementation in the library. Specify engine type for the ReLU // primitive. Use a GPU engine here. 
auto relu_pd = eltwise_forward::primitive_desc(relu_d, gpu_engine); // ReLU primitive auto relu = eltwise_forward(relu_pd); // [Create a ReLU primitive] // [reorder gpu2cpu] auto r2 = reorder(m_gpu, m_cpu); // [reorder gpu2cpu] // [Execute primitives] // wrap source data from CPU to GPU r1.execute(stream_gpu, m_cpu, m_gpu); // Execute ReLU on a GPU stream relu.execute(stream_gpu, {{DNNL_ARG_SRC, m_gpu}, {DNNL_ARG_DST, m_gpu}}); // Get result data from GPU to CPU r2.execute(stream_gpu, m_gpu, m_cpu); stream_gpu.wait(); // [Execute primitives] // [Check the results] if (find_negative(m_cpu, tz) != 0) throw std::logic_error( "Unexpected output, find a negative value after the ReLU " "execution."); // [Check the results] } int main(int argc, char **argv) { return handle_example_errors({engine::kind::cpu, engine::kind::gpu}, cross_engine_reorder_tutorial); }