.. index:: pair: example; performance_profiling.cpp .. _doxid-performance_profiling_8cpp-example: performance_profiling.cpp ========================= This example demonstrates the best practices for application performance optimizations with oneDNN. Annotated version: :ref:`Performance Profiling Example ` This example demonstrates the best practices for application performance optimizations with oneDNN. Annotated version: :ref:`Performance Profiling Example ` .. ref-code-block:: cpp /******************************************************************************* * Copyright 2019-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*******************************************************************************/ #include <iostream> #include <stdexcept> #include <vector> #include "oneapi/dnnl/dnnl.hpp" #include "example_utils.hpp" using namespace :ref:`dnnl `; // [Prologue] // Set Strides and Padding const :ref:`memory::dims ` strides = {4, 4}; const :ref:`memory::dims ` padding = {0, 0}; // [Prologue] // // function to init data void init_data(:ref:`memory ` &m, float v) { size_t size = m.:ref:`get_desc `().:ref:`get_size `() / sizeof(float); std::vector<float> data(size, v); write_to_dnnl_memory(data.data(), m); } // function to execute non-fused relu void create_and_execute_relu(:ref:`memory ` &data, :ref:`engine ` &eng, :ref:`stream ` &s) { // relu operates on whatever data format is given to it // create a primitive auto relu_pd = :ref:`eltwise_forward::primitive_desc `(eng, :ref:`prop_kind::forward_inference `, :ref:`algorithm::eltwise_relu `, data.:ref:`get_desc `(), data.:ref:`get_desc `(), 0.f, 0.f); auto relu = :ref:`eltwise_forward `(relu_pd); // execute it (in-place) relu.execute(s, {{:ref:`DNNL_ARG_SRC `, data}, {:ref:`DNNL_ARG_DST `, data}}); } // [Create post_op attr with relu] // function to create post-op attribute for fused relu :ref:`primitive_attr ` create_attr_with_relu_post_op() { // create a post-op with relu :ref:`post_ops ` ops; ops.:ref:`append_eltwise `(:ref:`algorithm::eltwise_relu `, 0.f, 0.f); // create an attribute and set the corresponding post op :ref:`primitive_attr ` attr; attr.:ref:`set_post_ops `(ops); return attr; } // [Create post_op attr with relu] // Implementation for naive convolution on nchw (data) and oihw (weights), // followed by execution of non-fused relu void conv_relu_naive(const :ref:`memory ` &user_src, const :ref:`memory ` &user_wei, :ref:`memory ` user_dst, :ref:`engine ` &eng, :ref:`stream ` &s) { // [Create mem_desc] // copy the dimensions and format from user's memory auto conv_src_md = :ref:`memory::desc `(user_src.:ref:`get_desc `()); auto conv_wei_md = :ref:`memory::desc 
`(user_wei.:ref:`get_desc `()); auto conv_dst_md = :ref:`memory::desc `(user_dst.:ref:`get_desc `()); // [Create mem_desc] // [Create conv_prim_desc] // create a convolution primitive descriptor auto conv_pd = :ref:`convolution_forward::primitive_desc `(eng, :ref:`prop_kind::forward_inference `, :ref:`algorithm::convolution_direct `, conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding); // [Create conv_prim_desc] // [Create conv_primitive] // create convolution primitive auto conv = :ref:`convolution_forward `(conv_pd); // [Create conv_primitive] // [Add to stream] // execute convolution by adding it to the stream s conv.execute(s, {{:ref:`DNNL_ARG_SRC `, user_src}, {:ref:`DNNL_ARG_WEIGHTS `, user_wei}, {:ref:`DNNL_ARG_DST `, user_dst}}); // [Add to stream] // [Create and execute relu] // execute relu (on convolution's destination format, whatever it is) create_and_execute_relu(user_dst, eng, s); s.:ref:`wait `(); // [Create and execute relu] } // Implementation for convolution on blocked format for data and // weights, followed by execution of non-fused relu void conv_relu_blocked(:ref:`memory ` user_src, :ref:`memory ` user_wei, :ref:`memory ` user_dst, :ref:`engine ` &eng, :ref:`stream ` &s) { // [Create mem_desc with tag=any] // copy the dimensions and data type from user's memory and set format tag // to "any" to allow convolution to pick the best implementation auto conv_src_md = :ref:`memory::desc `(user_src.:ref:`get_desc `().:ref:`get_dims `(), user_src.:ref:`get_desc `().:ref:`get_data_type `(), :ref:`memory::format_tag::any `); auto conv_wei_md = :ref:`memory::desc `(user_wei.:ref:`get_desc `().:ref:`get_dims `(), user_wei.:ref:`get_desc `().:ref:`get_data_type `(), :ref:`memory::format_tag::any `); auto conv_dst_md = :ref:`memory::desc `(user_dst.:ref:`get_desc `().:ref:`get_dims `(), user_dst.:ref:`get_desc `().:ref:`get_data_type `(), :ref:`memory::format_tag::any `); // [Create mem_desc with tag=any] // [Create conv_prim_desc 
implementation2] // create a convolution primitive descriptor and primitive auto conv_pd = :ref:`convolution_forward::primitive_desc `(eng, :ref:`prop_kind::forward_inference `, :ref:`algorithm::convolution_direct `, conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding); // [Create conv_prim_desc implementation2] // [Conditionally create and execute reorder prims] // prepare convolution source :ref:`memory ` conv_src = user_src; if (conv_pd.src_desc() != user_src.:ref:`get_desc `()) { conv_src = :ref:`memory `(conv_pd.src_desc(), eng); auto r_pd = :ref:`reorder::primitive_desc `(user_src, conv_src); :ref:`reorder `(r_pd).:ref:`execute `(s, user_src, conv_src); } // prepare convolution weights :ref:`memory ` conv_wei = user_wei; if (conv_pd.weights_desc() != user_wei.:ref:`get_desc `()) { conv_wei = :ref:`memory `(conv_pd.weights_desc(), eng); auto r_pd = :ref:`reorder::primitive_desc `(user_wei, conv_wei); :ref:`reorder `(r_pd).:ref:`execute `(s, user_wei, conv_wei); } // prepare convolution destination :ref:`memory ` conv_dst = user_dst; if (conv_pd.dst_desc() != user_dst.:ref:`get_desc `()) conv_dst = :ref:`memory `(conv_pd.dst_desc(), eng); // [Conditionally create and execute reorder prims] // [Create conv_primitive implementation2] // create convolution primitive auto conv = :ref:`convolution_forward `(conv_pd); // [Create conv_primitive implementation2] // [Add to stream implementation2] // execute convolution by adding it to the stream s conv.execute(s, {{:ref:`DNNL_ARG_SRC `, conv_src}, {:ref:`DNNL_ARG_WEIGHTS `, conv_wei}, {:ref:`DNNL_ARG_DST `, conv_dst}}); // [Add to stream implementation2] // [Create and execute relu implementation2] // execute relu (on convolution's destination format, whatever it is) create_and_execute_relu(conv_dst, eng, s); // [Create and execute relu implementation2] if (conv_pd.dst_desc() != user_dst.:ref:`get_desc `()) { auto r_pd = :ref:`reorder::primitive_desc `(conv_dst, user_dst); :ref:`reorder 
`(r_pd).:ref:`execute `(s, conv_dst, user_dst); } s.:ref:`wait `(); // data is now back in the user's format (reordered above if needed). } // Implementation for convolution on blocked format for data and // weights and the relu operation fused via a post-op attribute added to the // convolution prim_descriptor void conv_relu_fused(:ref:`memory ` user_src, :ref:`memory ` user_wei, :ref:`memory ` user_dst, const :ref:`engine ` &eng, :ref:`stream ` &s) { // copy the dimensions and data type from user's memory and set format tag // to "any" to allow convolution to pick the best implementation auto conv_src_md = :ref:`memory::desc `(user_src.:ref:`get_desc `().:ref:`get_dims `(), user_src.:ref:`get_desc `().:ref:`get_data_type `(), :ref:`memory::format_tag::any `); auto conv_wei_md = :ref:`memory::desc `(user_wei.:ref:`get_desc `().:ref:`get_dims `(), user_wei.:ref:`get_desc `().:ref:`get_data_type `(), :ref:`memory::format_tag::any `); auto conv_dst_md = :ref:`memory::desc `(user_dst.:ref:`get_desc `().:ref:`get_dims `(), user_dst.:ref:`get_desc `().:ref:`get_data_type `(), :ref:`memory::format_tag::any `); // Next the convolution prim descriptor is created, which inherits the ReLU post-op from the attribute // [Create prim_desc with attr] // create an attribute for fused relu auto attr = create_attr_with_relu_post_op(); // create a convolution primitive descriptor auto conv_pd = :ref:`convolution_forward::primitive_desc `(eng, :ref:`prop_kind::forward_inference `, :ref:`algorithm::convolution_direct `, conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding, attr); // [Create prim_desc with attr] // prepare convolution source :ref:`memory ` conv_src = user_src; if (conv_pd.src_desc() != user_src.:ref:`get_desc `()) { conv_src = :ref:`memory `(conv_pd.src_desc(), eng); auto r_pd = :ref:`reorder::primitive_desc `(user_src, conv_src); :ref:`reorder `(r_pd).:ref:`execute `(s, user_src, conv_src); } // prepare convolution weights :ref:`memory ` conv_wei = user_wei; if (conv_pd.weights_desc() != 
user_wei.:ref:`get_desc `()) { conv_wei = :ref:`memory `(conv_pd.weights_desc(), eng); auto r_pd = :ref:`reorder::primitive_desc `(user_wei, conv_wei); :ref:`reorder `(r_pd).:ref:`execute `(s, user_wei, conv_wei); } // prepare convolution destination :ref:`memory ` conv_dst = user_dst; if (conv_pd.dst_desc() != user_dst.:ref:`get_desc `()) conv_dst = :ref:`memory `(conv_pd.dst_desc(), eng); // [Create conv_primitive implementation3] // create convolution primitive auto conv = :ref:`convolution_forward `(conv_pd); // [Create conv_primitive implementation3] // [Add to stream implementation3] // execute convolution by adding it to the stream s conv.execute(s, {{:ref:`DNNL_ARG_SRC `, conv_src}, {:ref:`DNNL_ARG_WEIGHTS `, conv_wei}, {:ref:`DNNL_ARG_DST `, conv_dst}}); // [Add to stream implementation3] // reorder data to user's format if needed if (conv_pd.dst_desc() != user_dst.:ref:`get_desc `()) { auto r_pd = :ref:`reorder::primitive_desc `(conv_dst, user_dst); :ref:`reorder `(r_pd).:ref:`execute `(s, conv_dst, user_dst); } s.:ref:`wait `(); } void performance_profiling(:ref:`engine::kind ` engine_kind, int argc, char **argv) { // Initialize engine :ref:`engine ` eng(engine_kind, 0); // Initialize stream :ref:`stream ` s(eng); // [Set dimensions] // set dimensions for synthetic data and weights const :ref:`memory::dim ` BATCH = 128; const :ref:`memory::dim ` IC = 3, OC = 96; const :ref:`memory::dim ` IH = 227, KH = 11, OH = 55; const :ref:`memory::dim ` IW = 227, KW = 11, OW = 55; // [Set dimensions] // [Create memory objects] // create oneDNN memory objects for user's tensors (in nchw and oihw formats) auto user_src = :ref:`memory `({{BATCH, IC, IH, IW}, :ref:`memory::data_type::f32 `, :ref:`memory::format_tag::nchw `}, eng); auto user_wei = :ref:`memory `({{OC, IC, KH, KW}, :ref:`memory::data_type::f32 `, :ref:`memory::format_tag::oihw `}, eng); auto user_dst = :ref:`memory `({{BATCH, OC, OH, OW}, :ref:`memory::data_type::f32 `, :ref:`memory::format_tag::nchw `}, 
eng); // [Create memory objects] // fill source, destination, and weights with synthetic data init_data(user_src, 1); init_data(user_dst, -1); init_data(user_wei, .5); // set implementation ("naive"||"blocked"||"fused") setting implementation // to "validation" will run all implementations std::string implementation; if (argc <= 2) implementation = "validation"; else if (argc == 3) implementation = argv[2]; if (!(implementation == "validation" || implementation == "naive" || implementation == "blocked" || implementation == "fused")) { std::cout << "The implementation can be one of:\n"; std::cout << " - naive: NCHW format without fusion\n"; std::cout << " - blocked: format propagation without fusion\n"; std::cout << " - fused: format propagation with fusion\n"; std::cout << " - validation: runs all implementations\n\n"; std::cout << "Validation will run if no parameters are specified.\n\n"; throw std::invalid_argument("Incorrect input arguments."); } if (implementation == "naive" || implementation == "validation") { std::cout << "Implementation: naive.\n"; // run conv + relu w/o fusing conv_relu_naive(user_src, user_wei, user_dst, eng, s); std::cout << "Conv + ReLU w/ nchw format completed.\n"; } if (implementation == "blocked" || implementation == "validation") { std::cout << "Implementation: blocked.\n"; // run conv + relu w/o fusing conv_relu_blocked(user_src, user_wei, user_dst, eng, s); std::cout << "Conv + ReLU w/ blocked format completed.\n"; } if (implementation == "fused" || implementation == "validation") { std::cout << "Implementation: fused.\n"; // run conv + relu w/ fusing conv_relu_fused(user_src, user_wei, user_dst, eng, s); std::cout << "Conv + ReLU w/ fusing completed.\n"; } } int main(int argc, char **argv) { :ref:`engine::kind ` engine_kind = parse_engine_kind(argc, argv, 1); return handle_example_errors( performance_profiling, engine_kind, argc, argv); }