performance_profiling.cpp

This example demonstrates best practices for optimizing application performance with oneDNN. Annotated version: Performance Profiling Example
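To see where execution time is spent across the implementations below, the example is typically run with oneDNN verbose mode enabled (for example, by setting the DNNL_VERBOSE=1 environment variable before launching the binary); the exact variable name and output format depend on the library version.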

/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/



#include <iostream>
#include <stdexcept>
#include <vector>

#include "oneapi/dnnl/dnnl.hpp"

#include "example_utils.hpp"

using namespace dnnl;

// [Prologue]

// Set Strides and Padding
const memory::dims strides = {4, 4};
const memory::dims padding = {0, 0};

// [Prologue]
//
// function to init data
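// write_to_dnnl_memory is a helper from example_utils.hpp that copies a host
// buffer into a oneDNN memory object regardless of the engine kind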
void init_data(memory &m, float v) {
    size_t size = m.get_desc().get_size() / sizeof(float);
    std::vector<float> data(size, v);
    write_to_dnnl_memory(data.data(), m);
}

// function to execute non-fused relu
void create_and_execute_relu(memory &data, engine &eng, stream &s) {
    // relu operates on whatever data format is given to it

    // create a primitive
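    // the last two arguments of the descriptor are alpha and beta; for
    // eltwise_relu, alpha is the negative slope (0.f here) and beta is unused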
    auto relu_d = eltwise_forward::desc(prop_kind::forward_inference,
            algorithm::eltwise_relu, data.get_desc(), 0.f, 0.f);
    auto relu_pd = eltwise_forward::primitive_desc(relu_d, eng);
    auto relu = eltwise_forward(relu_pd);

    // execute it (in-place)
    relu.execute(s, {{DNNL_ARG_SRC, data}, {DNNL_ARG_DST, data}});
}

// [Create post_op attr with relu]
// function to create post-op attribute for fused relu
primitive_attr create_attr_with_relu_post_op() {
    // create a post-op with relu
    post_ops ops;
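    // append_eltwise arguments in this API version: scale (1.f), algorithm,
    // alpha (negative slope for relu, 0.f), and beta (unused)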
    ops.append_eltwise(1.f, algorithm::eltwise_relu, 0.f, 0.f);

    // create an attribute and set the corresponding post op
    primitive_attr attr;
    attr.set_post_ops(ops);

    return attr;
}
// [Create post_op attr with relu]
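// the returned attribute is later passed to the convolution primitive
// descriptor constructor in conv_relu_fused(), which fuses relu into the
// convolution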

// Implementation for naive convolution on nchw (data) and oihw (weights),
// followed by execution of non-fused relu
void conv_relu_naive(const memory &user_src, const memory &user_wei,
        memory user_dst, engine &eng, stream &s) {
    // [Create mem_desc]
    // copy the dimensions and format from user's memory
    auto conv_src_md = memory::desc(user_src.get_desc());
    auto conv_wei_md = memory::desc(user_wei.get_desc());
    auto conv_dst_md = memory::desc(user_dst.get_desc());
    // [Create mem_desc]
    // [Create conv_desc]
    // create a convolution descriptor
    auto conv_d = convolution_forward::desc(prop_kind::forward_inference,
            algorithm::convolution_direct, conv_src_md, conv_wei_md,
            conv_dst_md, strides, padding, padding);
    // [Create conv_desc]
    // [Create conv_prim_desc]
    // create a convolution primitive descriptor
    auto conv_pd = convolution_forward::primitive_desc(conv_d, eng);
    // [Create conv_prim_desc]
    // [Create conv_primitive]
    // create convolution primitive
    auto conv = convolution_forward(conv_pd);
    // [Create conv_primitive]
    // [Add to stream]
    // execute convolution by adding it to the stream s
    conv.execute(s,
            {{DNNL_ARG_SRC, user_src}, {DNNL_ARG_WEIGHTS, user_wei},
                    {DNNL_ARG_DST, user_dst}});
    // [Add to stream]
    // [Create and execute relu]
    // execute relu (on convolution's destination format, whatever it is)
    create_and_execute_relu(user_dst, eng, s);
    s.wait();
    // [Create and execute relu]
}

// Implementation for convolution on blocked format for data and
// weights, followed by execution of non-fused relu
void conv_relu_blocked(memory user_src, memory user_wei, memory user_dst,
        engine &eng, stream &s) {
    // [Create mem_desc with tag=any]
    // copy the dimensions and format from user's memory
    auto conv_src_md = memory::desc(user_src.get_desc());
    auto conv_wei_md = memory::desc(user_wei.get_desc());
    auto conv_dst_md = memory::desc(user_dst.get_desc());

    // reset format to "any" to allow convolution to pick the best implementation
    conv_src_md.data.format_kind = dnnl_format_kind_any;
    conv_wei_md.data.format_kind = dnnl_format_kind_any;
    conv_dst_md.data.format_kind = dnnl_format_kind_any;
    // [Create mem_desc with tag=any]
    // [Create conv_desc implementation2]
    // create a convolution descriptor
    auto conv_d = convolution_forward::desc(prop_kind::forward_inference,
            algorithm::convolution_direct, conv_src_md, conv_wei_md,
            conv_dst_md, strides, padding, padding);
    // [Create conv_desc implementation2]
    // [Create conv_prim_desc implementation2]
    // create a convolution primitive descriptor and primitive
    auto conv_pd = convolution_forward::primitive_desc(conv_d, eng);
    // [Create conv_prim_desc implementation2]
    // [Conditionally create and execute reorder prims]
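    // a reorder is needed only when the memory format chosen by the
    // convolution primitive differs from the format of the user's memory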
    // prepare convolution source
    memory conv_src = user_src;
    if (conv_pd.src_desc() != user_src.get_desc()) {
        conv_src = memory(conv_pd.src_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_src, conv_src);
        reorder(r_pd).execute(s, user_src, conv_src);
    }

    // prepare convolution weights
    memory conv_wei = user_wei;
    if (conv_pd.weights_desc() != user_wei.get_desc()) {
        conv_wei = memory(conv_pd.weights_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
        reorder(r_pd).execute(s, user_wei, conv_wei);
    }

    // prepare convolution destination
    memory conv_dst = user_dst;
    if (conv_pd.dst_desc() != user_dst.get_desc())
        conv_dst = memory(conv_pd.dst_desc(), eng);
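    // note: the destination only needs an intermediate buffer in the
    // primitive's preferred format here; it is reordered back to the user's
    // format after relu has been applied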
    // [Conditionally create and execute reorder prims]
    // [Create conv_primitive implementation2]
    // create convolution primitive
    auto conv = convolution_forward(conv_pd);
    // [Create conv_primitive implementation2]
    // [Add to stream implementation2]
    // execute convolution by adding it to the stream s
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
                    {DNNL_ARG_DST, conv_dst}});
    // [Add to stream implementation2]
    // [Create and execute relu implementation2]
    // execute relu (on convolution's destination format, whatever it is)
    create_and_execute_relu(conv_dst, eng, s);
    // [Create and execute relu implementation2]
    // reorder data back to the user's format if needed
    if (conv_pd.dst_desc() != user_dst.get_desc()) {
        auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
        reorder(r_pd).execute(s, conv_dst, user_dst);
    }
    s.wait();
}

// Implementation for convolution on blocked format for data and weights,
// with the relu operation fused into the convolution via a post-op attribute
// added to the convolution primitive descriptor
void conv_relu_fused(memory user_src, memory user_wei, memory user_dst,
        const engine &eng, stream &s) {
    // copy the dimensions and format from user's memory
    auto conv_src_md = memory::desc(user_src.get_desc());
    auto conv_wei_md = memory::desc(user_wei.get_desc());
    auto conv_dst_md = memory::desc(user_dst.get_desc());

    // reset format to any to allow convolution to pick the best implementation
    conv_src_md.data.format_kind = dnnl_format_kind_any;
    conv_wei_md.data.format_kind = dnnl_format_kind_any;
    conv_dst_md.data.format_kind = dnnl_format_kind_any;

    // create a convolution descriptor
    auto conv_d = convolution_forward::desc(prop_kind::forward_inference,
            algorithm::convolution_direct, conv_src_md, conv_wei_md,
            conv_dst_md, strides, padding, padding);

    // Next, the convolution primitive descriptor is created; it inherits the
    // fused ReLU by way of the post-op attribute
    // [Create prim_desc with attr]
    // create an attribute for fused relu
    auto attr = create_attr_with_relu_post_op();

    // create a convolution primitive descriptor
    auto conv_pd = convolution_forward::primitive_desc(conv_d, attr, eng);
    // [Create prim_desc with attr]
    // prepare convolution source
    memory conv_src = user_src;
    if (conv_pd.src_desc() != user_src.get_desc()) {
        conv_src = memory(conv_pd.src_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_src, conv_src);
        reorder(r_pd).execute(s, user_src, conv_src);
    }

    // prepare convolution weights
    memory conv_wei = user_wei;
    if (conv_pd.weights_desc() != user_wei.get_desc()) {
        conv_wei = memory(conv_pd.weights_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
        reorder(r_pd).execute(s, user_wei, conv_wei);
    }

    // prepare convolution destination
    memory conv_dst = user_dst;
    if (conv_pd.dst_desc() != user_dst.get_desc())
        conv_dst = memory(conv_pd.dst_desc(), eng);
    // [Create conv_primitive implementation3]
    // create convolution primitive
    auto conv = convolution_forward(conv_pd);
    // [Create conv_primitive implementation3]
    // [Add to stream implementation3]
    // execute convolution by adding it to the stream s
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
                    {DNNL_ARG_DST, conv_dst}});
    // [Add to stream implementation3]
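    // note: no separate relu primitive is executed here; relu is applied
    // inside the convolution via the post-op attribute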
    // reorder data to user's format if needed
    if (conv_pd.dst_desc() != user_dst.get_desc()) {
        auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
        reorder(r_pd).execute(s, conv_dst, user_dst);
    }
    s.wait();
}


void performance_profiling(engine::kind engine_kind, int argc, char **argv) {
    // Initialize engine
    engine eng(engine_kind, 0);

    // Initialize stream
    stream s(eng);
    // [Set dimensions]
    // set dimensions for synthetic data and weights
    const memory::dim BATCH = 128;
    const memory::dim IC = 3, OC = 96;
    const memory::dim IH = 227, KH = 11, OH = 55;
    const memory::dim IW = 227, KW = 11, OW = 55;
    // [Set dimensions]
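    // these sizes correspond to the first convolution layer of AlexNet; with
    // stride 4 and no padding, OH = (IH - KH) / 4 + 1 = (227 - 11) / 4 + 1 = 55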

    // [Create memory objects]
    // create oneDNN memory objects for user's tensors (in nchw and oihw formats)
    auto user_src = memory({{BATCH, IC, IH, IW}, memory::data_type::f32,
                                   memory::format_tag::nchw},
            eng);
    auto user_wei = memory({{OC, IC, KH, KW}, memory::data_type::f32,
                                   memory::format_tag::oihw},
            eng);
    auto user_dst = memory({{BATCH, OC, OH, OW}, memory::data_type::f32,
                                   memory::format_tag::nchw},
            eng);
    // [Create memory objects]

    // fill source, destination, and weights with synthetic data
    init_data(user_src, 1);
    init_data(user_dst, -1);
    init_data(user_wei, .5);

    // set implementation ("naive", "blocked", or "fused"); setting it to
    // "validation" will run all implementations
    std::string implementation;
    if (argc <= 2)
        implementation = "validation";
    else if (argc == 3)
        implementation = argv[2];

    if (!(implementation == "validation" || implementation == "naive"
                || implementation == "blocked" || implementation == "fused")) {
        std::cout << "The implementation can be one of:\n";
        std::cout << " - naive: NCHW format without fusion\n";
        std::cout << " - blocked: format propagation without fusion\n";
        std::cout << " - fused: format propagation with fusion\n";
        std::cout << " - validation: runs all implementations\n\n";
        std::cout << "Validation will run if no parameters are specified.\n\n";

        throw std::invalid_argument("Incorrect input arguments.");
    }

    if (implementation == "naive" || implementation == "validation") {
        std::cout << "Implementation: naive.\n";
        // run conv + relu w/o fusing
        conv_relu_naive(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ nchw format completed.\n";
    }

    if (implementation == "blocked" || implementation == "validation") {
        std::cout << "Implementation: blocked.\n";
        // run conv + relu w/o fusing
        conv_relu_blocked(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ blocked format completed.\n";
    }

    if (implementation == "fused" || implementation == "validation") {
        std::cout << "Implementation: fused.\n";
        // run conv + relu w/ fusing
        conv_relu_fused(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ fusing completed.\n";
    }
}

int main(int argc, char **argv) {
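    // expected invocation (based on the helpers in example_utils.hpp):
    //     <binary> [cpu|gpu] [naive|blocked|fused|validation]
    // both arguments are optional; with no arguments the example defaults to
    // the CPU engine and runs all implementations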
    engine::kind engine_kind = parse_engine_kind(argc, argv, 1);
    return handle_example_errors(
            performance_profiling, engine_kind, argc, argv);
}