// This example demonstrates best practices for improving application performance with Intel MKL-DNN.
#include <iostream>
#include <string>
#include <chrono>
#include <stdio.h>
// Convolution stride of 4 in each spatial dimension (H, W) and no
// zero-padding. These are consistent with the shapes used in main()
// below: (227 - 11) / 4 + 1 = 55 output pixels per dimension.
const memory::dims strides = {4, 4};
const memory::dims padding = {0, 0};
// Fill every float element of memory object `m` with the constant `v`.
// Used by main() to give src/weights/dst deterministic initial contents.
// NOTE(review): `data` and `size` are not declared in this chunk — the
// full example presumably obtains a float* handle from `m` and derives
// the element count from its descriptor; those lines appear to be
// missing here. TODO: confirm against the complete source.
void init_data(
memory &m,
float v) {
for (size_t i = 0; i < size; ++i)
data[i] = v;
}
// Apply ReLU in place over `data`: the same memory object is passed as
// both SRC and DST, so the input is overwritten with the activation.
// NOTE(review): the eltwise primitive `relu` and the stream `s` are not
// defined in this chunk — their construction lines appear to be missing.
// TODO: confirm against the complete source.
void create_and_execute_relu(
memory data) {
relu.execute(s, {
{MKLDNN_ARG_SRC, data},
{MKLDNN_ARG_DST, data}});
}
// NOTE(review): dangling fragment — this `return attr;` looks like the
// tail of create_attr_with_relu_post_op() (called at the start of the
// fused variant below, and presumably building a primitive_attr carrying
// a ReLU post-op); the function header and body are missing from this
// chunk. TODO: restore from the complete source.
return attr;
}
// NOTE(review): fragment — the function header (presumably
// conv_relu_naive, see the call in main()) and the beginning of the
// convolution primitive-descriptor construction are missing; the first
// two lines below are that call's trailing arguments.
conv_src_md, conv_wei_md, conv_dst_md,
strides, padding, padding);
// "Naive" variant: run the convolution directly on the user-format
// memory objects, then apply ReLU as a separate primitive pass over dst.
conv.execute(s, {
{MKLDNN_ARG_SRC, user_src},
{MKLDNN_ARG_WEIGHTS, user_wei},
{MKLDNN_ARG_DST, user_dst}});
create_and_execute_relu(user_dst);
}
// NOTE(review): fragment — the function header (presumably
// conv_relu_blocked, see the call in main()) and the beginning of the
// convolution primitive-descriptor construction are missing; the first
// two lines below are that call's trailing arguments.
conv_src_md, conv_wei_md, conv_dst_md,
strides, padding, padding);
// If the implementation prefers a different (e.g. blocked) source
// layout than the user's, allocate a matching buffer and reorder into it.
if (conv_pd.src_desc() != user_src.
get_desc()) {
conv_src =
memory(conv_pd.src_desc(), cpu);
auto r_pd = reorder::primitive_desc(user_src, conv_src);
reorder(r_pd).
execute(s, user_src, conv_src);
}
// Same for the weights.
if (conv_pd.weights_desc() != user_wei.
get_desc()) {
conv_wei =
memory(conv_pd.weights_desc(), cpu);
auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
reorder(r_pd).execute(s, user_wei, conv_wei);
}
// For dst only an allocation is needed (no content reorder before the
// convolution — dst is written, not read, by the primitive).
if (conv_pd.dst_desc() != user_dst.
get_desc())
conv_dst =
memory(conv_pd.dst_desc(), cpu);
// Convolution in the preferred layout, then a separate ReLU pass.
conv.execute(s, {
{MKLDNN_ARG_SRC, conv_src},
{MKLDNN_ARG_WEIGHTS, conv_wei},
{MKLDNN_ARG_DST, conv_dst}});
create_and_execute_relu(conv_dst);
// Reorder the result back to the user's layout if they differ.
if (conv_pd.dst_desc() != user_dst.
get_desc()) {
auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
reorder(r_pd).execute(s, conv_dst, user_dst);
}
}
// NOTE(review): fragment — the function header (presumably
// conv_relu_fused, see the call in main()) and the beginning of the
// convolution primitive-descriptor construction are missing; the first
// two lines below are that call's trailing arguments. The `attr`
// obtained below would normally be passed into that construction so the
// ReLU runs as a fused post-op — the ordering here looks scrambled by
// missing lines. TODO: confirm against the complete source.
conv_src_md, conv_wei_md, conv_dst_md,
strides, padding, padding);
auto attr = create_attr_with_relu_post_op();
// Reorder src into the implementation-preferred layout if needed.
if (conv_pd.src_desc() != user_src.
get_desc()) {
conv_src =
memory(conv_pd.src_desc(), cpu);
auto r_pd = reorder::primitive_desc(user_src, conv_src);
reorder(r_pd).execute(s, user_src, conv_src);
}
// Same for the weights.
if (conv_pd.weights_desc() != user_wei.
get_desc()) {
conv_wei =
memory(conv_pd.weights_desc(), cpu);
auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
reorder(r_pd).execute(s, user_wei, conv_wei);
}
// dst only needs an allocation in the preferred layout.
if (conv_pd.dst_desc() != user_dst.
get_desc())
conv_dst =
memory(conv_pd.dst_desc(), cpu);
// Single execute: unlike the other variants there is no separate
// create_and_execute_relu() call here — the activation is fused.
conv.execute(s, {
{MKLDNN_ARG_SRC, conv_src},
{MKLDNN_ARG_WEIGHTS, conv_wei},
{MKLDNN_ARG_DST, conv_dst}});
// Reorder the result back to the user's layout if they differ.
if (conv_pd.dst_desc() != user_dst.
get_desc()) {
auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
reorder(r_pd).execute(s, conv_dst, user_dst);
}
}
// Entry point: selects one of the implementations ("naive", "blocked",
// "fused") from argv[1], or runs all of them ("validation", the default
// when no argument is given). Returns -1 with a usage message on an
// unrecognized argument.
int main(int argc, char *argv[]) {
// Problem sizes: batch 1000, 3 -> 96 channels, 227x227 input, 11x11
// kernel, 55x55 output — consistent with the stride-4 / zero-padding
// constants defined at the top of the file.
const memory::dim BATCH = 1000;
const memory::dim IC = 3, OC = 96;
const memory::dim IH = 227, KH = 11, OH = 55;
const memory::dim IW = 227, KW = 11, OW = 55;
// NOTE(review): the declarations of the engine `cpu` and the memory
// objects user_src / user_wei / user_dst are missing from this chunk;
// the line below is the dangling tail of one of those constructions
// (its oihw tag suggests the weights). TODO: restore from the complete
// source.
memory::format_tag::oihw}, cpu);
// Deterministic initial contents for all three tensors.
init_data(user_src, 1);
init_data(user_dst, -1);
init_data(user_wei, .5);
std::string implementation;
// No argument => run every implementation for validation; with one
// argument, run just that implementation. (With more than two argv
// entries `implementation` stays empty and falls into the usage branch.)
if (argc == 1)
implementation = "validation";
else if (argc == 2)
implementation = argv[1];
if (!(implementation == "validation"
|| implementation == "naive"
|| implementation == "blocked"
|| implementation == "fused")) {
std::cout << "\nUsage: " << argv[0]
<< " [implementation]\n\n";
std::cout << "The implementation can be one of:\n";
std::cout << " - naive: NCHW format without fusion\n";
std::cout << " - blocked: format propagation without fusion\n";
std::cout << " - fused: format propagation with fusion\n";
std::cout << " - validation: runs all implementations\n\n";
std::cout << "Validation will be run if no parameters are specified\n\n";
return -1;
}
// Run the requested variant(s); "validation" matches every branch.
if (implementation == "naive" || implementation == "validation") {
std::cout << "implementation: naive\n";
conv_relu_naive(user_src, user_wei, user_dst);
std::cout << "conv + relu w/ nchw format completed\n";
}
if (implementation == "blocked" || implementation == "validation") {
std::cout << "implementation: blocked\n";
conv_relu_blocked(user_src, user_wei, user_dst);
std::cout << "conv + relu w/ blocked format completed\n";
}
if (implementation == "fused" || implementation == "validation") {
std::cout << "implementation: fused\n";
conv_relu_fused(user_src, user_wei, user_dst);
std::cout << "conv + relu w/ fusing completed\n";
}
return 0;
}