// This example demonstrates best practices for application performance optimization with DNNL.
#include <chrono>
#include <iostream>
#include <stdio.h>
#include <vector>
#include "example_utils.hpp"
// Convolution hyper-parameters shared by every implementation below
// (see the primitive_desc constructions later in this file):
// a stride of 4 in each spatial dimension and no zero padding.
const memory::dims strides = {4, 4};
const memory::dims padding = {0, 0};
// Fill a DNNL memory object with a constant value.
//
// @param m memory object to initialize (its buffer may live on CPU or GPU)
// @param v value written to every float element of `m`
void init_data(
        memory &m,
        float v) {
    // Element count derived from the descriptor's total byte size; the
    // callers in main() only create float tensors.
    const size_t size = m.get_desc().get_size() / sizeof(float);
    // Build the constant pattern on the host...
    std::vector<float> data(size, v);
    // ...then copy it into the DNNL memory. The previous version filled a
    // host buffer (after a pointless read) but never stored it back, so `m`
    // was left uninitialized; write_to_dnnl_memory is the symmetric helper
    // to read_from_dnnl_memory from example_utils.hpp.
    write_to_dnnl_memory(data.data(), m);
}
// In-place eltwise execution: the same memory serves as both SRC and DST,
// so no extra buffer is required.
// NOTE(review): the start of this function is not visible in this chunk.
relu.execute(s, {{DNNL_ARG_SRC, data}, {DNNL_ARG_DST, data}});
}
// Return the primitive attributes (presumably carrying a ReLU post-op,
// matching this function's name -- the start is not visible in this chunk).
return attr;
}
conv_dst_md, strides, padding, padding);
// Naive variant: run the convolution directly on the user's memory objects,
// with no layout reorders.
conv.execute(s,
{{DNNL_ARG_SRC, user_src}, {DNNL_ARG_WEIGHTS, user_wei},
{DNNL_ARG_DST, user_dst}});
// ReLU runs as a separate primitive on the convolution output.
create_and_execute_relu(user_dst, eng, s);
}
conv_dst_md, strides, padding, padding);
// Reorder the source into the implementation-preferred layout, but only when
// the primitive's chosen descriptor differs from the user's layout.
if (conv_pd.src_desc() != user_src.
get_desc()) {
conv_src =
memory(conv_pd.src_desc(), eng);
auto r_pd = reorder::primitive_desc(user_src, conv_src);
reorder(r_pd).
execute(s, user_src, conv_src);
}
// Same layout check + reorder for the weights.
if (conv_pd.weights_desc() != user_wei.
get_desc()) {
conv_wei =
memory(conv_pd.weights_desc(), eng);
auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
reorder(r_pd).execute(s, user_wei, conv_wei);
}
// Allocate a destination in the primitive-preferred layout if it differs;
// otherwise conv_dst presumably refers to user_dst already (set before this
// visible chunk) -- TODO confirm.
if (conv_pd.dst_desc() != user_dst.
get_desc())
conv_dst =
memory(conv_pd.dst_desc(), eng);
conv.execute(s,
{{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
{DNNL_ARG_DST, conv_dst}});
// ReLU still runs as a separate primitive on the (possibly blocked) output.
create_and_execute_relu(conv_dst, eng, s);
// Bring the result back into the user's layout when a reorder is needed.
if (conv_pd.dst_desc() != user_dst.
get_desc()) {
auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
reorder(r_pd).execute(s, conv_dst, user_dst);
}
}
conv_dst_md, strides, padding, padding);
// Fused variant: the ReLU is attached to the convolution as a post-op via
// these attributes, so no separate eltwise primitive is executed below.
auto attr = create_attr_with_relu_post_op();
// As in the blocked variant, reorder the source into the primitive's
// preferred layout only when it differs from the user's.
if (conv_pd.src_desc() != user_src.
get_desc()) {
conv_src =
memory(conv_pd.src_desc(), eng);
auto r_pd = reorder::primitive_desc(user_src, conv_src);
reorder(r_pd).execute(s, user_src, conv_src);
}
// Same layout check + reorder for the weights.
if (conv_pd.weights_desc() != user_wei.
get_desc()) {
conv_wei =
memory(conv_pd.weights_desc(), eng);
auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
reorder(r_pd).execute(s, user_wei, conv_wei);
}
// Allocate a blocked destination if the primitive's layout differs;
// otherwise conv_dst presumably refers to user_dst (set before this
// visible chunk) -- TODO confirm.
if (conv_pd.dst_desc() != user_dst.
get_desc())
conv_dst =
memory(conv_pd.dst_desc(), eng);
// Convolution + ReLU happen in this single execute call (post-op fusion).
conv.execute(s,
{{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
{DNNL_ARG_DST, conv_dst}});
// Reorder the result back into the user's layout when needed.
if (conv_pd.dst_desc() != user_dst.
get_desc()) {
auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
reorder(r_pd).execute(s, conv_dst, user_dst);
}
}
// Entry point: builds the test tensors, initializes them with constant data,
// and runs the implementation selected on the command line.
// Usage: <prog> [engine-kind] [naive|blocked|fused|validation]
// (default "validation" runs all three implementations).
int main(int argc, char *argv[]) {
engine::kind engine_kind = parse_engine_kind(argc, argv, 1);
// Problem sizes (these match AlexNet's first convolution layer --
// presumably chosen as a realistic workload).
const memory::dim BATCH = 128;
const memory::dim IC = 3, OC = 96;
const memory::dim IH = 227, KH = 11, OH = 55;
const memory::dim IW = 227, KW = 11, OW = 55;
// NOTE(review): the engine/stream creation and the user_src/user_wei/
// user_dst memory constructions appear truncated in this chunk; only their
// trailing lines are visible below -- do not edit without the full file.
eng);
memory::format_tag::oihw},
eng);
eng);
// Fill the tensors with recognizable constant values.
init_data(user_src, 1);
init_data(user_dst, -1);
init_data(user_wei, .5);
// Pick the implementation from argv[2]; with no argument, run all of them.
std::string implementation;
if (argc <= 2)
implementation = "validation";
else if (argc == 3)
implementation = argv[2];
// Reject anything other than the four supported modes and print usage.
if (!(implementation == "validation" || implementation == "naive"
|| implementation == "blocked" || implementation == "fused")) {
std::cout << "The implementation can be one of:\n";
std::cout << " - naive: NCHW format without fusion\n";
std::cout << " - blocked: format propagation without fusion\n";
std::cout << " - fused: format propagation with fusion\n";
std::cout << " - validation: runs all implementations\n\n";
std::cout << "Validation will run if no parameters are specified\n\n";
return -1;
}
if (implementation == "naive" || implementation == "validation") {
std::cout << "implementation: naive\n";
conv_relu_naive(user_src, user_wei, user_dst, eng, s);
std::cout << "conv + relu w/ nchw format completed\n";
}
if (implementation == "blocked" || implementation == "validation") {
std::cout << "implementation: blocked\n";
conv_relu_blocked(user_src, user_wei, user_dst, eng, s);
std::cout << "conv + relu w/ blocked format completed\n";
}
if (implementation == "fused" || implementation == "validation") {
std::cout << "implementation: fused\n";
conv_relu_fused(user_src, user_wei, user_dst, eng, s);
std::cout << "conv + relu w/ fusing completed\n";
}
return 0;
}