.. index:: pair: page; MatMul Tutorial: Comparison with SGEMM .. _doxid-cpu_sgemm_and_matmul_cpp: MatMul Tutorial: Comparison with SGEMM ====================================== C++ API example demonstrating :ref:`MatMul ` as a replacement for SGEMM functions. Concepts: * Create a primitive once, use it multiple times * Run-time tensor shapes: :ref:`DNNL_RUNTIME_DIM_VAL ` * Scales: :ref:`dnnl::primitive_attr::set_scales_mask() ` We will show two modes for the MatMul primitive: #. The shapes of the input and output matrices are passed at execution time. This enables you to create a primitive only once and use it for different matrices, just like normal SGEMM (though through a handle: the oneDNN primitive). To indicate the unknown dimensions and floating-point values, you should use :ref:`DNNL_RUNTIME_DIM_VAL ` and :ref:`DNNL_RUNTIME_F32_VAL `, respectively. #. The shapes of the input and output matrices are passed at creation time, as in the oneDNN programming model. This enables creating a highly specialized kernel for the given problem sizes at the cost of generality. Users are free to choose between these two options, as well as any intermediate ones (e.g., specifying some of the parameters at creation time while leaving the others until execution time). This enables balancing between flexibility and performance. .. note:: The more you specify at creation time, the better the performance is. .. ref-code-block:: cpp /******************************************************************************* * Copyright 2019-2022 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ #include #include #include #include #include #include #include #include #include "oneapi/dnnl/dnnl.hpp" #include "example_utils.hpp" using namespace :ref:`dnnl `; namespace { /* Fills every element of v with a value drawn from a uniform real distribution constructed with bounds (-1, 1), using a default-constructed std::mt19937. */ void init_vector(std::vector &v) { std::mt19937 gen; std::uniform_real_distribution u(-1, 1); for (auto &e : v) e = u(gen); } /* Compares v2 against the reference v1 via L2 norms: the check passes when the norm of (v1 - v2) is at most threshold times the norm of v1. Prints both norms, the relative error and the verdict under the given message. Returns 0 when the check passes and 1 otherwise. v2 is read at every index of v1, so it must be at least as long as v1. */ int compare_vectors(const std::vector &v1, const std::vector &v2, int64_t K, const char *message) { double v1_l2 = 0, diff_l2 = 0; for (size_t n = 0; n < v1.size(); ++n) { float diff = v1[n] - v2[n]; v1_l2 += v1[n] * v1[n]; diff_l2 += diff * diff; } v1_l2 = std::sqrt(v1_l2); diff_l2 = std::sqrt(diff_l2); // Finding a reasonable (tight and accurate) threshold is quite a difficult // problem. // The implementation testing might also use special data filling to // alleviate issues related to finite-precision arithmetic. // However, in simple cases the machine epsilon multiplied by log(K) should // work reasonably well. const double threshold = std::numeric_limits::epsilon() * std::log(std::max(2., (double)K)); bool ok = diff_l2 <= threshold * v1_l2; printf("%s\n\tL2 Norms" "\n\t\tReference matrix:%g\n\t\tError:%g\n\t\tRelative_error:%g\n" "\tAccuracy check: %s\n", message, v1_l2, diff_l2, diff_l2 / v1_l2, ok ? "OK" : "FAILED"); return ok ? 0 : 1; } } // namespace int number_of_runs = 1; float fixed_beta = 0.f; :ref:`engine ` eng(:ref:`engine::kind::cpu `, 0); // We create a global engine for simplicity // Create a _dynamic_ MatMul primitive that can work with arbitrary shapes // and alpha parameters. // Warning: the current limitation is that the beta parameter should be known in // advance (use fixed_beta). 
:ref:`matmul ` dynamic_matmul_create() { // We assume that beta is known at the primitive creation time float beta = fixed_beta; :ref:`memory::dims ` a_shape = {:ref:`DNNL_RUNTIME_DIM_VAL `, :ref:`DNNL_RUNTIME_DIM_VAL `}; :ref:`memory::dims ` b_shape = {:ref:`DNNL_RUNTIME_DIM_VAL `, :ref:`DNNL_RUNTIME_DIM_VAL `}; :ref:`memory::dims ` c_shape = {:ref:`DNNL_RUNTIME_DIM_VAL `, :ref:`DNNL_RUNTIME_DIM_VAL `}; :ref:`memory::dims ` a_strides = {:ref:`DNNL_RUNTIME_DIM_VAL `, :ref:`DNNL_RUNTIME_DIM_VAL `}; :ref:`memory::dims ` b_strides = {:ref:`DNNL_RUNTIME_DIM_VAL `, :ref:`DNNL_RUNTIME_DIM_VAL `}; /* Unlike A and B, C has a fixed stride of 1 along its last dimension; only its first stride is a run-time value. */ :ref:`memory::dims ` c_strides = {:ref:`DNNL_RUNTIME_DIM_VAL `, 1}; :ref:`memory::desc ` a_md(a_shape, :ref:`memory::data_type::f32 `, a_strides); :ref:`memory::desc ` b_md(b_shape, :ref:`memory::data_type::f32 `, b_strides); :ref:`memory::desc ` c_md(c_shape, :ref:`memory::data_type::f32 `, c_strides); // Create attributes (to handle alpha dynamically and beta if necessary) :ref:`primitive_attr ` attr; attr.:ref:`set_scales_mask `(:ref:`DNNL_ARG_WEIGHTS `, /* mask */ 0); if (beta != 0.f) { :ref:`post_ops ` po; po.:ref:`append_sum `(beta); attr.:ref:`set_post_ops `(po); } // Create a MatMul primitive :ref:`matmul::primitive_desc ` matmul_pd(eng, a_md, b_md, c_md, attr); return :ref:`matmul `(matmul_pd); } // Execute a _dynamic_ MatMul primitive created earlier. All the parameters are // passed at run-time (except for beta, which has to be specified at the // primitive creation time due to the current limitation). void dynamic_matmul_execute(:ref:`matmul ` &matmul_p, char transA, char transB, int64_t M, int64_t N, int64_t K, float alpha, const float *A, int64_t lda, const float *B, int64_t ldb, float beta, float *C, int64_t ldc) { using dims = :ref:`memory::dims `; if (beta != fixed_beta) throw std::logic_error("Run-time beta is not yet supported."); // Translate transA and transB dims a_strides = tolower(transA) == 'n' ? 
dims {lda, 1} : dims {1, lda}; dims b_strides = tolower(transB) == 'n' ? dims {ldb, 1} : dims {1, ldb}; // Wrap raw pointers into oneDNN memories (with proper shapes) :ref:`memory ` A_m({{M, K}, :ref:`memory::data_type::f32 `, a_strides}, eng, (void *)A); :ref:`memory ` B_m({{K, N}, :ref:`memory::data_type::f32 `, b_strides}, eng, (void *)B); :ref:`memory ` C_m({{M, N}, :ref:`memory::data_type::f32 `, {ldc, 1}}, eng, (void *)C); // Prepare oneDNN memory for alpha /* alpha_m wraps the address of the by-value alpha parameter; the s.wait() call below completes before this function returns. */ :ref:`memory ` alpha_m({{1}, :ref:`memory::data_type::f32 `, {1}}, eng, &alpha); // Execute the MatMul primitive :ref:`stream ` s(eng); matmul_p.:ref:`execute `(s, {{:ref:`DNNL_ARG_SRC `, A_m}, {:ref:`DNNL_ARG_WEIGHTS `, B_m}, {:ref:`DNNL_ARG_DST `, C_m}, {:ref:`DNNL_ARG_ATTR_SCALES ` | :ref:`DNNL_ARG_WEIGHTS `, alpha_m}}); s.wait(); } /* Runs dnnl_sgemm and the dynamic MatMul on identical inputs (both outputs start from the same initial C) and throws std::logic_error when compare_vectors reports a mismatch or when beta differs from fixed_beta. */ void sgemm_and_matmul_with_params(char transA, char transB, int64_t M, int64_t N, int64_t K, float alpha, float beta) { if (beta != fixed_beta) throw std::logic_error("Run-time beta is not yet supported."); // Allocate and initialize matrices std::vector A(M * K); init_vector(A); std::vector B(K * N); init_vector(B); std::vector C_sgemm(M * N); init_vector(C_sgemm); std::vector C_dynamic_matmul = C_sgemm; // Prepare leading dimensions int64_t lda = tolower(transA) == 'n' ? K : M; int64_t ldb = tolower(transB) == 'n' ? N : K; int64_t ldc = N; // 1. 
Execute sgemm for (int run = 0; run < number_of_runs; ++run) :ref:`dnnl_sgemm `(transA, transB, M, N, K, alpha, A.data(), lda, B.data(), ldb, beta, C_sgemm.data(), ldc); // 2.a Create dynamic MatMul auto dynamic_matmul = dynamic_matmul_create(); // 2.b Execute for (int run = 0; run < number_of_runs; ++run) dynamic_matmul_execute(dynamic_matmul, transA, transB, M, N, K, alpha, A.data(), lda, B.data(), ldb, beta, C_dynamic_matmul.data(), ldc); int rc = 0; rc |= compare_vectors( C_sgemm, C_dynamic_matmul, K, "Compare SGEMM vs dynamic MatMul"); if (rc) throw std::logic_error("The resulting matrices diverged too much."); } /* Runs the comparison for one fixed problem: transA = 'N', transB = 'T', M = 10, N = 20, K = 30, alpha = 1.1f, beta = fixed_beta. */ void sgemm_and_matmul() { sgemm_and_matmul_with_params('N', 'T', 10, 20, 30, 1.1f, fixed_beta); } /* Entry point: passes sgemm_and_matmul to handle_example_errors and returns its result. NOTE(review): handle_example_errors is not defined in this listing; presumably it comes from example_utils.hpp. Confirm. */ int main(int argc, char **argv) { return handle_example_errors({:ref:`engine::kind::cpu `}, sgemm_and_matmul); }