/******************************************************************************* * Copyright 2019-2022 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ /// @example gpu_opencl_interop.cpp /// @copybrief gpu_opencl_interop_cpp /// > Annotated version: @ref gpu_opencl_interop_cpp /// @page gpu_opencl_interop_cpp Getting started on GPU with OpenCL extensions API /// This C++ API example demonstrates programming for Intel(R) Processor /// Graphics with OpenCL* extensions API in oneDNN. /// /// > Example code: @ref gpu_opencl_interop.cpp /// /// The workflow includes following steps: /// - Create a GPU engine. It uses OpenCL as the runtime in this sample. /// - Create a GPU memory descriptor/object. /// - Create an OpenCL kernel for GPU data initialization /// - Access a GPU memory via OpenCL interoperability interface /// - Access a GPU command queue via OpenCL interoperability interface /// - Execute a OpenCL kernel with related GPU command queue and GPU memory /// - Create operation descriptor/operation primitives descriptor/primitive . /// - Execute the primitive with the initialized GPU memory /// - Validate the result by mapping the OpenCL memory via OpenCL interoperability /// interface /// /// @page gpu_opencl_interop_cpp /// @section gpu_opencl_interop_cpp_headers Public headers /// /// To start using oneDNN, we must first include the @ref dnnl.hpp /// header file in the application. We also include CL/cl.h for using /// OpenCL APIs and @ref dnnl_debug.h, which contains some debugging /// facilities such as returning a string representation /// for common oneDNN C types. /// All C++ API types and functions reside in the `dnnl` namespace. /// For simplicity of the example we import this namespace. /// @page gpu_opencl_interop_cpp /// @snippet gpu_opencl_interop.cpp Prologue // [Prologue] #include #include #include #include #include "oneapi/dnnl/dnnl.hpp" #include "oneapi/dnnl/dnnl_ocl.hpp" #include "example_utils.hpp" using namespace dnnl; using namespace std; // [Prologue] #define OCL_CHECK(x) \ do { \ cl_int s = (x); \ if (s != CL_SUCCESS) { \ std::cout << "[" << __FILE__ << ":" << __LINE__ << "] '" << #x \ << "' failed (status code: " << s << ")." << std::endl; \ exit(1); \ } \ } while (0) cl_kernel create_init_opencl_kernel( cl_context ocl_ctx, const char *kernel_name, const char *ocl_code) { cl_int err; const char *sources[] = {ocl_code}; cl_program ocl_program = clCreateProgramWithSource(ocl_ctx, 1, sources, nullptr, &err); OCL_CHECK(err); OCL_CHECK( clBuildProgram(ocl_program, 0, nullptr, nullptr, nullptr, nullptr)); cl_kernel ocl_kernel = clCreateKernel(ocl_program, kernel_name, &err); OCL_CHECK(err); OCL_CHECK(clReleaseProgram(ocl_program)); return ocl_kernel; } /// @page gpu_opencl_interop_cpp /// @section gpu_opencl_interop_cpp_tutorial gpu_opencl_interop_tutorial() function /// void gpu_opencl_interop_tutorial() { /// @page gpu_opencl_interop_cpp /// @subsection gpu_opencl_interop_cpp_sub1 Engine and stream /// /// All oneDNN primitives and memory objects are attached to a /// particular @ref dnnl::engine, which is an abstraction of a /// computational device (see also @ref dev_guide_basic_concepts). The /// primitives are created and optimized for the device to which they are /// attached, and the memory objects refer to memory residing on the /// corresponding device. In particular, that means neither memory objects /// nor primitives that were created for one engine can be used on /// another. /// /// To create engines, we must specify the @ref dnnl::engine::kind /// and the index of the device of the given kind. In this example we use /// the first available GPU engine, so the index for the engine is 0. /// This example assumes OpenCL being a runtime for GPU. In such case, /// during engine creation, an OpenCL context is also created and attaches /// to the created engine. /// /// @snippet gpu_opencl_interop.cpp Initialize engine // [Initialize engine] engine eng(engine::kind::gpu, 0); // [Initialize engine] /// In addition to an engine, all primitives require a @ref dnnl::stream /// for the execution. The stream encapsulates an execution context and is /// tied to a particular engine. /// /// In this example, a GPU stream is created. /// This example assumes OpenCL being a runtime for GPU. During stream creation, /// an OpenCL command queue is also created and attaches to this stream. /// /// @snippet gpu_opencl_interop.cpp Initialize stream // [Initialize stream] dnnl::stream strm(eng); // [Initialize stream] /// @subsection gpu_opencl_interop_cpp_sub2 Wrapping data into oneDNN memory object /// /// Next, we create a memory object. We need to specify dimensions of our /// memory by passing a memory::dims object. Then we create a memory /// descriptor with these dimensions, with the dnnl::memory::data_type::f32 /// data type, and with the dnnl::memory::format_tag::nchw memory format. /// Finally, we construct a memory object and pass the memory descriptor. /// The library allocates memory internally. /// @snippet gpu_opencl_interop.cpp memory alloc // [memory alloc] memory::dims tz_dims = {2, 3, 4, 5}; const size_t N = std::accumulate(tz_dims.begin(), tz_dims.end(), (size_t)1, std::multiplies()); memory::desc mem_d( tz_dims, memory::data_type::f32, memory::format_tag::nchw); memory mem(mem_d, eng); // [memory alloc] /// @subsection gpu_opencl_interop_cpp_sub3 Initialize the data by executing a custom OpenCL kernel /// We are going to create an OpenCL kernel that will initialize our data. /// It requires writing a bit of C code to create an OpenCL program from a /// string literal source. The kernel initializes the data by the /// 0, -1, 2, -3, ... sequence: `data[i] = (-1)^i * i`. /// @snippet gpu_opencl_interop.cpp ocl kernel // [ocl kernel] const char *ocl_code = "__kernel void init(__global float *data) {" " int id = get_global_id(0);" " data[id] = (id % 2) ? -id : id;" "}"; // [ocl kernel] /// Create/Build Opencl kernel by `create_init_opencl_kernel()` function. /// Refer to the full code example for the `create_init_opencl_kernel()` /// function. /// @snippet gpu_opencl_interop.cpp oclkernel create // [oclkernel create] const char *kernel_name = "init"; cl_kernel ocl_init_kernel = create_init_opencl_kernel( ocl_interop::get_context(eng), kernel_name, ocl_code); // [oclkernel create] /// The next step is to execute our OpenCL kernel by setting its arguments /// and enqueueing to an OpenCL queue. You can extract the underlying OpenCL /// buffer from the memory object using the interoperability interface: /// dnnl::memory::get_ocl_mem_object() . For simplicity we can just construct a /// stream, extract the underlying OpenCL queue, and enqueue the kernel to /// this queue. /// @snippet gpu_opencl_interop.cpp oclexecution // [oclexecution] cl_mem ocl_buf = ocl_interop::get_mem_object(mem); OCL_CHECK(clSetKernelArg(ocl_init_kernel, 0, sizeof(ocl_buf), &ocl_buf)); cl_command_queue ocl_queue = ocl_interop::get_command_queue(strm); OCL_CHECK(clEnqueueNDRangeKernel(ocl_queue, ocl_init_kernel, 1, nullptr, &N, nullptr, 0, nullptr, nullptr)); // [oclexecution] /// @subsection gpu_opencl_interop_cpp_sub4 Create and execute a primitive /// There are three steps to create an operation primitive in oneDNN: /// 1. Create an operation descriptor. /// 2. Create a primitive descriptor. /// 3. Create a primitive. /// /// Let's create the primitive to perform the ReLU (rectified linear unit) /// operation: x = max(0, x). An operation descriptor has no dependency on a /// specific engine - it just describes some operation. On the contrary, /// primitive descriptors are attached to a specific engine and represent /// some implementation for this engine. A primitive object is a realization /// of a primitive descriptor, and its construction is usually much /// "heavier". /// @snippet gpu_opencl_interop.cpp relu creation // [relu creation] auto relu_pd = eltwise_forward::primitive_desc(eng, prop_kind::forward, algorithm::eltwise_relu, mem_d, mem_d, 0.0f); auto relu = eltwise_forward(relu_pd); // [relu creation] /// Next, execute the primitive. /// @snippet gpu_opencl_interop.cpp relu exec // [relu exec] relu.execute(strm, {{DNNL_ARG_SRC, mem}, {DNNL_ARG_DST, mem}}); strm.wait(); // [relu exec] /// ///@note /// Our primitive mem serves as both input and output parameter. /// /// ///@note /// Primitive submission on GPU is asynchronous; However, the user can /// call dnnl:stream::wait() to synchronize the stream and ensure that all /// previously submitted primitives are completed. /// /// @page gpu_opencl_interop_cpp /// @subsection gpu_opencl_interop_cpp_sub5 Validate the results /// /// Before running validation codes, we need to copy the OpenCL memory to /// the host. This can be done using OpenCL API. For convenience, we use a /// utility function read_from_dnnl_memory() implementing required OpenCL API /// calls. After we read the data to the host, we can run validation codes /// on the host accordingly. /// @snippet gpu_opencl_interop.cpp Check the results // [Check the results] std::vector mem_data(N); read_from_dnnl_memory(mem_data.data(), mem); for (size_t i = 0; i < N; i++) { float expected = (i % 2) ? 0.0f : (float)i; if (mem_data[i] != expected) { std::cout << "Expect " << expected << " but got " << mem_data[i] << "." << std::endl; throw std::logic_error("Accuracy check failed."); } } // [Check the results] OCL_CHECK(clReleaseKernel(ocl_init_kernel)); } int main(int argc, char **argv) { return handle_example_errors( {engine::kind::gpu}, gpu_opencl_interop_tutorial); } /// @page gpu_opencl_interop_cpp Getting started on GPU with OpenCL extensions API /// /// /// /// Upon compiling and running the example, the output should be just: /// /// ~~~ /// Example passed. /// ~~~ ///