// Copyright (c) 2012-2017 VideoStitch SAS // Copyright (c) 2018 stitchEm #pragma once #include "context.hpp" #include "cl_error.hpp" #include "deviceBuffer.hpp" #include "deviceStream.hpp" #include "surface.hpp" #include <gpu/buffer.hpp> #include <algorithm> inline VideoStitch::Status _setKernelParameters(cl_kernel, int) { return VideoStitch::Status::OK(); } // do nothing, terminating function template <typename T, typename... Args> inline VideoStitch::Status _setKernelParameters(cl_kernel kernel, int i, const VideoStitch::GPU::Buffer<T>& firstParameter, const Args&... restOfParameters) { const cl_mem rawPtr = firstParameter.get(); return _setKernelParameters(kernel, i, rawPtr, restOfParameters...); } template <typename... Args> inline VideoStitch::Status _setKernelParameters(cl_kernel kernel, int i, const VideoStitch::GPU::DeviceSurface& firstParameter, const Args&... restOfParameters) { const cl_mem rawPtr = firstParameter; return _setKernelParameters(kernel, i, rawPtr, restOfParameters...); } template <typename T, typename... Args> inline VideoStitch::Status _setKernelParameters(cl_kernel kernel, int i, const T& firstParameter, const Args&... restOfParameters) { static_assert(!std::is_same<T, VideoStitch::GPU::Stream>::value, "A GPU::Stream should never be passed as an OpenCL kernel argument"); PROPAGATE_CL_ERR(clSetKernelArg(kernel, i, sizeof(T), &firstParameter)); return _setKernelParameters(kernel, i + 1, restOfParameters...); } template <typename... Args> inline VideoStitch::Status setKernelParameters(cl_kernel kernel, const Args&... args) { return _setKernelParameters(kernel, 0, args...); // first number of parameter is 0 } namespace VideoStitch { namespace GPU { template <unsigned WorkDimension> class KernelExecution { public: template <typename... Args> Status enqueueWithKernelArgs(const Args&... args) { // lazy status evaluation, so the caller has to do less checking FAIL_RETURN(potKernel.getStatus()); if (global[0] == 0) { return {Origin::GPU, ErrType::ImplementationError, "Trying to run kernel with work dimension x of 0"}; } cl_kernel kernel = potKernel.getKernel(); auto enqueueFunction = [&]() -> Status { FAIL_RETURN(setKernelParameters(kernel, args...)); bool letDriverChooseLocalWorkSize = std::none_of(local.begin(), local.end(), [](size_t val) { return val > 0; }); if (letDriverChooseLocalWorkSize) { return CL_ERROR(clEnqueueNDRangeKernel(stream.get(), kernel, WorkDimension, nullptr, global.data(), nullptr, 0, nullptr, nullptr)); } else { return CL_ERROR(clEnqueueNDRangeKernel(stream.get(), kernel, WorkDimension, nullptr, global.data(), local.data(), 0, nullptr, nullptr)); } }; potKernel.lock(); // OpenCL API: // "The behavior of the cl_kernel object is undefined if clSetKernelArg is called // from multiple host threads on the same cl_kernel object at the same time." Status enqueueStatus = enqueueFunction(); potKernel.unlock(); return enqueueStatus; } private: friend class Kernel; KernelExecution(CLKernel& kernel, GPU::Stream stream, std::array<size_t, WorkDimension> global, std::array<size_t, WorkDimension> local) : potKernel(kernel), stream(stream), local(local), global(global) {} CLKernel& potKernel; GPU::Stream stream; std::array<size_t, WorkDimension> local; std::array<size_t, WorkDimension> global; }; class Kernel { public: // Request a kernel from the OpenCL context. // Potentially invalid: late checking usually at .enqueueWithArgs() // Use getInitStatus() to check for success immediately static Kernel get(std::string programName, std::string kernelName); // Does the OpenCL program exist, was the kernel found in the program // Did it compile succesfully? Status getInitStatus() const { return kernel.getStatus(); } // Prepare a 1D kernel to run on stream, with global work group size totalSize // Let the compiler choose the local work group size KernelExecution<1> setup1D(GPU::Stream stream, unsigned totalSize) const; // Prepare a 1D kernel to run on stream, with global work group size totalSize // Enforce a local work group size KernelExecution<1> setup1D(GPU::Stream stream, unsigned totalSize, unsigned blockSize) const; // Prepare a 2D kernel to run on stream, with global work group size totalWidth/Height // Let the compiler choose the local work group size KernelExecution<2> setup2D(GPU::Stream stream, unsigned totalWidth, unsigned totalHeight) const; // Prepare a 2D kernel to run on stream, with global work group size totalWidth/Height // Enforce a local work group size KernelExecution<2> setup2D(GPU::Stream stream, unsigned totalWidth, unsigned totalHeight, unsigned blockSizeX, unsigned blockSizeY) const; // Prepare a 2D kernel to run on stream, with global work group size totalWidth/Height // Local work group size identical in both dimensions KernelExecution<2> setup2D(GPU::Stream stream, unsigned totalWidth, unsigned totalHeight, unsigned blockSize) const { return setup2D(stream, totalWidth, totalHeight, blockSize, blockSize); } private: explicit Kernel(CLKernel& kernel) : kernel(kernel) {} CLKernel& kernel; }; } // namespace GPU } // namespace VideoStitch