// Copyright (c) 2012-2017 VideoStitch SAS // Copyright (c) 2018 stitchEm #include "gpu/coredepth/sweep.hpp" #include "backend/common/coredepth/sphereSweepParams.h" #include "../surface.hpp" #include "gpu/memcpy.hpp" #include "core/transformGeoParams.hpp" #include "libvideostitch/geometryDef.hpp" #include "libvideostitch/panoDef.hpp" #include "backend/cuda/deviceBuffer.hpp" #include "backend/cuda/deviceBuffer2D.hpp" #include "backend/cuda/surface.hpp" #include "backend/cuda/deviceStream.hpp" #include "cuda/util.hpp" #include "gpu/buffer.hpp" #include "kernels/sphereSweepKernel.cu" #include <math.h> static const int CudaBlockSize = 16; namespace VideoStitch { namespace GPU { static int numCall = 0; PotentialValue<struct InputParams6> prepareInputParams(const Core::PanoDefinition& panoDef, int time, float scale = 1.f) { struct InputParams6 inputParamsArray; for (videoreaderid_t videoInputID = 0; videoInputID < panoDef.numVideoInputs(); videoInputID++) { const Core::InputDefinition& input = panoDef.getVideoInput(videoInputID); const Core::GeometryDefinition geometry = input.getGeometries().at(time); Core::TransformGeoParams params(input, geometry, panoDef); if (geometry.hasDistortion()) { return PotentialValue<struct InputParams6>({Origin::Stitcher, ErrType::ImplementationError, "Sphere sweep does not handle distortion parameters in inputs"}); } float2 center, iscale; center.x = (float)input.getCenterX(geometry) / scale; center.y = (float)input.getCenterY(geometry) / scale; iscale.x = (float)geometry.getHorizontalFocal() / scale; iscale.y = (float)geometry.getVerticalFocal() / scale; InputParams& inputParams = inputParamsArray.params[videoInputID]; inputParams.distortion = params.getDistortion(); inputParams.transform = params.getPose(); inputParams.inverseTransform = params.getPoseInverse(); inputParams.scale = iscale; inputParams.centerShift = center; inputParams.texWidth = (int)(input.getWidth() / scale); inputParams.texHeight = (int)(input.getHeight() / scale); inputParams.cropLeft = (int)(input.getCropLeft() / scale); inputParams.cropRight = (int)(input.getCropRight() / scale); inputParams.cropTop = (int)(input.getCropTop() / scale); inputParams.cropBottom = (int)(input.getCropBottom() / scale); } return PotentialValue<struct InputParams6>(inputParamsArray); } static read_only image2d_t getSurfaceFromMap(const videoreaderid_t index, const std::map<videoreaderid_t, Core::SourceSurface*>& surfaces) { return (surfaces.find(index) != surfaces.end()) ? surfaces.find(index)->second->pimpl->surface->get().texture() : 0; } Status splatInputWithDepthIntoPano(const Core::PanoDefinition& panoDef, Core::PanoSurface& pano, const GPU::Surface& depthSurface, const std::map<videoreaderid_t, Core::SourceSurface*>& inputSurfaces, GPU::Stream stream) { // TODO int time = 0; const videoreaderid_t inputID = 0; Buffer<uint32_t> panoBuffer = pano.pimpl->buffer; float2 pscale; pscale.x = Core::TransformGeoParams::computePanoScale(Core::PanoProjection::Equirectangular, pano.getWidth(), 360.f); pscale.y = 2 * Core::TransformGeoParams::computePanoScale(Core::PanoProjection::Equirectangular, pano.getHeight(), 360.f); auto potInputParamsArray = prepareInputParams(panoDef, time); FAIL_RETURN(potInputParamsArray.status()); const InputParams& referenceInput = potInputParamsArray.value().params[inputID]; const float offset = cosf(numCall++ / 20.f * (float)M_PI / 2.f) * 0.2f; const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1); const dim3 dimGrid((unsigned)Cuda::ceilDiv(referenceInput.texWidth, dimBlock.x), (unsigned)Cuda::ceilDiv(referenceInput.texHeight, dimBlock.y), 1); Core::splatInputWithDepthIntoPano<<<dimGrid, dimBlock, 0, stream.get()>>>( panoBuffer.get(), (unsigned)pano.getWidth(), (unsigned)pano.getHeight(), pscale, getSurfaceFromMap(inputID, inputSurfaces), depthSurface.get().surface(), referenceInput, panoDef.numVideoInputs(), offset); Logger::get(Logger::Info) << "SphereSweep frame " << numCall << std::endl; return Status::OK(); } Status sphereSweepInput(videoreaderid_t sourceID, int frame, GPU::Surface& dst, const std::map<videoreaderid_t, Core::SourceSurface*>& inputSurfaces, const Core::PanoDefinition& panoDef, GPU::Stream& stream, const float scale) { // debug command line pipeline: just copy input surface to output surface // via temporary buffer as we don't have a surface->surface copy function // auto tmpBuf = GPU::uniqueBuffer<uint32_t>(inputDef.getWidth() * inputDef.getHeight(), "tmp bfu"); // GPU::memcpyAsync(tmpBuf.borrow(), *gpuSurf, stream); // GPU::memcpyAsync(dst, tmpBuf.borrow_const(), stream); // stream.synchronize() if (panoDef.numVideoInputs() > maxDepthInputs()) { return Status{Origin::Stitcher, ErrType::ImplementationError, "Sphere sweep only implemented up to 6 inputs (hardcoded)"}; } auto potInputParamsArray = prepareInputParams(panoDef, frame, scale); FAIL_RETURN(potInputParamsArray.status()); const struct InputParams6 inputParamsArray = potInputParamsArray.releaseValue(); const InputParams& referenceInput = inputParamsArray.params[sourceID]; // Running a kernel that takes > 1s destabilizes the system // (Display manager resets or kernel panic) // As the current version is not optimised and works at full resolution it can take several seconds to complete // --> Tile the work. Each tile should complete in less than 1 second. const int numBlocks = 16; // Make sure texture width is a multiple of numBlocks const int paddedTexWidth = (int)Cuda::ceilDiv(referenceInput.texWidth, numBlocks) * numBlocks; const int paddedTexHeight = (int)Cuda::ceilDiv(referenceInput.texHeight, numBlocks) * numBlocks; for (int cx = 0; cx < numBlocks; cx++) { for (int cy = 0; cy < numBlocks; cy++) { const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1); const dim3 dimGrid((unsigned)Cuda::ceilDiv(paddedTexWidth / numBlocks, dimBlock.x), (unsigned)Cuda::ceilDiv(paddedTexHeight / numBlocks, dimBlock.y), 1); Core::sphereSweepInputKernel<<<dimGrid, dimBlock, 0, stream.get()>>>( dst.get().surface(), (unsigned)dst.width(), (unsigned)dst.height(), nullptr, getSurfaceFromMap(0, inputSurfaces), getSurfaceFromMap(1, inputSurfaces), getSurfaceFromMap(2, inputSurfaces), getSurfaceFromMap(3, inputSurfaces), getSurfaceFromMap(4, inputSurfaces), getSurfaceFromMap(5, inputSurfaces), inputParamsArray, sourceID, panoDef.numVideoInputs(), cx, cy, paddedTexWidth / numBlocks, paddedTexHeight / numBlocks); // Force synchronization after tile computation for system stability stream.synchronize(); } } Logger::get(Logger::Info) << "SphereSweep frame " << frame << " input " << sourceID << std::endl; return Status::OK(); } Status sphereSweepInputSGM(videoreaderid_t sourceID, int frame, GPU::Surface& dst, GPU::HostBuffer<unsigned short>& hostCostVolume, const std::map<videoreaderid_t, Core::SourceSurface*>& inputSurfaces, const Core::PanoDefinition& panoDef, GPU::Stream& stream, const float scale) { // debug command line pipeline: just copy input surface to output surface // via temporary buffer as we don't have a surface->surface copy function // auto tmpBuf = GPU::uniqueBuffer<uint32_t>(inputDef.getWidth() * inputDef.getHeight(), "tmp bfu"); // GPU::memcpyAsync(tmpBuf.borrow(), *gpuSurf, stream); // GPU::memcpyAsync(dst, tmpBuf.borrow_const(), stream); // stream.synchronize() if (panoDef.numVideoInputs() > maxDepthInputs()) { return Status{Origin::Stitcher, ErrType::ImplementationError, "Sphere sweep only implemented up to 6 inputs (hardcoded)"}; } auto potInputParamsArray = prepareInputParams(panoDef, frame, scale); FAIL_RETURN(potInputParamsArray.status()); const struct InputParams6 inputParamsArray = potInputParamsArray.releaseValue(); const InputParams& referenceInput = inputParamsArray.params[sourceID]; GPU::UniqueBuffer<unsigned short> devCostVolume; PROPAGATE_FAILURE_STATUS( devCostVolume.alloc(referenceInput.texWidth * referenceInput.texHeight * numSphereSweeps(), "SGM score volume")); // Running a kernel that takes > 1s destabilizes the system // (Display manager resets or kernel panic) // As the current version is not optimised and works at full resolution it can take several seconds to complete // --> Tile the work. Each tile should complete in less than 1 second. const int numBlocks = 16; // Make sure texture width is a multiple of numBlocks const int paddedTexWidth = (int)Cuda::ceilDiv(referenceInput.texWidth, numBlocks) * numBlocks; const int paddedTexHeight = (int)Cuda::ceilDiv(referenceInput.texHeight, numBlocks) * numBlocks; for (int cx = 0; cx < numBlocks; cx++) { for (int cy = 0; cy < numBlocks; cy++) { const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1); const dim3 dimGrid((unsigned)Cuda::ceilDiv(paddedTexWidth / numBlocks, dimBlock.x), (unsigned)Cuda::ceilDiv(paddedTexHeight / numBlocks, dimBlock.y), 1); Core::sphereSweepInputKernel<<<dimGrid, dimBlock, 0, stream.get()>>>( dst.get().surface(), (unsigned)dst.width(), (unsigned)dst.height(), devCostVolume.borrow().devicePtr(), getSurfaceFromMap(0, inputSurfaces), getSurfaceFromMap(1, inputSurfaces), getSurfaceFromMap(2, inputSurfaces), getSurfaceFromMap(3, inputSurfaces), getSurfaceFromMap(4, inputSurfaces), getSurfaceFromMap(5, inputSurfaces), inputParamsArray, sourceID, panoDef.numVideoInputs(), cx, cy, paddedTexWidth / numBlocks, paddedTexHeight / numBlocks); // Force synchronization after tile computation for system stability FAIL_RETURN(stream.synchronize()); } } Logger::get(Logger::Info) << "SphereSweep frame " << frame << " input " << sourceID << std::endl; // copy scoreVolume to host FAIL_RETURN(GPU::memcpyAsync( hostCostVolume.hostPtr(), devCostVolume.borrow_const(), referenceInput.texWidth * referenceInput.texHeight * numSphereSweeps() * sizeof(unsigned short), stream)); stream.synchronize(); Logger::get(Logger::Info) << "SphereSweep score volume copied back to host" << std::endl; return Status::OK(); } Status sphereSweepInputDisparityToDepth(videoreaderid_t sourceID, int frame, GPU::Surface& dst, GPU::HostBuffer<short>& hostDisparity, bool useHostDisparity, const std::map<videoreaderid_t, Core::SourceSurface*>& surfaces, const Core::PanoDefinition& panoDef, GPU::Stream& stream, const float scale) { if (panoDef.numVideoInputs() > maxDepthInputs()) { return Status{Origin::Stitcher, ErrType::ImplementationError, "Sphere sweep only implemented up to 6 inputs (hardcoded)"}; } auto potInputParamsArray = prepareInputParams(panoDef, frame, scale); FAIL_RETURN(potInputParamsArray.status()); const struct InputParams6 inputParamsArray = potInputParamsArray.releaseValue(); const InputParams& referenceInput = inputParamsArray.params[sourceID]; // copy host disparity to GPU buffer PotentialValue<GPU::Buffer<short>> potDevBuf = GPU::Buffer<short>::allocate(referenceInput.texWidth * referenceInput.texHeight, "SGM output disparity"); FAIL_RETURN(potDevBuf.status()); GPU::Buffer<short> devDisparity(potDevBuf.releaseValue()); FAIL_RETURN(GPU::memcpyAsync(devDisparity, hostDisparity.hostPtr(), referenceInput.texWidth * referenceInput.texHeight * sizeof(short), stream)); stream.synchronize(); // Running a kernel that takes > 1s destabilizes the system // (Display manager resets or kernel panic) // As the current version is not optimised and works at full resolution it can take several seconds to complete // --> Tile the work. Each tile should complete in less than 1 second. const int numBlocks = 4; // Make sure texture width is a multiple of numBlocks const int paddedTexWidth = (int)Cuda::ceilDiv(referenceInput.texWidth, numBlocks) * numBlocks; const int paddedTexHeight = (int)Cuda::ceilDiv(referenceInput.texHeight, numBlocks) * numBlocks; for (int cx = 0; cx < numBlocks; cx++) { for (int cy = 0; cy < numBlocks; cy++) { const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1); const dim3 dimGrid((unsigned)Cuda::ceilDiv(paddedTexWidth / numBlocks, dimBlock.x), (unsigned)Cuda::ceilDiv(paddedTexHeight / numBlocks, dimBlock.y), 1); Core::sphereSweepInputDisparityToDepthKernel<<<dimGrid, dimBlock, 0, stream.get()>>>( dst.get().surface(), (unsigned)dst.width(), (unsigned)dst.height(), (useHostDisparity) ? devDisparity.devicePtr() : nullptr, getSurfaceFromMap(sourceID, surfaces), cx, cy, paddedTexWidth / numBlocks, paddedTexHeight / numBlocks); // Force synchronization after tile computation for system stability stream.synchronize(); } } Logger::get(Logger::Info) << "SphereSweep disparity to depth on input " << sourceID << std::endl; FAIL_RETURN(devDisparity.release()); return Status(); } Status sphereSweepInputStep(videoreaderid_t sourceID, int frame, GPU::Surface& dst, GPU::Surface& depthSrcNextLevel, const std::map<videoreaderid_t, Core::SourceSurface*>& inputSurfaces, const Core::PanoDefinition& panoDef, GPU::Stream& stream, const float scale) { // debug command line pipeline: just copy input surface to output surface // via temporary buffer as we don't have a surface->surface copy function // auto tmpBuf = GPU::uniqueBuffer<uint32_t>(inputDef.getWidth() * inputDef.getHeight(), "tmp bfu"); // GPU::memcpyAsync(tmpBuf.borrow(), *gpuSurf, stream); // GPU::memcpyAsync(dst, tmpBuf.borrow_const(), stream); // stream.synchronize() if (panoDef.numVideoInputs() > 6) { return Status{Origin::Stitcher, ErrType::ImplementationError, "Sphere sweep only implemented for 6 inputs maximum (hardcoded)"}; } auto potInputParamsArray = prepareInputParams(panoDef, frame, scale); FAIL_RETURN(potInputParamsArray.status()); const struct InputParams6 inputParamsArray = potInputParamsArray.releaseValue(); const InputParams& referenceInput = inputParamsArray.params[sourceID]; // Running a kernel that takes > 1s destabilizes the system // (Display manager resets or kernel panic) // As the current version is not optimised and works at full resolution it can take several seconds to complete // --> Tile the work. Each tile should complete in less than 1 second. const int numBlocks = 4; const int paddedTexWidth = (int)Cuda::ceilDiv(referenceInput.texWidth, numBlocks) * numBlocks; const int paddedTexHeight = (int)Cuda::ceilDiv(referenceInput.texHeight, numBlocks) * numBlocks; // search around best depth from lower level pyramid // search span is in log2, covers [log2(bestDepth) - searchSpan, log2(bestDepth) + searchSpan] const float searchSpan = scale / 8.f; // decrease searched depths on upper levels for (int cx = 0; cx < numBlocks; cx++) { for (int cy = 0; cy < numBlocks; cy++) { const dim3 dimBlock(CudaBlockSize, CudaBlockSize, 1); const dim3 dimGrid((unsigned)Cuda::ceilDiv(paddedTexWidth / numBlocks, dimBlock.x), (unsigned)Cuda::ceilDiv(paddedTexHeight / numBlocks, dimBlock.y), 1); Core::sphereSweepInputKernelStep<<<dimGrid, dimBlock, 0, stream.get()>>>( dst.get().surface(), (unsigned)dst.width(), (unsigned)dst.height(), depthSrcNextLevel.get().surface(), (unsigned)depthSrcNextLevel.width(), (unsigned)depthSrcNextLevel.height(), getSurfaceFromMap(0, inputSurfaces), getSurfaceFromMap(1, inputSurfaces), getSurfaceFromMap(2, inputSurfaces), getSurfaceFromMap(3, inputSurfaces), getSurfaceFromMap(4, inputSurfaces), getSurfaceFromMap(5, inputSurfaces), inputParamsArray, sourceID, panoDef.numVideoInputs(), cx, cy, paddedTexWidth / numBlocks, paddedTexHeight / numBlocks, searchSpan); // Force synchronization after tile computation for system stability stream.synchronize(); } } Logger::get(Logger::Info) << "SphereSweep step frame " << frame << " input " << sourceID << " search span: " << searchSpan << std::endl; return Status::OK(); } int numSphereSweeps() { return numSphereScales; } int maxDepthInputs() { return NUM_INPUTS; } } // namespace GPU } // namespace VideoStitch