// Copyright (c) 2012-2017 VideoStitch SAS // Copyright (c) 2018 stitchEm #include "mask/mergerMask.hpp" #include "backend/common/imageOps.hpp" #include "backend/cuda/deviceBuffer.hpp" #include "backend/cuda/deviceStream.hpp" #include "backend/cuda/surface.hpp" #include "cuda/util.hpp" #include "gpu/core1/voronoi.hpp" #include "gpu/memcpy.hpp" #include "mask/mergerMaskConstant.hpp" namespace VideoStitch { namespace MergerMask { #define MERGER_MASK_KERNEL_SIZE_X 16 #define MERGER_MASK_KERNEL_SIZE_Y 16 __global__ void updateInputIndexByDistortionMapKernel( const videoreaderid_t camId, const unsigned char distortionThreshold, const int2 camSize, const int2 camOffset, const unsigned char* __restrict__ camDistortionBuffer, const int2 inputSize, const uint32_t* __restrict__ inputNonOverlappingIndexBuffer, const unsigned char* __restrict__ inputDistortionBuffer, uint32_t* __restrict__ nextNonOverlappingIndexBuffer, unsigned char* __restrict__ nextDistortionBuffer) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; if (x < inputSize.x && y < inputSize.y) { const int inputIndex = y * inputSize.x + x; const unsigned char inputDistortion = inputDistortionBuffer[inputIndex]; const uint32_t inputNonOverlappingIndex = inputNonOverlappingIndexBuffer[inputIndex]; const int camX = (x - camOffset.x + inputSize.x) % inputSize.x; const int camY = y - camOffset.y; nextNonOverlappingIndexBuffer[inputIndex] = inputNonOverlappingIndex; nextDistortionBuffer[inputIndex] = inputDistortion; if (camX >= 0 && camX < camSize.x && camY >= 0 && camY < camSize.y) { const int camIndex = camY * camSize.x + camX; const unsigned char camDistortion = camDistortionBuffer[camIndex]; if ((camDistortion < inputDistortion && inputDistortion > distortionThreshold) || (inputNonOverlappingIndex == 0 && camDistortion < 255)) { nextNonOverlappingIndexBuffer[inputIndex] = 1 << camId; nextDistortionBuffer[inputIndex] = camDistortion; } } } } Status MergerMask::updateInputIndexByDistortionMap(const videoreaderid_t camId, const int2 inputSize, const GPU::Buffer inputNonOverlappingIndexBuffer, const GPU::Buffer inputDistortionBuffer, GPU::Buffer nextNonOverlappingIndexBuffer, GPU::Buffer nextDistortionBuffer, GPU::Stream stream, const bool original) { const int2 camSize = original ? make_int2((int)cachedOriginalMappedRects[camId].getWidth(), (int)cachedOriginalMappedRects[camId].getHeight()) : make_int2((int)cachedMappedRects[camId].getWidth(), (int)cachedMappedRects[camId].getHeight()); const int2 camOffset = original ? make_int2((int)cachedOriginalMappedRects[camId].left(), (int)cachedOriginalMappedRects[camId].top()) : make_int2((int)cachedMappedRects[camId].left(), (int)cachedMappedRects[camId].top()); const unsigned char distortionThreshold = mergerMaskConfig.getDistortionThreshold(); dim3 dimBlock(MERGER_MASK_KERNEL_SIZE_X, MERGER_MASK_KERNEL_SIZE_Y, 1); dim3 dimGrid((unsigned)Cuda::ceilDiv(inputSize.x, dimBlock.x), (unsigned)Cuda::ceilDiv(inputSize.y, dimBlock.y), 1); updateInputIndexByDistortionMapKernel<<>>( camId, distortionThreshold, camSize, camOffset, original ? originalDistortionMaps[camId].get() : distortionMaps[camId].get(), inputSize, inputNonOverlappingIndexBuffer.get(), inputDistortionBuffer.get(), nextNonOverlappingIndexBuffer.get(), nextDistortionBuffer.get()); return CUDA_STATUS; } __global__ void updateDistortionFromMaskKernel(videoreaderid_t camId, const int2 camSize, const int2 camOffset, unsigned char* __restrict__ camDistortionBuffer, const int2 inputSize, const uint32_t* __restrict__ srcMapBuffer) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; if (x < camSize.x && y < camSize.y) { int inputX = (x + camOffset.x) % inputSize.x; int inputY = (y + camOffset.y); if (inputX >= 0 && inputX < inputSize.x && inputY >= 0 && inputY < inputSize.y) { for (int i = -2; i <= 2; i++) { for (int j = -2; j <= 2; j++) { const int neiX = inputX + i; const int neiY = inputY + j; if (neiX >= 0 && neiX < inputSize.x && neiY >= 0 && neiY < inputSize.y) { if ((srcMapBuffer[neiY * inputSize.x + neiX] & (1 << camId)) == 0) { camDistortionBuffer[y * camSize.x + x] = 255; return; } } } } } } } Status MergerMask::updateDistortionFromMask(const videoreaderid_t camId, const int2 distortionBufferSize, const int2 distortionBufferOffset, GPU::Buffer distortionBuffer, const int2 inputSize, const GPU::Buffer srcMap, GPU::Stream stream) { dim3 dimBlock(MERGER_MASK_KERNEL_SIZE_X, MERGER_MASK_KERNEL_SIZE_Y, 1); dim3 dimGrid((unsigned)Cuda::ceilDiv(distortionBufferSize.x, dimBlock.x), (unsigned)Cuda::ceilDiv(distortionBufferSize.y, dimBlock.y), 1); updateDistortionFromMaskKernel<<>>( camId, distortionBufferSize, distortionBufferOffset, distortionBuffer.get(), inputSize, srcMap.get()); return CUDA_STATUS; } __global__ void initializeMasksKernel(videoreaderid_t camId, const int2 camSize, const int2 camOffset, const unsigned char* __restrict__ camDistortionBuffer, const int2 inputSize, uint32_t* __restrict__ inputNonOverlappingIndexBuffer, unsigned char* inputDistortionBuffer) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; if (x < inputSize.x && y < inputSize.y) { const int inputIndex = y * inputSize.x + x; const int camX = (x - camOffset.x + inputSize.x) % inputSize.x; const int camY = y - camOffset.y; inputNonOverlappingIndexBuffer[inputIndex] = 0; inputDistortionBuffer[inputIndex] = 255; if (camX >= 0 && camX < camSize.x && camY >= 0 && camY < camSize.y) { unsigned char camDistortion = camDistortionBuffer[camY * camSize.x + camX]; if (camDistortion < 255) { inputDistortionBuffer[inputIndex] = camDistortion; inputNonOverlappingIndexBuffer[inputIndex] = 1 << camId; } } } } Status MergerMask::initializeMasks(const int2 inputSize, const videoreaderid_t camId, GPU::Buffer inputNonOverlappingIndexBuffer, GPU::Buffer inputDistortionBuffer, GPU::Stream stream, const bool original) { dim3 dimBlock(MERGER_MASK_KERNEL_SIZE_X, MERGER_MASK_KERNEL_SIZE_Y, 1); const Core::Rect camRect = original ? cachedOriginalMappedRects[camId] : cachedMappedRects[camId]; dim3 dimGrid((unsigned)Cuda::ceilDiv(inputSize.x, dimBlock.x), (unsigned)Cuda::ceilDiv(inputSize.y, dimBlock.y), 1); initializeMasksKernel<<>>( camId, make_int2((int)camRect.getWidth(), (int)camRect.getHeight()), make_int2((int)camRect.left(), (int)camRect.top()), original ? originalDistortionMaps[camId].get() : distortionMaps[camId].get(), inputSize, inputNonOverlappingIndexBuffer.get(), inputDistortionBuffer.get()); return CUDA_STATUS; } __global__ void transformDistortionKernel(const int2 inputSize, const float distortionParam, unsigned char* __restrict__ distortionBuffer) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; if (x < inputSize.x && y < inputSize.y) { const unsigned index = y * inputSize.x + x; const unsigned char inputDistortion = distortionBuffer[index]; unsigned char remappedDistortion = (unsigned char)(pow(float(inputDistortion) / 255.0f, distortionParam) * 255.0f); distortionBuffer[index] = remappedDistortion; } } Status MergerMask::transformDistortion(const int2 inputSize, GPU::Buffer distortionBuffer, GPU::Stream stream) { dim3 dimBlock(MERGER_MASK_KERNEL_SIZE_X, MERGER_MASK_KERNEL_SIZE_Y, 1); dim3 dimGrid((unsigned)Cuda::ceilDiv(inputSize.x, dimBlock.x), (unsigned)Cuda::ceilDiv(inputSize.y, dimBlock.y), 1); const float distortionParam = mergerMaskConfig.getDistortionParam(); transformDistortionKernel<<>>(inputSize, distortionParam, distortionBuffer.get()); return CUDA_STATUS; } __global__ void updateIndexMaskKernel(const videoreaderid_t camId, const int maxOverlappingCount, const char* const __restrict__ cameraIndices, const int2 distortionBufferSize, const int2 distortionBufferOffset, const unsigned char* const __restrict__ distortionBuffer, const int2 size, uint32_t* __restrict__ inputIndexBuffer, unsigned char* __restrict__ mask, const uint32_t* const __restrict__ srcMap) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; if (x < size.x && y < size.y) { const unsigned index = y * size.x + x; uint32_t inputIndex = inputIndexBuffer[index]; if ((mask[index] < 255) && (srcMap[index] & (1 << camId))) { int coordX = (x - distortionBufferOffset.x + size.x) % size.x; int coordY = y - distortionBufferOffset.y; // Make sure to only put pixels those are not very distorted if (coordX >= 0 && coordX < distortionBufferSize.x && coordY >= 0 && coordY < distortionBufferSize.y) { unsigned char distortion = distortionBuffer[coordY * distortionBufferSize.x + coordX]; // if this pixel is already occupied and the distortion is large, just ignore it if (inputIndex > 0 && distortion > 130) { mask[index] = 0; return; } } int countBitOne = 0; int count = 0; int minCount = -1; while (inputIndex > 0) { if (inputIndex & 1) { countBitOne++; if (minCount < 0) { minCount = count; } else if (cameraIndices[count] < cameraIndices[minCount]) { minCount = count; } } inputIndex = inputIndex >> 1; count++; } if (countBitOne < maxOverlappingCount) { inputIndexBuffer[index] |= (1 << camId); } else if (countBitOne == maxOverlappingCount) { inputIndexBuffer[index] = (inputIndexBuffer[index] - (1 << minCount)) | (1 << camId); } mask[index] = 255; } else { mask[index] = 0; } } } Status MergerMask::updateIndexMask(const videoreaderid_t camId, const int maxOverlappingCount, const GPU::Buffer cameraIndices, const int2 distortionBufferSize, const int2 distortionBufferOffset, const GPU::Buffer distortionBuffer, const int2 inputSize, GPU::Buffer inputIndexBuffer, GPU::Buffer mask, const GPU::Buffer srcMap, GPU::Stream stream) { dim3 dimBlock(MERGER_MASK_KERNEL_SIZE_X, MERGER_MASK_KERNEL_SIZE_Y, 1); dim3 dimGrid((unsigned)Cuda::ceilDiv(inputSize.x, dimBlock.x), (unsigned)Cuda::ceilDiv(inputSize.y, dimBlock.y), 1); updateIndexMaskKernel<<>>( camId, maxOverlappingCount, cameraIndices.get(), distortionBufferSize, distortionBufferOffset, distortionBuffer.get(), inputSize, inputIndexBuffer.get(), mask.get(), srcMap.get()); return CUDA_STATUS; } // Get the first 1 bit (from right to left), set it to 0 // For example input number = 1000100 --> return 2 and set number = 1000 // @NOTE: Faster implementation can be found at : https://graphics.stanford.edu/~seander/bithacks.html __device__ int getFirstOnBitPosition(uint32_t& number) { int x = -1; int count = 0; while ((number > 0) && ((number & 1) == 0)) { count++; number = number >> 1; } if ((number & 1) > 0) { x = count; number = number >> 1; } return x; } // Get index of the first two bit with value 1 __device__ int2 getFirstTwoOnBitPosition(const uint32_t input) { uint32_t number = input; int32_t x = getFirstOnBitPosition(number); int32_t y = -1; if (x >= 0) { int32_t offsetY = getFirstOnBitPosition(number); if (offsetY >= 0) { y = x + offsetY + 1; } } return make_int2(x, y); } __global__ void updateStitchingCostKernel(const size_t camCount, const int2 size, const int kernelSize, const uint32_t* __restrict__ inputIndexBuffer, const uint32_t* __restrict__ mappedOffset, const int2* __restrict__ mappedRectOffset, const int2* __restrict__ mappedRectSize, const uint32_t* __restrict__ mappedBuffer, float* __restrict__ cost, uint32_t* __restrict__ debugBuffer0, uint32_t* __restrict__ debugBuffer1) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; if (x < size.x && y < size.y) { const unsigned index = y * size.x + x; debugBuffer0[index] = 0; debugBuffer1[index] = 0; const uint32_t inputIndex = inputIndexBuffer[index]; const int2 firstTwo = getFirstTwoOnBitPosition(inputIndex); if (firstTwo.x >= 0 && firstTwo.y >= 0) { const int input0 = firstTwo.x; const int input1 = firstTwo.y; debugBuffer1[index] = 1 << input0 + 1 << input1; int x0 = (x - mappedRectOffset[input0].x + size.x) % size.x; int y0 = y - mappedRectOffset[input0].y; const unsigned index0 = y0 * mappedRectSize[input0].x + x0; const uint32_t color0 = mappedBuffer[mappedOffset[input0] + index0]; debugBuffer0[index] = color0; int x1 = x - mappedRectOffset[input1].x; int y1 = y - mappedRectOffset[input1].y; const unsigned index1 = y1 * mappedRectSize[input1].x + x1; const uint32_t color1 = mappedBuffer[mappedOffset[input1] + index1]; debugBuffer1[index] = color1; // Update stitching cost using min pooling metric const int left = max(x1 - kernelSize, 0); const int right = min(x1 + kernelSize, int(mappedRectSize[input1].x - 1)); const int top = max(y1 - kernelSize, 0); const int bottom = min(y1 + kernelSize, int(mappedRectSize[input1].y - 1)); float sadMin = -1; for (int i = left; i <= right; i++) { for (int j = top; j <= bottom; j++) { const unsigned warpI = (i + size.x) % size.x; const unsigned index1 = j * mappedRectSize[input1].x + warpI; const uint32_t color1 = mappedBuffer[mappedOffset[input1] + index1]; if (color1 != INVALID_VALUE) { const float sadLab = abs((float(Image::RGBA::r(color0)) - Image::RGBA::r(color1)) / 255.0) + abs((float(Image::RGBA::g(color0)) - Image::RGBA::g(color1)) / 255.0) + abs((float(Image::RGBA::b(color0)) - Image::RGBA::b(color1)) / 255.0); const float sadGradient = abs((float(Image::RGBA::a(color0)) - Image::RGBA::a(color1)) / 255.0); const float sad = (sadLab + 2.0f * sadGradient) / (1.0f + 2.0f); if (sad < sadMin || sadMin < 0) { sadMin = sad; } } } } if (sadMin >= 0) { // Prefer to focus all effort in the middle of the output panorama, give these pixels more weights float yDistance = min(1.0f, (float)(abs((size.y / 2) - y)) / (size.y / 2)); float yCost = max(0.0f, expf(yDistance * yDistance * (-0.5f))); cost[index] += max(sadMin * yCost, 0.001); } } } } Status MergerMask::updateStitchingCost(const int2 inputSize, const int kernelSize, const GPU::Buffer inputIndexBuffer, const GPU::Buffer mappedOffset, const GPU::Buffer mappedRectOffset, const GPU::Buffer mappedRectSize, const GPU::Buffer mappedBuffer, GPU::Buffer cost, GPU::Buffer debugBuffer0, GPU::Buffer debugBuffer1, GPU::Stream stream) { dim3 dimBlock(MERGER_MASK_KERNEL_SIZE_X, MERGER_MASK_KERNEL_SIZE_Y, 1); dim3 dimGrid((unsigned)Cuda::ceilDiv(inputSize.x, dimBlock.x), (unsigned)Cuda::ceilDiv(inputSize.y, dimBlock.y), 1); updateStitchingCostKernel<<>>( pano.numInputs(), inputSize, kernelSize, inputIndexBuffer.get(), mappedOffset.get(), mappedRectOffset.get(), mappedRectSize.get(), mappedBuffer.get(), cost.get(), debugBuffer0.get(), debugBuffer1.get()); return CUDA_STATUS; } __global__ void extractLayerFromIndexBufferKernel(const videoreaderid_t id, int2 bufferSize, const uint32_t* const __restrict__ inputBuffer, uint32_t* __restrict__ extractedBuffer) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < bufferSize.x && y < bufferSize.y) { const unsigned index = y * bufferSize.x + x; const uint32_t input = inputBuffer[index]; if ((input & id) > 0) { extractedBuffer[index] = id; } else { extractedBuffer[index] = 0; } } } Status MergerMask::extractLayerFromIndexBuffer(const videoreaderid_t id, const int2 bufferSize, const GPU::Buffer inputIndexBuffer, GPU::Buffer extractedBuffer, GPU::Stream stream) { dim3 dimBlock(MERGER_MASK_KERNEL_SIZE_X, MERGER_MASK_KERNEL_SIZE_Y, 1); dim3 dimGrid((unsigned)Cuda::ceilDiv(bufferSize.x, dimBlock.x), (unsigned)Cuda::ceilDiv(bufferSize.y, dimBlock.y), 1); extractLayerFromIndexBufferKernel<<>>(id, bufferSize, inputIndexBuffer.get(), extractedBuffer.get()); return CUDA_STATUS; } __global__ void updateIndexMaskAfterSeamKernel(const videoreaderid_t id0s, const videoreaderid_t id1, int2 bufferSize, const unsigned char* const __restrict__ seamBuffer, uint32_t* __restrict__ indexBuffer) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < bufferSize.x && y < bufferSize.y) { const unsigned index = y * bufferSize.x + x; uint32_t input = indexBuffer[index]; const uint32_t seam = seamBuffer[index]; if (seam == (1 << 0)) { if ((input & id1) == id1) { input -= id1; } } else if (seam == (1 << 1)) { if ((input & id0s) > 0) { input = (input & (~id0s)); } } indexBuffer[index] = input; } } Status MergerMask::updateIndexMaskAfterSeam(const videoreaderid_t id0s, const videoreaderid_t id1, const int2 bufferSize, const GPU::Buffer seamBuffer, GPU::Buffer indexBuffer, GPU::Stream stream) { dim3 dimBlock(MERGER_MASK_KERNEL_SIZE_X, MERGER_MASK_KERNEL_SIZE_Y, 1); dim3 dimGrid((unsigned)Cuda::ceilDiv(bufferSize.x, dimBlock.x), (unsigned)Cuda::ceilDiv(bufferSize.y, dimBlock.y), 1); updateIndexMaskAfterSeamKernel<<>>(id0s, id1, bufferSize, seamBuffer.get(), indexBuffer.get()); return CUDA_STATUS; } __global__ void lookupColorBufferFromInputIndexKernel( const int wrapWidth, const int camCount, const unsigned char* const __restrict__ cameraIndices, const int2* __restrict__ const mappedRectOffsets, const int2* __restrict__ const mappedRectSizes, const uint32_t* __restrict__ const mappedOffsets, const uint32_t* __restrict__ const mappedBuffer, const int2 bufferSize, const uint32_t* const __restrict__ inputIndexBuffer, uint32_t* __restrict__ outputBuffer) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < bufferSize.x && y < bufferSize.y) { const unsigned index = y * bufferSize.x + x; outputBuffer[index] = INVALID_VALUE; // If the signal is on const uint32_t inputIndex = inputIndexBuffer[index]; for (int i = camCount - 1; i >= 0; i--) if ((inputIndex & (1 << cameraIndices[i])) > 0) { unsigned char camIndex = cameraIndices[i]; uint32_t camOffset = mappedOffsets[camIndex]; int2 camRectOffset = mappedRectOffsets[camIndex]; int2 camRectSize = mappedRectSizes[camIndex]; int32_t camX = (x - camRectOffset.x + wrapWidth) % wrapWidth; int32_t camY = y - camRectOffset.y; if (camX >= 0 && camX < camRectSize.x && camY >= 0 && camY < camRectSize.y) { int32_t camIndex = camY * camRectSize.x + camX; outputBuffer[index] = mappedBuffer[camOffset + camIndex]; } break; } } } Status MergerMask::lookupColorBufferFromInputIndex( const int wrapWidth, const GPU::Buffer camBuffer, const GPU::Buffer mappedRectOffsets, const GPU::Buffer mappedRectSizes, const GPU::Buffer mappedOffsets, const GPU::Buffer mappedBuffers, const int2 bufferSize, const GPU::Buffer inputIndexBuffer, GPU::Buffer outputBuffer, GPU::Stream stream) { dim3 dimBlock(MERGER_MASK_KERNEL_SIZE_X, MERGER_MASK_KERNEL_SIZE_Y, 1); dim3 dimGrid((unsigned)Cuda::ceilDiv(bufferSize.x, dimBlock.x), (unsigned)Cuda::ceilDiv(bufferSize.y, dimBlock.y), 1); lookupColorBufferFromInputIndexKernel<<>>( wrapWidth, (int)camBuffer.numElements(), camBuffer.get(), mappedRectOffsets.get(), mappedRectSizes.get(), mappedOffsets.get(), mappedBuffers.get(), bufferSize, inputIndexBuffer.get(), outputBuffer.get()); return CUDA_STATUS; } __global__ void updateSeamMaskKernel(const videoreaderid_t id, const int2 size, const uint32_t* __restrict__ const originalInputIndexBuffer, const unsigned char* const __restrict__ distanceBuffer, uint32_t* __restrict__ seamOuputIndexBuffer) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < size.x && y < size.y) { const unsigned index = y * size.x + x; if ((originalInputIndexBuffer[index] & (1 << id)) > 0) { if (distanceBuffer[index] < 255) { seamOuputIndexBuffer[index] |= (1 << id); } } } } Status MergerMask::updateSeamMask(const videoreaderid_t id, const int2 size, const GPU::Buffer originalInputIndexBuffer, const GPU::Buffer distanceBuffer, GPU::Buffer seamOuputIndexBuffer, GPU::Stream stream) { dim3 dimBlock(MERGER_MASK_KERNEL_SIZE_X, MERGER_MASK_KERNEL_SIZE_Y, 1); dim3 dimGrid((unsigned)Cuda::ceilDiv(size.x, dimBlock.x), (unsigned)Cuda::ceilDiv(size.y, dimBlock.y), 1); updateSeamMaskKernel<<>>(id, size, originalInputIndexBuffer.get(), distanceBuffer.get(), seamOuputIndexBuffer.get()); return CUDA_STATUS; } __global__ void getInputMaskFromOutputIndicesKernel(const videoreaderid_t imId, const int scaleFactor, const int2 outputSize, const uint32_t* __restrict__ const maskBuffer, const int2 inputSize, const float2* __restrict__ const inputCoordBuffer, unsigned char* const __restrict__ inputMask) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < scaleFactor * inputSize.x && y < scaleFactor * inputSize.y) { const unsigned index = y * inputSize.x * scaleFactor + x; const float2 coord = inputCoordBuffer[index]; if (coord.x < 0 || coord.y < 0) { return; } const int2 roundedCoord = make_int2(roundf(coord.x), roundf(coord.y)); inputMask[index] = 0; if (roundedCoord.x >= 0 && roundedCoord.x < outputSize.x && roundedCoord.y >= 0 && roundedCoord.y < outputSize.y) { if ((maskBuffer[roundedCoord.y * outputSize.x + (roundedCoord.x % outputSize.x)] & (1 << imId)) > 0) { inputMask[index] = 255; } } } } Status MergerMask::getInputMaskFromOutputIndices(const videoreaderid_t imId, const int scaleFactor, const int2 outputSize, const GPU::Buffer maskBuffer, const int2 inputSize, const GPU::Buffer inputCoordBuffer, GPU::Buffer inputMask, GPU::Stream stream) { dim3 dimBlock(MERGER_MASK_KERNEL_SIZE_X, MERGER_MASK_KERNEL_SIZE_Y, 1); dim3 dimGrid((unsigned)Cuda::ceilDiv(scaleFactor * inputSize.x, dimBlock.x), (unsigned)Cuda::ceilDiv(scaleFactor * inputSize.y, dimBlock.y), 1); FAIL_RETURN( GPU::memsetToZeroBlocking(inputMask, inputSize.x * inputSize.y * scaleFactor * scaleFactor)); getInputMaskFromOutputIndicesKernel<<>>( imId, scaleFactor, outputSize, maskBuffer.get(), inputSize, inputCoordBuffer.get(), inputMask.get()); return CUDA_STATUS; } __global__ void getOutputIndicesFromInputMaskKernel(const videoreaderid_t imId, const int scaleFactor, const int2 inputSize, const unsigned char* const __restrict__ inputMask, const int2 outputSize, cudaTextureObject_t coordBuffer, uint32_t* __restrict__ const maskBuffer) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < outputSize.x && y < outputSize.y) { const unsigned index = y * outputSize.x + x; const float2 coord = tex2D(coordBuffer, x, y); const int2 roundedCoord = make_int2(roundf(coord.x * scaleFactor), roundf(coord.y * scaleFactor)); if (roundedCoord.x >= 0 && roundedCoord.x < scaleFactor * inputSize.x && roundedCoord.y >= 0 && roundedCoord.y < scaleFactor * inputSize.y) { if (inputMask[roundedCoord.y * (scaleFactor * inputSize.x) + roundedCoord.x] > 0) { maskBuffer[index] |= (1 << imId); } } } } Status MergerMask::getOutputIndicesFromInputMask(const videoreaderid_t imId, const int scaleFactor, const int2 inputSize, const GPU::Buffer inputMask, const int2 outputSize, const GPU::Surface& coordBuffer, GPU::Buffer maskBuffer, GPU::Stream stream) { dim3 dimBlock(MERGER_MASK_KERNEL_SIZE_X, MERGER_MASK_KERNEL_SIZE_Y, 1); dim3 dimGrid((unsigned)Cuda::ceilDiv(outputSize.x, dimBlock.x), (unsigned)Cuda::ceilDiv(outputSize.y, dimBlock.y), 1); getOutputIndicesFromInputMaskKernel<<>>( imId, scaleFactor, inputSize, inputMask.get(), outputSize, coordBuffer.get().texture(), maskBuffer.get()); return CUDA_STATUS; } } // namespace MergerMask } // namespace VideoStitch