1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm
#include "gpu/core1/boundsKernel.hpp"
#include "deviceBuffer.hpp"
#include "deviceStream.hpp"
#include <cuda/error.hpp>
#include <cuda/util.hpp>
#define REDUCE_THREADS_PER_BLOCK 512
namespace VideoStitch {
namespace Core {
namespace {
/**
* This kernel computes the OR of all pixels in each row, and pouts the result in
* colHasImage
* FIXME do it with parallel reduction
*/
__global__ void vertOrKernel(const uint32_t* __restrict__ contrib, uint32_t* __restrict__ colHasImage,
unsigned panoWidth, unsigned panoHeight) {
unsigned col = blockIdx.x * blockDim.x + threadIdx.x;
if (col < panoWidth) {
uint32_t accum = 0;
for (unsigned row = 0; row < panoHeight; ++row) {
accum |= contrib[panoWidth * row + col];
}
colHasImage[col] = accum;
}
}
__global__ void horizOrKernel(const uint32_t* __restrict__ contrib, uint32_t* __restrict__ rowHasImage,
unsigned panoWidth, unsigned panoHeight) {
unsigned row = blockIdx.x * blockDim.x + threadIdx.x;
const uint32_t* rowp = contrib + panoWidth * row;
if (row < panoHeight) {
uint32_t accum = 0;
for (unsigned col = 0; col < panoWidth; ++col) {
accum |= rowp[col];
}
rowHasImage[row] = accum;
}
}
} // namespace
Status vertOr(std::size_t croppedWidth, std::size_t croppedHeight, GPU::Buffer<const uint32_t> contrib,
GPU::Buffer<uint32_t> colHasImage, GPU::Stream stream) {
dim3 dimBlock(REDUCE_THREADS_PER_BLOCK, 1, 1);
const unsigned numBlocks = (unsigned)Cuda::ceilDiv(croppedWidth, dimBlock.x);
dim3 dimGrid(numBlocks, 1, 1);
vertOrKernel<<<dimGrid, dimBlock, 0, stream.get()>>>(contrib.get(), colHasImage.get(), (unsigned)croppedWidth,
(unsigned)croppedHeight);
return CUDA_STATUS;
}
Status horizOr(std::size_t croppedWidth, std::size_t croppedHeight, GPU::Buffer<const uint32_t> contrib,
GPU::Buffer<uint32_t> rowHasImage, GPU::Stream stream) {
dim3 dimBlock(REDUCE_THREADS_PER_BLOCK, 1, 1);
const unsigned numBlocks = (unsigned)Cuda::ceilDiv(croppedHeight, dimBlock.x);
dim3 dimGrid(numBlocks, 1, 1);
horizOrKernel<<<dimGrid, dimBlock, 0, stream.get()>>>(contrib.get(), rowHasImage.get(), (unsigned)croppedWidth,
(unsigned)croppedHeight);
return CUDA_STATUS;
}
} // namespace Core
} // namespace VideoStitch