// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm

#include "gpu/image/reduce.hpp"

#include "backend/common/imageOps.hpp"
#include "backend/cuda/deviceBuffer.hpp"

#include "cuda/error.hpp"
#include "cuda/util.hpp"

#include <cuda_runtime.h>

#include <cassert>
#include <stdint.h>
#include <stdio.h>

namespace VideoStitch {
namespace Image {
// See "Optimizing Parallel Reduction in CUDA" Mark Haris
template <uint32_t (*getValue)(uint32_t)>
__global__ void reduceKernel(uint32_t* __restrict__ dst, const uint32_t* __restrict__ src, unsigned size) {
  extern __shared__ uint32_t sdata[];
  const unsigned tid = threadIdx.x;
  unsigned i = blockIdx.x * (BLOCKSIZE * 2) + tid;
  const unsigned gridSize = BLOCKSIZE * 2 * gridDim.x;
  uint32_t startVal = 0;
  while (i < size) {
    startVal += getValue(src[i]);
    if (i + BLOCKSIZE < size) {
      startVal += getValue(src[i + BLOCKSIZE]);
    }
    i += gridSize;
  }
  sdata[tid] = startVal;
  __syncthreads();
  if (tid < 128) {
    sdata[tid] += sdata[tid + 128];
  }
  __syncthreads();
  if (tid < 64) {
    sdata[tid] += sdata[tid + 64];
  }
  __syncthreads();
  if (tid < 32) {
    // No need to sync, only one warp. But Fermi needs volatile!
    volatile uint32_t* localSdata = sdata;
    localSdata[tid] += localSdata[tid + 32];
    localSdata[tid] += localSdata[tid + 16];
    localSdata[tid] += localSdata[tid + 8];
    localSdata[tid] += localSdata[tid + 4];
    localSdata[tid] += localSdata[tid + 2];
    localSdata[tid] += localSdata[tid + 1];
  }
  if (tid == 0) {
    dst[blockIdx.x] = sdata[0];
  }
}

namespace {
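// identity: accumulate the raw 32-bit value unchanged (plain sum).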
inline __device__ uint32_t identity(uint32_t value) { return value; }
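// isSolid: contributes 1 for pixels with non-zero alpha, 0 otherwise (used to count solid pixels).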
inline __device__ uint32_t isSolid(uint32_t value) { return RGBA::a(value) > 0 ? 1 : 0; }
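// solidIdentityOrZero: contributes the integer average of the R, G and B channels for solid
// (alpha > 0) pixels, and 0 for transparent pixels.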
inline __device__ uint32_t solidIdentityOrZero(uint32_t value) {
  return RGBA::a(value) > 0 ? (Image::RGBA::r(value) + Image::RGBA::g(value) + Image::RGBA::b(value)) / 3 : 0;
}
} // namespace
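
// Multi-pass host-side driver: the first pass maps each input element through
// getValue and reduces it to one partial sum per block; the following passes
// reduce those partial sums with identity until a single value remains, which
// is then copied back to the host.
// The work buffer must be large enough to hold the partial sums of every pass
// (ceilDiv(size, 2 * BLOCKSIZE) elements for the first pass, then the same
// formula applied to each successive pass). Each launch uses 4 * BLOCKSIZE
// bytes of dynamic shared memory, i.e. BLOCKSIZE uint32_t values.
// Note that if size <= 1, no pass runs and src[0] is returned without getValue applied.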
template <uint32_t (*getValue)(uint32_t)>
Status reduce(const uint32_t* src, uint32_t* work, std::size_t size, uint32_t& result) {
  const dim3 dimBlock(BLOCKSIZE);
  // The first pass uses the actual getValue.
  if (size > 1) {
    const dim3 dimGrid((unsigned)Cuda::ceilDiv(size, 2 * BLOCKSIZE));
    reduceKernel<getValue><<<dimGrid, dimBlock, 4 * BLOCKSIZE>>>(work, src, (unsigned)size);
    size = dimGrid.x;
    src = work;
    work = work + size;
  }
  // Other passes simply sum.
  while (size > 1) {
    const dim3 dimGrid((unsigned)Cuda::ceilDiv(size, 2 * BLOCKSIZE));
    reduceKernel<identity><<<dimGrid, dimBlock, 4 * BLOCKSIZE>>>(work, src, (unsigned)size);
    size = dimGrid.x;
    src = work;
    work = work + size;
  }
  FAIL_RETURN(CUDA_STATUS);
  return CUDA_ERROR(cudaMemcpy(&result, src, sizeof(uint32_t), cudaMemcpyDeviceToHost));
}
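
// reduceSum: sum of all 32-bit values in src.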
Status reduceSum(GPU::Buffer<const uint32_t> src, GPU::Buffer<uint32_t> work, std::size_t size, uint32_t& result) {
  return reduce<identity>(src.get(), work.get(), size, result);
}
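
// reduceSumSolid: sum, over solid pixels (alpha > 0), of the integer average of their
// R, G and B channels; transparent pixels contribute 0.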
Status reduceSumSolid(GPU::Buffer<const uint32_t> src, GPU::Buffer<uint32_t> work, std::size_t size, uint32_t& result) {
  return reduce<solidIdentityOrZero>(src.get(), work.get(), size, result);
}
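
// reduceCountSolid: number of solid pixels (alpha > 0) in src.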
Status reduceCountSolid(GPU::Buffer<const uint32_t> src, GPU::Buffer<uint32_t> work, std::size_t size,
                        uint32_t& result) {
  return reduce<isSolid>(src.get(), work.get(), size, result);
}
} // namespace Image
} // namespace VideoStitch