1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm
#ifndef SHARED_UTILS_HPP_
#define SHARED_UTILS_HPP_
// This part of the code is available only to CUDA.
#ifndef __CUDACC__
#error "You included this file in non-device code. This should not happen."
#endif
#include <stdio.h>
#include <math_constants.h>
namespace VideoStitch {
namespace Image {
/**
* Fixed constant boundary condition.
*/
template <typename T>
struct ZeroBoundary {
static inline __device__ T bottomRightValue(const T* src, const unsigned srcWidth, const unsigned srcHeight,
const unsigned srcX, const unsigned srcY) {
if (srcX < srcWidth && srcY < srcHeight) {
return src[srcWidth * srcY + srcX];
}
return (T)0;
}
static inline __device__ T leftValue(const T* src, const unsigned srcWidth, const unsigned srcHeight, const int srcX,
const unsigned srcY) {
if (srcX >= 0 && srcY < srcHeight) {
return src[srcWidth * srcY + (unsigned)srcX];
}
return (T)0;
}
static inline __device__ T topValue(const T* src, const unsigned srcWidth, const unsigned srcHeight,
const unsigned srcX, const int srcY) {
if (srcX < srcWidth && srcY >= 0) {
return src[srcWidth * (unsigned)srcY + srcX];
}
return (T)0;
}
static inline __device__ T topLeftValue(const T* src, const unsigned srcWidth, const unsigned srcHeight,
const int srcX, const int srcY) {
if (srcX >= 0 && srcY >= 0) {
return src[srcWidth * (unsigned)srcY + (unsigned)srcX];
}
return (T)0;
}
};
struct MinInfBoundary {
static inline __device__ float bottomRightValue(const float* src, const unsigned srcWidth, const unsigned srcHeight,
const unsigned srcX, const unsigned srcY) {
if (srcX < srcWidth && srcY < srcHeight) {
return src[srcWidth * srcY + srcX];
}
return CUDART_INF_F;
}
static inline __device__ float leftValue(const float* src, const unsigned srcWidth, const unsigned srcHeight,
const int srcX, const unsigned srcY) {
if (srcX >= 0 && srcY < srcHeight) {
return src[srcWidth * srcY + (unsigned)srcX];
}
return CUDART_INF_F;
}
static inline __device__ float topValue(const float* src, const unsigned srcWidth, const unsigned srcHeight,
const unsigned srcX, const int srcY) {
if (srcX < srcWidth && srcY >= 0) {
return src[srcWidth * (unsigned)srcY + srcX];
}
return CUDART_INF_F;
}
static inline __device__ float topLeftValue(const float* src, const unsigned srcWidth, const unsigned srcHeight,
const int srcX, const int srcY) {
if (srcX >= 0 && srcY >= 0) {
return src[srcWidth * (unsigned)srcY + (unsigned)srcX];
}
return CUDART_INF_F;
}
};
/**
* Constant (extend) boundary condition.
*/
template <typename T>
struct ExtendBoundary {
static inline __device__ T bottomRightValue(const T* src, const unsigned srcWidth, const unsigned srcHeight,
const unsigned srcX, const unsigned srcY) {
const unsigned x = srcX < srcWidth ? srcX : srcWidth - 1;
const unsigned y = srcY < srcHeight ? srcY : srcHeight - 1;
return src[srcWidth * y + x];
}
static inline __device__ T leftValue(const T* src, const unsigned srcWidth, const unsigned srcHeight, const int srcX,
const unsigned srcY) {
const unsigned x = srcX >= 0 ? (unsigned)srcX : 0u;
const unsigned y = srcY < srcHeight ? srcY : srcHeight - 1;
return src[srcWidth * y + x];
}
static inline __device__ T topValue(const T* src, const unsigned srcWidth, const unsigned srcHeight,
const unsigned srcX, const int srcY) {
const unsigned x = srcX < srcWidth ? srcX : srcWidth - 1;
const unsigned y = srcY >= 0 ? (unsigned)srcY : 0u;
return src[srcWidth * y + x];
}
static inline __device__ T topLeftValue(const T* src, const unsigned srcWidth, const unsigned srcHeight,
const int srcX, const int srcY) {
const unsigned x = srcX >= 0 ? (unsigned)srcX : 0u;
const unsigned y = srcY >= 0 ? (unsigned)srcY : 0u;
return src[srcWidth * y + x];
}
};
/**
* Wrapping boundary condition.
*/
template <typename T>
struct WrapBoundary {
static inline __device__ T bottomRightValue(const T* src, const unsigned srcWidth, const unsigned srcHeight,
const unsigned srcX, const unsigned srcY) {
const unsigned x = srcX % srcWidth;
const unsigned y = srcY % srcHeight;
return src[srcWidth * y + x];
}
static inline __device__ T leftValue(const T* src, const unsigned srcWidth, const unsigned srcHeight, const int srcX,
const unsigned srcY) {
const unsigned x = (unsigned)(srcX >= 0 ? srcX : (int)srcWidth + srcX);
const unsigned y = srcY % srcHeight;
return src[srcWidth * y + x];
}
static inline __device__ T topValue(const T* src, const unsigned srcWidth, const unsigned srcHeight,
const unsigned srcX, const int srcY) {
const unsigned x = srcX % srcWidth;
const unsigned y = srcY >= 0 ? (unsigned)srcY : 0u;
return src[srcWidth * y + x];
}
static inline __device__ T topLeftValue(const T* src, const unsigned srcWidth, const unsigned srcHeight,
const int srcX, const int srcY) {
const unsigned x = (unsigned)(srcX >= 0 ? srcX : (int)srcWidth + srcX);
const unsigned y = srcY >= 0 ? (unsigned)srcY : 0u;
return src[srcWidth * y + x];
}
};
/**
* Horizontal wrapping boundary condition.
*/
template <typename T>
struct HWrapBoundary {
static inline __device__ T bottomRightValue(const T* src, const unsigned srcWidth, const unsigned srcHeight,
const unsigned srcX, const unsigned srcY) {
const unsigned x = srcX % srcWidth;
const unsigned y = srcY < srcHeight ? srcY : srcHeight - 1;
return src[srcWidth * y + x];
}
static inline __device__ T leftValue(const T* src, const unsigned srcWidth, const unsigned srcHeight, const int srcX,
const unsigned srcY) {
const unsigned x = (unsigned)(srcX >= 0 ? srcX : (int)srcWidth + srcX);
const unsigned y = srcY < srcHeight ? srcY : srcHeight - 1;
return src[srcWidth * y + x];
}
static inline __device__ T topValue(const T* src, const unsigned srcWidth, const unsigned srcHeight,
const unsigned srcX, const int srcY) {
const unsigned x = srcX % srcWidth;
const unsigned y = srcY >= 0 ? (unsigned)srcY : 0u;
return src[srcWidth * y + x];
}
static inline __device__ T topLeftValue(const T* src, const unsigned srcWidth, const unsigned srcHeight,
const int srcX, const int srcY) {
const unsigned x = (unsigned)(srcX >= 0 ? srcX : (int)srcWidth + srcX);
const unsigned y = srcY >= 0 ? (unsigned)srcY : 0u;
return src[srcWidth * y + x];
}
};
/**
* Load a part of @a src to @a sharedDst. In addition to the core pixels, we will load @a left additional pixels to the
* left (same for @a right, @a top, @a bottom). Outside of the source, pixels values are taken to be the same as the
* border pixels. Pixels of @a sharedDst further than @a left (resp @a right, @a top, @a bottom) of any of the borders
* of src have an undefined value.
* @param sharedDst destination buffer, of size (@a sharedWidth + @a moreLeft + @a moreRight) x (@a sharedHeight + @a
* moreTop + @a moreBottom)
* @param sharedWidth base width of the shared array
* @param sharedHeight base height of the shared array
* @param src source buffer
* @param srcWidth width of the source buffer.
* @param srcHeight width of the source buffer
*
* @a Getter defines how to retrieve values outside of boundaries. See above for options.
*
* WARNING: you need ot call __syncthreads() before reading the shared memory.
* It's not done in this function so that you can do something else in between.
*
* TODO: version that's templated on shared width size for loop unrolling.
*/
template <typename T, unsigned left, unsigned right, unsigned top, unsigned bottom, typename Getter>
inline __device__ void loadToSharedMemory(T* __restrict__ sharedDst, const unsigned sharedWidth,
const unsigned sharedHeight, const T* __restrict__ src,
const unsigned srcWidth, const unsigned srcHeight, const unsigned srcOffsetX,
const unsigned srcOffsetY) {
const unsigned threadId = threadIdx.y * blockDim.x + threadIdx.x;
const unsigned realSharedWidth = sharedWidth + left + right;
// Start with interior pixels.
for (int i = threadId; i < sharedWidth * sharedHeight; i += blockDim.x * blockDim.y) {
const unsigned sharedX = i % sharedWidth;
const unsigned sharedY = i / sharedWidth;
sharedDst[realSharedWidth * top + left + realSharedWidth * sharedY + sharedX] =
Getter::bottomRightValue(src, srcWidth, srcHeight, srcOffsetX + sharedX, srcOffsetY + sharedY);
}
// Top interior pixels
for (int i = threadId; i < sharedWidth * top; i += blockDim.x * blockDim.y) {
const unsigned sharedX = i % sharedWidth;
const unsigned sharedY = i / sharedWidth;
sharedDst[left + realSharedWidth * sharedY + sharedX] =
Getter::topValue(src, srcWidth, srcHeight, srcOffsetX + sharedX, (int)srcOffsetY + (int)sharedY - (int)top);
}
// Left interior pixels
for (int i = threadId; i < sharedHeight * left; i += blockDim.x * blockDim.y) {
const unsigned sharedX = i % left;
const unsigned sharedY = i / left;
sharedDst[realSharedWidth * top + realSharedWidth * sharedY + sharedX] =
Getter::leftValue(src, srcWidth, srcHeight, (int)srcOffsetX + (int)sharedX - (int)left, srcOffsetY + sharedY);
}
// Bottom interior pixels
for (int i = threadId; i < sharedWidth * bottom; i += blockDim.x * blockDim.y) {
const unsigned sharedX = i % sharedWidth;
const unsigned sharedY = sharedHeight + i / sharedWidth;
sharedDst[realSharedWidth * top + left + realSharedWidth * sharedY + sharedX] =
Getter::bottomRightValue(src, srcWidth, srcHeight, srcOffsetX + sharedX, srcOffsetY + sharedY);
}
// Right interior pixels
for (int i = threadId; i < sharedHeight * right; i += blockDim.x * blockDim.y) {
const unsigned sharedX = sharedWidth + i % right;
const unsigned sharedY = i / right;
sharedDst[realSharedWidth * top + left + realSharedWidth * sharedY + sharedX] =
Getter::bottomRightValue(src, srcWidth, srcHeight, srcOffsetX + sharedX, srcOffsetY + sharedY);
}
// NOTE: Hereafter we assume that the block size is enough to load all corner pixels.
// Top-left pixels
if (threadId < left * top) {
// nvcc does not understand that the condition above makes it impossible for 'left' to be zero.
const unsigned sharedX = threadId % left;
const unsigned sharedY = threadId / left;
sharedDst[realSharedWidth * sharedY + sharedX] =
Getter::topLeftValue(src, srcWidth, srcHeight, (int)srcOffsetX + (int)sharedX - (int)left,
(int)srcOffsetY + (int)sharedY - (int)top);
}
// Top-right pixels
if (threadId < right * top) {
// nvcc does not understand that the condition above makes it impossible for 'right' to be zero.
const unsigned sharedX = sharedWidth + threadId % right;
const unsigned sharedY = threadId / right;
sharedDst[left + realSharedWidth * sharedY + sharedX] =
Getter::topValue(src, srcWidth, srcHeight, srcOffsetX + sharedX, (int)srcOffsetY + (int)sharedY - (int)top);
}
// Bottom-left pixels
if (threadId < left * bottom) {
// nvcc does not understand that the condition above makes it impossible for 'left' to be zero.
const unsigned sharedX = threadId % left;
const unsigned sharedY = sharedHeight + threadId / left;
sharedDst[realSharedWidth * top + realSharedWidth * sharedY + sharedX] =
Getter::leftValue(src, srcWidth, srcHeight, (int)srcOffsetX + (int)sharedX - (int)left, srcOffsetY + sharedY);
}
// Bottom-right pixels
if (threadId < right * bottom) {
// nvcc does not understand that the condition above makes it impossible for 'right' to be zero.
const unsigned sharedX = sharedWidth + threadId % right;
const unsigned sharedY = sharedHeight + threadId / right;
sharedDst[left + realSharedWidth * top + realSharedWidth * sharedY + sharedX] =
Getter::bottomRightValue(src, srcWidth, srcHeight, srcOffsetX + sharedX, srcOffsetY + sharedY);
}
}
} // namespace Image
} // namespace VideoStitch
#endif