// Copyright (c) 2012-2017 VideoStitch SAS // Copyright (c) 2018 stitchEm #pragma once #include "backend/common/imageOps.hpp" #include namespace VideoStitch { namespace Image { inline __device__ uint32_t YRGBDiffToRGBA(unsigned char y, const int3& rgbDiff) { const int32_t ya = (1164 * (y - 16)) / 1000; return RGBA::pack(clamp8(ya + rgbDiff.x), clamp8(ya + rgbDiff.y), clamp8(ya + rgbDiff.z), 0xff); } #define nv12_surface_write surface_write_i #include "../gpuKernelDef.h" #include "backend/common/image/unpack.gpu" // ---------------------------- Output ----------------------------- __global__ void unpackKernelGrayscale(unsigned char* dst, unsigned pitch, const cudaSurfaceObject_t src, unsigned width, unsigned height) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < height) { uint32_t val; surf2Dread(&val, src, x * sizeof(uint32_t), y); int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); dst[y * pitch + x] = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; } } __global__ void unpackSourceKernelRGBA(uint32_t* dst, unsigned pitch, const cudaSurfaceObject_t src, unsigned width, unsigned height) { const unsigned x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < height) { // yeah, we could use a memcpy uint32_t val; surf2Dread(&val, src, x * sizeof(uint32_t), y); dst[y * pitch + x] = val; } } __global__ void unpackKernelRGB(unsigned char* __restrict__ dst, unsigned pitch, const uint32_t* __restrict__ src, unsigned width, unsigned height) { const unsigned x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < height) { const uint32_t val = src[y * width + x]; dst[y * pitch + 3 * x] = RGBA::r(val); dst[y * pitch + 3 * x + 1] = RGBA::g(val); dst[y * pitch + 3 * x + 2] = RGBA::b(val); } } __global__ void unpackSourceKernelRGB(unsigned char* dst, unsigned pitch, const cudaSurfaceObject_t src, unsigned width, unsigned height) { const unsigned x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < height) { uint32_t val; surf2Dread(&val, src, x * sizeof(uint32_t), y); dst[y * pitch + 3 * x] = RGBA::r(val); dst[y * pitch + 3 * x + 1] = RGBA::g(val); dst[y * pitch + 3 * x + 2] = RGBA::b(val); } } __global__ void unpackSourceKernelF32C1(float* dst, unsigned pitch, const cudaSurfaceObject_t src, unsigned width, unsigned height) { const unsigned x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < height) { // yeah, we could use a memcpy float val; surf2Dread(&val, src, x * sizeof(float), y); dst[y * pitch + x] = val; } } __global__ void unpackSourceKernelGrayscale16(uint16_t* dst, unsigned pitch, const cudaSurfaceObject_t src, unsigned width, unsigned height) { const unsigned x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < height) { float val; surf2Dread(&val, src, x * sizeof(float), y); const float inMilliMeters = val * 1000.f; const uint16_t u16 = (uint16_t)max(0.f, min((float)USHRT_MAX, round(inMilliMeters))); dst[y * pitch + x] = u16; } } __global__ void unpackKernelDepth(unsigned char* __restrict__ yDst, unsigned yPitch, unsigned char* __restrict__ uDst, unsigned uPitch, unsigned char* __restrict__ vDst, unsigned vPitch, const float* __restrict__ src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group unsigned x = 2 * (blockIdx.x * blockDim.x + threadIdx.x); unsigned y = 2 * (blockIdx.y * blockDim.y + threadIdx.y); if (x < width && y < height) { int32_t u = 0; int32_t v = 0; #pragma unroll for (int i = 0; i < 2; i++) { for (int j = 0; j < 2; j++) { const float depth = src[(y + j) * width + x + i]; // convert to millimeters and truncate unsigned int val = min(__float2uint_rn(depth * 1000.f), 65279); // encode yDst[(y + j) * yPitch + x + i] = (unsigned char)(val / 256); int cu = val % 512; int cv = (val + 384) % 512; if (cu >= 256) { u += (unsigned char)(511 - cu); } else { u += (unsigned char)cu; } if (cv >= 256) { v += (unsigned char)(511 - cv); } else { v += (unsigned char)cv; } } } uDst[(y * uPitch + x) / 2] = (u + 2) / 4; vDst[(y * vPitch + x) / 2] = (v + 2) / 4; } } __global__ void unpackSourceKernelDepth(unsigned char* __restrict__ yDst, unsigned yPitch, unsigned char* __restrict__ uDst, unsigned uPitch, unsigned char* __restrict__ vDst, unsigned vPitch, const cudaSurfaceObject_t src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group unsigned x = 2 * (blockIdx.x * blockDim.x + threadIdx.x); unsigned y = 2 * (blockIdx.y * blockDim.y + threadIdx.y); if (x < width && y < height) { int32_t u = 0; int32_t v = 0; #pragma unroll for (int i = 0; i < 2; i++) { for (int j = 0; j < 2; j++) { float depth; surf2Dread(&depth, src, (x + i) * sizeof(float), y + j); // convert to millimeters and truncate unsigned int val = min(__float2uint_rn(depth * 1000.f), 65279); // encode yDst[(y + j) * yPitch + x + i] = (unsigned char)(val / 256); int cu = val % 512; int cv = (val + 384) % 512; if (cu >= 256) { u += (unsigned char)(511 - cu); } else { u += (unsigned char)cu; } if (cv >= 256) { v += (unsigned char)(511 - cv); } else { v += (unsigned char)cv; } } } uDst[(y * uPitch + x) / 2] = (u + 2) / 4; vDst[(y * vPitch + x) / 2] = (v + 2) / 4; } } /** * This kernel converts the buffer from RGBA to planar 12 bits 4:2:0 (YV12) out-of-place. * The conversion is undefined for pixels with 0 alpha. * * Y0 Y1 Y2 Y3 * ... * U0 U1 * ... * V0 V1 * ... */ __global__ void unpackKernelYV12(unsigned char* __restrict__ yDst, unsigned yPitch, unsigned char* __restrict__ uDst, unsigned uPitch, unsigned char* __restrict__ vDst, unsigned vPitch, const uint32_t* __restrict__ src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group unsigned sx = 2 * (blockIdx.x * blockDim.x + threadIdx.x); unsigned sy = 2 * (blockIdx.y * blockDim.y + threadIdx.y); if (sx < width && sy < height) { int32_t u = 0; int32_t v = 0; { uint32_t val = src[sy * width + sx]; int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } if (sx + 1 < width && sy + 1 < height) { // general case { uint32_t val = src[sy * width + sx + 1]; int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { uint32_t val = src[(sy + 1) * width + sx]; int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { uint32_t val = src[(sy + 1) * width + sx + 1]; int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx + 1] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } uDst[(sy * uPitch + sx) / 2] = u / 4; vDst[(sy * vPitch + sx) / 2] = v / 4; } else { // border case with odd width / height if (sx + 1 < width) { uint32_t val = src[sy * width + sx + 1]; int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uDst[(sy * uPitch + sx) / 2] = u / 2; vDst[(sy * vPitch + sx) / 2] = v / 2; } __syncthreads(); if (sy + 1 < height) { uint32_t val = src[(sy + 1) * width + sx]; int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uDst[(sy * uPitch + sx) / 2] = u / 2; vDst[(sy * vPitch + sx) / 2] = v / 2; } } } } __global__ void unpackSourceKernelYV12(unsigned char* __restrict__ yDst, unsigned yPitch, unsigned char* __restrict__ uDst, unsigned uPitch, unsigned char* __restrict__ vDst, unsigned vPitch, const cudaSurfaceObject_t src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group unsigned sx = 2 * (blockIdx.x * blockDim.x + threadIdx.x); unsigned sy = 2 * (blockIdx.y * blockDim.y + threadIdx.y); if (sx < width && sy < height) { int32_t u = 0; int32_t v = 0; { uint32_t val; surf2Dread(&val, src, sx * sizeof(uint32_t), sy); int32_t r = clamp8(RGBA::r(val)); int32_t g = clamp8(RGBA::g(val)); int32_t b = clamp8(RGBA::b(val)); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } if (sx + 1 < width && sy + 1 < height) { // general case { uint32_t val; surf2Dread(&val, src, (sx + 1) * sizeof(uint32_t), sy); int32_t r = clamp8(RGBA::r(val)); int32_t g = clamp8(RGBA::g(val)); int32_t b = clamp8(RGBA::b(val)); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { uint32_t val; surf2Dread(&val, src, sx * sizeof(uint32_t), sy + 1); int32_t r = clamp8(RGBA::r(val)); int32_t g = clamp8(RGBA::g(val)); int32_t b = clamp8(RGBA::b(val)); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { uint32_t val; surf2Dread(&val, src, (sx + 1) * sizeof(uint32_t), sy + 1); int32_t r = clamp8(RGBA::r(val)); int32_t g = clamp8(RGBA::g(val)); int32_t b = clamp8(RGBA::b(val)); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx + 1] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } uDst[(sy * uPitch + sx) / 2] = u / 4; vDst[(sy * vPitch + sx) / 2] = v / 4; } else { // border case with odd width / height if (sx + 1 < width) { uint32_t val; surf2Dread(&val, src, (sx + 1) * sizeof(uint32_t), sy); int32_t r = clamp8(RGBA::r(val)); int32_t g = clamp8(RGBA::g(val)); int32_t b = clamp8(RGBA::b(val)); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uDst[(sy * uPitch + sx) / 2] = u / 2; vDst[(sy * vPitch + sx) / 2] = v / 2; } __syncthreads(); if (sy + 1 < height) { uint32_t val; surf2Dread(&val, src, sx * sizeof(uint32_t), sy + 1); int32_t r = clamp8(RGBA::r(val)); int32_t g = clamp8(RGBA::g(val)); int32_t b = clamp8(RGBA::b(val)); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uDst[(sy * uPitch + sx) / 2] = u / 2; vDst[(sy * vPitch + sx) / 2] = v / 2; } } } } /** * This kernel converts the buffer from RGBA to interleaved YUV420 (NV12) out-of-place. * The conversion is undefined for pixels with 0 alpha. * * Y0 Y1 Y2 Y3 * ... * U0 V0 U1 V1 * ... */ __global__ void unpackKernelNV12(unsigned char* __restrict__ yDst, unsigned yPitch, unsigned char* __restrict__ uvDst, unsigned uvPitch, const uint32_t* __restrict__ src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group unsigned sx = 2 * (blockIdx.x * blockDim.x + threadIdx.x); unsigned sy = 2 * (blockIdx.y * blockDim.y + threadIdx.y); if (sx < width && sy < height) { int32_t u = 0; int32_t v = 0; { uint32_t val = src[sy * width + sx]; int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } if (sx + 1 < width && sy + 1 < height) { // general case { uint32_t val = src[sy * width + sx + 1]; int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { uint32_t val = src[(sy + 1) * width + sx]; int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { uint32_t val = src[(sy + 1) * width + sx + 1]; int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx + 1] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } uvDst[(sy * uvPitch) / 2 + sx] = u / 4; uvDst[(sy * uvPitch) / 2 + sx + 1] = v / 4; } else { // border case with odd width / height if (sx + 1 < width) { uint32_t val = src[sy * width + sx + 1]; int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uvDst[(sy * uvPitch) / 2 + sx] = u / 2; uvDst[(sy * uvPitch) / 2 + sx + 1] = v / 2; } else if (sy + 1 < height) { uint32_t val = src[(sy + 1) * width + sx]; int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uvDst[(sy * uvPitch) / 2 + sx] = u / 2; uvDst[(sy * uvPitch) / 2 + sx + 1] = v / 2; } } } } /** * This kernel converts the buffer from RGBA to interleaved YUV420 (NV12) out-of-place. * The conversion is undefined for pixels with 0 alpha. * * Y0 Y1 Y2 Y3 * ... * U0 V0 U1 V1 * ... */ __global__ void unpackSourceKernelNV12(unsigned char* __restrict__ yDst, unsigned yPitch, unsigned char* __restrict__ uvDst, unsigned uvPitch, const cudaSurfaceObject_t src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group unsigned sx = 2 * (blockIdx.x * blockDim.x + threadIdx.x); unsigned sy = 2 * (blockIdx.y * blockDim.y + threadIdx.y); if (sx < width && sy < height) { int32_t u = 0; int32_t v = 0; { uint32_t val; surf2Dread(&val, src, sx * sizeof(uint32_t), sy); int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } if (sx + 1 < width && sy + 1 < height) { // general case { uint32_t val; surf2Dread(&val, src, (sx + 1) * sizeof(uint32_t), sy); int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { uint32_t val; surf2Dread(&val, src, sx * sizeof(uint32_t), sy + 1); int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { uint32_t val; surf2Dread(&val, src, (sx + 1) * sizeof(uint32_t), sy + 1); int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx + 1] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } uvDst[(sy * uvPitch) / 2 + sx] = u / 4; uvDst[(sy * uvPitch) / 2 + sx + 1] = v / 4; } else { // border case with odd width / height if (sx + 1 < width) { uint32_t val; surf2Dread(&val, src, (sx + 1) * sizeof(uint32_t), sy); int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uvDst[(sy * uvPitch) / 2 + sx] = u / 2; uvDst[(sy * uvPitch) / 2 + sx + 1] = v / 2; } else if (sy + 1 < height) { uint32_t val; surf2Dread(&val, src, sx * sizeof(uint32_t), sy + 1); int32_t r = RGBA::r(val); int32_t g = RGBA::g(val); int32_t b = RGBA::b(val); int32_t y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx] = y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uvDst[(sy * uvPitch) / 2 + sx] = u / 2; uvDst[(sy * uvPitch) / 2 + sx + 1] = v / 2; } } } } /** * This kernel converts the buffer from RGBA to YUY2 out-of-place. * Pixels are all given full solidness (max alpha). */ __global__ void unpackYUY2Kernel(unsigned char* __restrict__ dst, unsigned pitch, const uint32_t* __restrict__ src, unsigned width, unsigned height) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width / 2 && y < height) { uint32_t val0 = src[y * width + 2 * x]; int32_t r0 = RGBA::r(val0); int32_t g0 = RGBA::g(val0); int32_t b0 = RGBA::b(val0); uint32_t val1 = src[y * width + 2 * x + 1]; int32_t r1 = RGBA::r(val1); int32_t g1 = RGBA::g(val1); int32_t b1 = RGBA::b(val1); unsigned char y0 = ((66 * r0 + 129 * g0 + 25 * b0 + 128) >> 8) + 16; unsigned char y1 = ((66 * r1 + 129 * g1 + 25 * b1 + 128) >> 8) + 16; unsigned char u = ((-38 * r0 - 74 * g0 + 112 * b0 + 128) >> 8) + 128; unsigned char v = ((112 * r0 - 94 * g0 - 18 * b0 + 128) >> 8) + 128; dst[y * pitch + 4 * x] = y0; dst[y * pitch + 4 * x + 1] = u; dst[y * pitch + 4 * x + 2] = y1; dst[y * pitch + 4 * x + 3] = v; } } /** * This kernel converts the buffer from RGBA to UYVY out-of-place. * Pixels are all given full solidness (max alpha). */ __global__ void unpackUYVYKernel(unsigned char* __restrict__ dst, unsigned pitch, const uint32_t* __restrict__ src, unsigned width, unsigned height) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width / 2 && y < height) { uint32_t val0 = src[y * width + 2 * x]; int32_t r0 = RGBA::r(val0); int32_t g0 = RGBA::g(val0); int32_t b0 = RGBA::b(val0); uint32_t val1 = src[y * width + 2 * x + 1]; int32_t r1 = RGBA::r(val1); int32_t g1 = RGBA::g(val1); int32_t b1 = RGBA::b(val1); unsigned char y0 = ((66 * r0 + 129 * g0 + 25 * b0 + 128) >> 8) + 16; unsigned char y1 = ((66 * r1 + 129 * g1 + 25 * b1 + 128) >> 8) + 16; unsigned char u = ((-38 * r0 - 74 * g0 + 112 * b0 + 128) >> 8) + 128; unsigned char v = ((112 * r0 - 94 * g0 - 18 * b0 + 128) >> 8) + 128; dst[y * pitch + 4 * x] = u; dst[y * pitch + 4 * x + 1] = y0; dst[y * pitch + 4 * x + 2] = v; dst[y * pitch + 4 * x + 3] = y1; } } /** * This kernel converts the buffer from RGBA to 10 bits planar YUV422 out-of-place. * Pixels are all given full solidness (max alpha). * 10 bits values are padded to 16 bits. */ __global__ void unpackYUV422P10Kernel(uint16_t* __restrict__ yDst, unsigned yPitch, uint16_t* __restrict__ uDst, unsigned uPitch, uint16_t* __restrict__ vDst, unsigned vPitch, const uint32_t* __restrict__ src, unsigned width, unsigned height) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width / 2 && y < height) { uint32_t val0 = src[y * width + 2 * x]; int32_t r0 = RGBA::r(val0); int32_t g0 = RGBA::g(val0); int32_t b0 = RGBA::b(val0); uint32_t val1 = src[y * width + 2 * x + 1]; int32_t r1 = RGBA::r(val1); int32_t g1 = RGBA::g(val1); int32_t b1 = RGBA::b(val1); uint32_t u = 0, v = 0; int32_t y0 = ((66 * r0 + 129 * g0 + 25 * b0 + 128) >> 8) + 16 << 2; int32_t y1 = ((66 * r1 + 129 * g1 + 25 * b1 + 128) >> 8) + 16 << 2; u += ((-38 * r0 - 74 * g0 + 112 * b0 + 128) >> 8) + 128 << 2; u += ((-38 * r1 - 74 * g1 + 112 * b1 + 128) >> 8) + 128 << 2; v += ((112 * r0 - 94 * g0 - 18 * b0 + 128) >> 8) + 128 << 2; v += ((112 * r1 - 94 * g1 - 18 * b1 + 128) >> 8) + 128 << 2; yDst[y * yPitch + 2 * x] = y0; yDst[y * yPitch + 2 * x + 1] = y1; uDst[y * uPitch + x] = u / 2; vDst[y * vPitch + x] = v / 2; } } __global__ void unpackMonoKernelYUV420P(unsigned char* __restrict__ dst, const unsigned char* __restrict__ src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group unsigned sx = blockIdx.x * blockDim.x + threadIdx.x; unsigned sy = blockIdx.y * blockDim.y + threadIdx.y; if (sx < width / 2 && sy < height / 2) { { const unsigned i = (2 * sy) * width + 2 * sx; dst[i] = src[i]; } { const unsigned i = (2 * sy) * width + 2 * sx + 1; dst[i] = src[i]; } { const unsigned i = (2 * sy + 1) * width + 2 * sx; dst[i] = src[i]; } { const unsigned i = (2 * sy + 1) * width + 2 * sx + 1; dst[i] = src[i]; } } } // ---------------------------- Input ----------------------------- /** * This kernel converts the buffer from BGRU8888 (where 'U' stands for 'unused') to RGBA8888 out-of-place. * Pixels are all given full solidness (max alpha) */ __global__ void convertBGRUToRGBAKernel(uint32_t* __restrict__ dst, const unsigned char* __restrict__ src, unsigned width, unsigned height) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < height) { unsigned i = y * width + x; dst[i] = RGBA::pack(src[4 * i + 2], src[4 * i + 1], src[4 * i], 0xff); } } /** * This kernel converts the buffer from RGB to RGBA8888 out-of-place. * Pixels are all given full solidness (max alpha) */ __global__ void convertRGBToRGBAKernel(cudaSurfaceObject_t dst, const unsigned char* src, unsigned width, unsigned height) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < height) { unsigned i = y * width + x; surf2Dwrite(RGBA::pack(src[3 * i], src[3 * i + 1], src[3 * i + 2], 0xff), dst, x * sizeof(uint32_t), y); } } __global__ void convertRGB210ToRGBAKernel(cudaSurfaceObject_t dst, const uint32_t* src, unsigned width, unsigned height) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < height) { uint32_t v = src[y * width + x]; surf2Dwrite(RGBA::pack(clamp8(RGB210::r(v)), clamp8(RGB210::g(v)), clamp8(RGB210::b(v)), RGB210::a(v)), dst, x * sizeof(uint32_t), y); } } __device__ unsigned loadBayerPattern(const unsigned char* __restrict__ src, unsigned width, unsigned height, unsigned char* sharedSrc, unsigned srcX, unsigned srcY) { // The shared memory uses the same pattern as src. // There are (2 * blockDim.x + 2) * (2 * blockDim.y + 2) bytes to load // (we need an extra layer outside the current zone for interpolation). const unsigned sharedBlockWidth = blockDim.x + 1; // +1: one half block left and one half-block right. const unsigned sharedWidth = 2 * sharedBlockWidth; // Start with interior blocks. if (srcX < width && srcY < height) { // The access pattern is the same as during interpolation, meaning that each thread issues two (coalesced) reads: RG // then GB. // TODO: try out a different loading pattern: each thread loads 4 consecutive bytes in memory. // This would reduce the number of coalesced reads to 1 instead of 2. // Note that this would not be the same pattern as during interpolation. const int srcBase = width * srcY + srcX; const int sharedBase = sharedWidth + 1 + 2 * (sharedWidth * threadIdx.y + threadIdx.x); // The compiler should be able to optimize that in only 2 coalesced single word reads. // If it can't, accesses are still coalesced, but there are 4 accesses instead of one. sharedSrc[sharedBase] = src[srcBase]; sharedSrc[sharedBase + 1] = src[srcBase + 1]; sharedSrc[sharedBase + sharedWidth] = src[srcBase + width]; sharedSrc[sharedBase + sharedWidth + 1] = src[srcBase + width + 1]; } // Now load the boundary if (threadIdx.y == 0 && srcX < width) { // Top { const int sharedBase = 1 + 2 * threadIdx.x; const int srcBase = srcY > 0 ? // Normal case. width * (srcY - 1) + srcX : // The previous row is outside the image, constant boundary condition. width + srcX; sharedSrc[sharedBase] = src[srcBase]; sharedSrc[sharedBase + 1] = src[srcBase + 1]; } // Bottom { int srcBoundaryRow; int sharedBoundaryRow; if (srcY + 2 * blockDim.y < height) { // Normal case, extra row is within image. srcBoundaryRow = srcY + 2 * blockDim.y; sharedBoundaryRow = 2 * blockDim.y; } else { // The next row is outside the image, constant boundary condition. srcBoundaryRow = height - 2; sharedBoundaryRow = height - srcY; } const int srcBase = width * srcBoundaryRow + srcX; const int sharedBase = sharedWidth + 1 + sharedWidth * sharedBoundaryRow + 2 * threadIdx.x; sharedSrc[sharedBase] = src[srcBase]; sharedSrc[sharedBase + 1] = src[srcBase + 1]; } } if (threadIdx.x == 0 && srcY < height) { // Left { const int sharedBase = sharedWidth + 2 * sharedWidth * threadIdx.y; const int srcBase = srcX > 0 ? // Normal case. width * srcY + srcX - 1 : // The previous col is outside the image, constant boundary condition. width * srcY + 1; sharedSrc[sharedBase] = src[srcBase]; sharedSrc[sharedBase + sharedWidth] = src[srcBase + width]; } // Right { int srcBoundaryCol; int sharedBoundaryCol; if (srcX + 2 * blockDim.x < width) { // Normal case, extra col is within image. srcBoundaryCol = srcX + 2 * blockDim.x; sharedBoundaryCol = 2 * blockDim.x; } else { // The next col is outside the image, constant boundary condition. srcBoundaryCol = width - 2; sharedBoundaryCol = width - srcX; } const int srcBase = width * srcY + srcBoundaryCol; const int sharedBase = sharedWidth + 1 + 2 * sharedWidth * threadIdx.y + sharedBoundaryCol; sharedSrc[sharedBase] = src[srcBase]; sharedSrc[sharedBase + sharedWidth] = src[srcBase + width]; } } // And the corners. if (threadIdx.x == 0 && threadIdx.y == 0) { // Due to the assymetry, only the top left and bottom right corner are ever used (see the test for an example). // Top left { const int srcBoundaryCol = srcX > 0 ? srcX - 1 : 1; const int srcBoundaryRow = srcY > 0 ? srcY - 1 : 1; const int srcBase = width * srcBoundaryRow + srcBoundaryCol; sharedSrc[0] = src[srcBase]; } // Bottom right. { int srcBoundaryCol; int sharedBoundaryCol; if (srcX + 2 * blockDim.x < width) { // Normal case, extra col is within image. srcBoundaryCol = srcX + 2 * blockDim.x; sharedBoundaryCol = 2 * blockDim.x; } else { // The next col is outside the image, constant boundary condition. srcBoundaryCol = width - 2; sharedBoundaryCol = width - srcX; } int srcBoundaryRow; int sharedBoundaryRow; if (srcY + 2 * blockDim.y < height) { // Normal case, extra row is within image. srcBoundaryRow = srcY + 2 * blockDim.y; sharedBoundaryRow = 2 * blockDim.y; } else { // The next row is outside the image, constant boundary condition. srcBoundaryRow = height - 2; sharedBoundaryRow = height - srcY; } const int srcBase = width * srcBoundaryRow + srcBoundaryCol; const int sharedBase = sharedWidth + 1 + sharedWidth * sharedBoundaryRow + sharedBoundaryCol; sharedSrc[sharedBase] = src[srcBase]; } } return sharedWidth; } /** * This kernel converts the buffer from Bayer-filtered RGGB to RGBA8888 out-of-place. * Pixels are all given full solidness (max alpha). * Uses bilinear interpolation within color planes. * * Each thread handles a 2*2 RGGB pixel block. The interpolation support is the 4*4 pixel block centered around the 2*2 * block. Globally each thread block needs an extra pixel around itself. * */ __global__ void convertBayerRGGBToRGBAKernel(uint32_t* __restrict__ dst, const unsigned char* __restrict__ src, unsigned width, unsigned height) { // x and y are the 2*2 block ids. const unsigned srcX = 2 * (blockIdx.x * blockDim.x + threadIdx.x); const unsigned srcY = 2 * (blockIdx.y * blockDim.y + threadIdx.y); // Load the data to shared memory. extern __shared__ unsigned char sharedSrc[]; const unsigned sharedWidth = loadBayerPattern(src, width, height, sharedSrc, srcX, srcY); __syncthreads(); if (srcX < width && srcY < height) { const int sharedBase = sharedWidth + 1 + 2 * (sharedWidth * threadIdx.y + threadIdx.x); // Top-left component; dst[srcY * width + srcX] = RGBA::pack( sharedSrc[sharedBase], // Red is given ((int32_t)sharedSrc[sharedBase - 1] + (int32_t)sharedSrc[sharedBase + 1] + (int32_t)sharedSrc[sharedBase - sharedWidth] + (int32_t)sharedSrc[sharedBase + sharedWidth]) / 4, // Green is 4-tap straight (+) ((int32_t)sharedSrc[sharedBase - 1 - sharedWidth] + (int32_t)sharedSrc[sharedBase + 1 - sharedWidth] + (int32_t)sharedSrc[sharedBase - 1 + sharedWidth] + (int32_t)sharedSrc[sharedBase + 1 + sharedWidth]) / 4, // Blue is 4-tap 45° rotated (x) 255); // Top-right component; dst[srcY * width + srcX + 1] = RGBA::pack( ((int32_t)sharedSrc[sharedBase] + (int32_t)sharedSrc[sharedBase + 2]) / 2, // Red is 2-tap horizontal sharedSrc[sharedBase + 1], // Green is given ((int32_t)sharedSrc[sharedBase + 1 - sharedWidth] + (int32_t)sharedSrc[sharedBase + 1 + sharedWidth]) / 2, // Blue is 2-tap vertical 255); // Bottom-left component; dst[(srcY + 1) * width + srcX] = RGBA::pack( ((int32_t)sharedSrc[sharedBase] + (int32_t)sharedSrc[sharedBase + 2 * sharedWidth]) / 2, // Red is 2-tap vertical sharedSrc[sharedBase + sharedWidth], // Green is given ((int32_t)sharedSrc[sharedBase + sharedWidth - 1] + (int32_t)sharedSrc[sharedBase + sharedWidth + 1]) / 2, // Blue is 2-tap horizontal 255); // Bottom-right component dst[(srcY + 1) * width + srcX + 1] = RGBA::pack( ((int32_t)sharedSrc[sharedBase] + (int32_t)sharedSrc[sharedBase + 2] + (int32_t)sharedSrc[sharedBase + 2 * sharedWidth] + (int32_t)sharedSrc[sharedBase + 2 + 2 * sharedWidth]) / 4, // Red is is 4-tap 45° rotated (x) ((int32_t)sharedSrc[sharedBase + 1] + (int32_t)sharedSrc[sharedBase + sharedWidth] + (int32_t)sharedSrc[sharedBase + 2 + sharedWidth] + (int32_t)sharedSrc[sharedBase + 1 + 2 * sharedWidth]) / 4, // Green is 4-tap straight (+) sharedSrc[sharedBase + sharedWidth + 1], // Blue is given 255); } } /** * This kernel converts the buffer from Bayer-filtered BGGR to RGBA8888 out-of-place. * Pixels are all given full solidness (max alpha). * Uses bilinear interpolation within color planes. * * Each thread handles a 2*2 BGGR pixel block. The interpolation support is the 4*4 pixel block centered around the 2*2 * block. Globally each thread block needs an extra pixel around itself. * */ __global__ void convertBayerBGGRToRGBAKernel(uint32_t* __restrict__ dst, const unsigned char* __restrict__ src, unsigned width, unsigned height) { // x and y are the 2*2 block ids. const unsigned srcX = 2 * (blockIdx.x * blockDim.x + threadIdx.x); const unsigned srcY = 2 * (blockIdx.y * blockDim.y + threadIdx.y); // Load the data to shared memory. extern __shared__ unsigned char sharedSrc[]; const unsigned sharedWidth = loadBayerPattern(src, width, height, sharedSrc, srcX, srcY); __syncthreads(); if (srcX < width && srcY < height) { const int sharedBase = sharedWidth + 1 + 2 * (sharedWidth * threadIdx.y + threadIdx.x); // Top-left component; dst[srcY * width + srcX] = RGBA::pack( ((int32_t)sharedSrc[sharedBase - 1 - sharedWidth] + (int32_t)sharedSrc[sharedBase + 1 - sharedWidth] + (int32_t)sharedSrc[sharedBase - 1 + sharedWidth] + (int32_t)sharedSrc[sharedBase + 1 + sharedWidth]) / 4, // Red is 4-tap 45° rotated (x) ((int32_t)sharedSrc[sharedBase - 1] + (int32_t)sharedSrc[sharedBase + 1] + (int32_t)sharedSrc[sharedBase - sharedWidth] + (int32_t)sharedSrc[sharedBase + sharedWidth]) / 4, // Green is 4-tap straight (+) sharedSrc[sharedBase], // Blue is given 255); // Top-right component; dst[srcY * width + srcX + 1] = RGBA::pack( ((int32_t)sharedSrc[sharedBase + 1 - sharedWidth] + (int32_t)sharedSrc[sharedBase + 1 + sharedWidth]) / 2, // Red is 2-tap vertical sharedSrc[sharedBase + 1], // Green is given ((int32_t)sharedSrc[sharedBase] + (int32_t)sharedSrc[sharedBase + 2]) / 2, // Blue is 2-tap horizontal 255); // Bottom-left component; dst[(srcY + 1) * width + srcX] = RGBA::pack( ((int32_t)sharedSrc[sharedBase + sharedWidth - 1] + (int32_t)sharedSrc[sharedBase + sharedWidth + 1]) / 2, // Red is 2-tap horizontal sharedSrc[sharedBase + sharedWidth], // Green is given ((int32_t)sharedSrc[sharedBase] + (int32_t)sharedSrc[sharedBase + 2 * sharedWidth]) / 2, // Blue is 2-tap vertical 255); // Bottom-right component dst[(srcY + 1) * width + srcX + 1] = RGBA::pack( sharedSrc[sharedBase + sharedWidth + 1], // Red is given ((int32_t)sharedSrc[sharedBase + 1] + (int32_t)sharedSrc[sharedBase + sharedWidth] + (int32_t)sharedSrc[sharedBase + 2 + sharedWidth] + (int32_t)sharedSrc[sharedBase + 1 + 2 * sharedWidth]) / 4, // Green is 4-tap straight (+) ((int32_t)sharedSrc[sharedBase] + (int32_t)sharedSrc[sharedBase + 2] + (int32_t)sharedSrc[sharedBase + 2 * sharedWidth] + (int32_t)sharedSrc[sharedBase + 2 + 2 * sharedWidth]) / 4, // Blue is is 4-tap 45° rotated (x) 255); } } /** * This kernel converts the buffer from Bayer-filtered GRBG to RGBA8888 out-of-place. * Pixels are all given full solidness (max alpha). * Uses bilinear interpolation within color planes. * * Each thread handles a 2*2 GRBG pixel block. The interpolation support is the 4*4 pixel block centered around the 2*2 * block. Globally each thread block needs an extra pixel around itself. * */ __global__ void convertBayerGRBGToRGBAKernel(uint32_t* __restrict__ dst, const unsigned char* __restrict__ src, unsigned width, unsigned height) { // x and y are the 2*2 block ids. const unsigned srcX = 2 * (blockIdx.x * blockDim.x + threadIdx.x); const unsigned srcY = 2 * (blockIdx.y * blockDim.y + threadIdx.y); // Load the data to shared memory. extern __shared__ unsigned char sharedSrc[]; const unsigned sharedWidth = loadBayerPattern(src, width, height, sharedSrc, srcX, srcY); __syncthreads(); if (srcX < width && srcY < height) { const int sharedBase = sharedWidth + 1 + 2 * (sharedWidth * threadIdx.y + threadIdx.x); // Top-left component; dst[srcY * width + srcX] = RGBA::pack( ((int32_t)sharedSrc[sharedBase - 1] + (int32_t)sharedSrc[sharedBase + 1]) / 2, // Red is 2-tap horizontal sharedSrc[sharedBase], // Green is given ((int32_t)sharedSrc[sharedBase - sharedWidth] + (int32_t)sharedSrc[sharedBase + sharedWidth]) / 2, // Blue is 2-tap vertical 255); // Top-right component; dst[srcY * width + srcX + 1] = RGBA::pack( sharedSrc[sharedBase + 1], // Red is given ((int32_t)sharedSrc[sharedBase] + (int32_t)sharedSrc[sharedBase + 2] + (int32_t)sharedSrc[sharedBase + 1 - sharedWidth] + (int32_t)sharedSrc[sharedBase + 1 + sharedWidth]) / 4, // Green is 4-tap straight (+) ((int32_t)sharedSrc[sharedBase - sharedWidth] + (int32_t)sharedSrc[sharedBase + 2 - sharedWidth] + (int32_t)sharedSrc[sharedBase + sharedWidth] + (int32_t)sharedSrc[sharedBase + 2 + sharedWidth]) / 4, // Blue is 4-tap 45° rotated (x) 255); // Bottom-left component; dst[(srcY + 1) * width + srcX] = RGBA::pack( ((int32_t)sharedSrc[sharedBase - 1] + (int32_t)sharedSrc[sharedBase + 1] + (int32_t)sharedSrc[sharedBase - 1 + 2 * sharedWidth] + (int32_t)sharedSrc[sharedBase + 1 + 2 * sharedWidth]) / 4, // Red is 4-tap 45° rotated (x) ((int32_t)sharedSrc[sharedBase - 1 + sharedWidth] + (int32_t)sharedSrc[sharedBase + 1 + sharedWidth] + (int32_t)sharedSrc[sharedBase] + (int32_t)sharedSrc[sharedBase + 2 * sharedWidth]) / 4, // Green is 4-tap straight (+) sharedSrc[sharedBase + sharedWidth], // Blue is given 255); // Bottom-right component dst[(srcY + 1) * width + srcX + 1] = RGBA::pack(((int32_t)sharedSrc[sharedBase + 1] + (int32_t)sharedSrc[sharedBase + 1 + 2 * sharedWidth]) / 2, // Red is 2-tap vertical sharedSrc[sharedBase + sharedWidth + 1], // Green is given ((int32_t)sharedSrc[sharedBase + sharedWidth] + (int32_t)sharedSrc[sharedBase + 2 + sharedWidth]) / 2, // Blue is 2-tap horizontal 255); } } /** * This kernel converts the buffer from Bayer-filtered GBRG to RGBA8888 out-of-place. * Pixels are all given full solidness (max alpha). * Uses bilinear interpolation within color planes. * * Each thread handles a 2*2 GBRG pixel block. The interpolation support is the 4*4 pixel block centered around the 2*2 * block. Globally each thread block needs an extra pixel around itself. * */ __global__ void convertBayerGBRGToRGBAKernel(uint32_t* __restrict__ dst, const unsigned char* __restrict__ src, unsigned width, unsigned height) { // x and y are the 2*2 block ids. const unsigned srcX = 2 * (blockIdx.x * blockDim.x + threadIdx.x); const unsigned srcY = 2 * (blockIdx.y * blockDim.y + threadIdx.y); // Load the data to shared memory. extern __shared__ unsigned char sharedSrc[]; const unsigned sharedWidth = loadBayerPattern(src, width, height, sharedSrc, srcX, srcY); __syncthreads(); if (srcX < width && srcY < height) { const int sharedBase = sharedWidth + 1 + 2 * (sharedWidth * threadIdx.y + threadIdx.x); // Top-left component; dst[srcY * width + srcX] = RGBA::pack( ((int32_t)sharedSrc[sharedBase - sharedWidth] + (int32_t)sharedSrc[sharedBase + sharedWidth]) / 2, // Red is 2-tap vertical sharedSrc[sharedBase], // Green is given ((int32_t)sharedSrc[sharedBase - 1] + (int32_t)sharedSrc[sharedBase + 1]) / 2, // Blue is 2-tap horizontal 255); // Top-right component; dst[srcY * width + srcX + 1] = RGBA::pack( ((int32_t)sharedSrc[sharedBase - sharedWidth] + (int32_t)sharedSrc[sharedBase + 2 - sharedWidth] + (int32_t)sharedSrc[sharedBase + sharedWidth] + (int32_t)sharedSrc[sharedBase + 2 + sharedWidth]) / 4, // Red is 4-tap 45° rotated (x) ((int32_t)sharedSrc[sharedBase] + (int32_t)sharedSrc[sharedBase + 2] + (int32_t)sharedSrc[sharedBase + 1 - sharedWidth] + (int32_t)sharedSrc[sharedBase + 1 + sharedWidth]) / 4, // Green is 4-tap straight (+) sharedSrc[sharedBase + 1], // Blue is given 255); // Bottom-left component; dst[(srcY + 1) * width + srcX] = RGBA::pack( sharedSrc[sharedBase + sharedWidth], // Red is given ((int32_t)sharedSrc[sharedBase - 1 + sharedWidth] + (int32_t)sharedSrc[sharedBase + 1 + sharedWidth] + (int32_t)sharedSrc[sharedBase] + (int32_t)sharedSrc[sharedBase + 2 * sharedWidth]) / 4, // Green is 4-tap straight (+) ((int32_t)sharedSrc[sharedBase - 1] + (int32_t)sharedSrc[sharedBase + 1] + (int32_t)sharedSrc[sharedBase - 1 + 2 * sharedWidth] + (int32_t)sharedSrc[sharedBase + 1 + 2 * sharedWidth]) / 4, // Blue is 4-tap 45° rotated (x) 255); // Bottom-right component dst[(srcY + 1) * width + srcX + 1] = RGBA::pack(((int32_t)sharedSrc[sharedBase + sharedWidth] + (int32_t)sharedSrc[sharedBase + 2 + sharedWidth]) / 2, // Red is 2-tap horizontal sharedSrc[sharedBase + sharedWidth + 1], // Green is given ((int32_t)sharedSrc[sharedBase + 1] + (int32_t)sharedSrc[sharedBase + 1 + 2 * sharedWidth]) / 2, // Blue is 2-tap vertical 255); } } /** * This kernel converts the buffer from BGR888 to RGBA8888 out-of-place. * Pixels are all given full solidness (max alpha) */ __global__ void convertBGRToRGBAKernel(uint32_t* __restrict__ dst, const unsigned char* __restrict__ src, unsigned width, unsigned height) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < height) { unsigned i = y * width + x; dst[i] = RGBA::pack(src[3 * i + 2], src[3 * i + 1], src[3 * i], 0xff); } } /** * These kernels converts the buffer from various YUV422 representations to RGBA8888 out-of-place. * Pixels are all given full solidness (max alpha). */ __global__ void convertUYVYToRGBAKernel(cudaSurfaceObject_t dst, const unsigned char* __restrict__ src, unsigned width, unsigned height) { // each thread is responsible for a 2x1 pixel group // Two bytes per pixel. Y0 U Y1 V // Read 2x (y0), 2x+1 (u), 2x+2 (y1) 2x+3 (v) // Write x, x+1 // Repeat for every line const unsigned pitch = width * 2; const unsigned x = 2 * (blockIdx.x * blockDim.x + threadIdx.x); const unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < height) { const unsigned char u = src[y * pitch + 2 * x]; // Two bytes per pixel. U Y0 V Y1 const unsigned char y0 = src[y * pitch + 2 * x + 1]; const unsigned char v = src[y * pitch + 2 * x + 2]; const unsigned char y1 = src[y * pitch + 2 * x + 3]; const RGBDiff rgbDiff(yuv444ToRGBDiff(u, v)); surf2Dwrite(YRGBDiffToRGBA(y0, rgbDiff), dst, x * 4, y); surf2Dwrite(YRGBDiffToRGBA(y1, rgbDiff), dst, (x + 1) * 4, y); } } __global__ void convertYUY2ToRGBAKernel(cudaSurfaceObject_t dst, const unsigned char* src, unsigned width, unsigned height) { // each thread is responsible for a 2x1 pixel group // Two bytes per pixel. Y0 U Y1 V // Read 2x (y0), 2x+1 (u), 2x+2 (y1) 2x+3 (v) // Write x, x+1 // Repeat for every line const unsigned pitch = width * 2; const unsigned x = 2 * (blockIdx.x * blockDim.x + threadIdx.x); const unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < height) { const unsigned char y0 = src[y * pitch + 2 * x]; // Two bytes per pixel. Y0 U Y1 V const unsigned char u = src[y * pitch + 2 * x + 1]; const unsigned char y1 = src[y * pitch + 2 * x + 2]; const unsigned char v = src[y * pitch + 2 * x + 3]; const RGBDiff rgbDiff(yuv444ToRGBDiff(u, v)); surf2Dwrite(YRGBDiffToRGBA(y0, rgbDiff), dst, x * 4, y); surf2Dwrite(YRGBDiffToRGBA(y1, rgbDiff), dst, (x + 1) * 4, y); } } /** * This kernel converts the buffer from 10 bits planar YUV422 to packed RGBA8888 out-of-place. * Each thread manages 2 pixels. * 10 bits values are padded to 16 bits, and are clamped to 8 bits during conversion * All pixels are solid. */ __global__ void convertYUV422P10ToRGBAKernel(cudaSurfaceObject_t dst, const uint16_t* src, unsigned width, unsigned height) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; const uint16_t* uSrc = src + width * height; const uint16_t* vSrc = uSrc + width * height / 2; if (x < width / 2 && y < height) { uint32_t y0 = src[y * width + 2 * x] >> 2; uint32_t y1 = src[y * width + 2 * x + 1] >> 2; uint32_t u = uSrc[y * (width / 2) + x] >> 2; uint32_t v = vSrc[y * (width / 2) + x] >> 2; const RGBDiff rgbDiff = yuv444ToRGBDiff(u, v); surf2Dwrite(YRGBDiffToRGBA(y0, rgbDiff), dst, (2 * x) * 4, y); surf2Dwrite(YRGBDiffToRGBA(y1, rgbDiff), dst, (2 * x + 1) * 4, y); } } /** * This kernel converts the buffer from planar 12 bits 4:2:0 (YV12) to packed RGBA8888 out-of-place. * All pixels are solid. */ __global__ void convertYV12ToRGBAKernel(cudaSurfaceObject_t dst, const unsigned char* src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group unsigned sx = blockIdx.x * blockDim.x + threadIdx.x; unsigned sy = blockIdx.y * blockDim.y + threadIdx.y; const unsigned char* uSrc = src + width * height; const unsigned char* vSrc = uSrc + (width * height) / 4; if (sx < width / 2 && sy < height / 2) { const RGBDiff rgbDiff(yuv444ToRGBDiff(uSrc[sy * (width / 2) + sx], vSrc[sy * (width / 2) + sx])); surf2Dwrite(YRGBDiffToRGBA(src[(2 * sy) * width + 2 * sx], rgbDiff), dst, (2 * sx) * 4, 2 * sy); surf2Dwrite(YRGBDiffToRGBA(src[(2 * sy) * width + 2 * sx + 1], rgbDiff), dst, (2 * sx + 1) * 4, 2 * sy); surf2Dwrite(YRGBDiffToRGBA(src[(2 * sy + 1) * width + 2 * sx], rgbDiff), dst, (2 * sx) * 4, 2 * sy + 1); surf2Dwrite(YRGBDiffToRGBA(src[(2 * sy + 1) * width + 2 * sx + 1], rgbDiff), dst, (2 * sx + 1) * 4, 2 * sy + 1); } } __global__ void convertKernelGrayscale(uint32_t* __restrict__ dst, const unsigned char* __restrict__ src, unsigned width, unsigned height) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < height) { unsigned i = y * width + x; dst[i] = RGBA::pack((uint32_t)src[i], (uint32_t)src[i], (uint32_t)src[i], 0xff); } } __global__ void convertGrayscaleKernel(cudaSurfaceObject_t dst, const unsigned char* __restrict__ src, unsigned width, unsigned height) { unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < height) { unsigned i = y * width + x; surf2Dwrite(RGBA::pack((uint32_t)src[i], (uint32_t)src[i], (uint32_t)src[i], 0xff), dst, x * 4, y); } } } // namespace Image } // namespace VideoStitch