// Copyright (c) 2012-2017 VideoStitch SAS // Copyright (c) 2018 stitchEm __global__ void downsampleMonoKernel(global_mem unsigned char* dst, const global_mem unsigned char* __restrict__ src, unsigned srcWidth, unsigned srcHeight, unsigned factor) { const unsigned dstX = get_global_id_x(); const unsigned dstY = get_global_id_y(); const unsigned srcX = dstX * factor; const unsigned srcY = dstY * factor; if (srcX < srcWidth && srcY < srcHeight) { int32_t v = 0; for (unsigned y = 0; y < factor; ++y) { for (unsigned x = 0; x < factor; ++x) { v += src[(srcY + y) * srcWidth + (srcX + x)]; } } const int div = factor * factor; dst[dstY * srcWidth / factor + dstX] = (unsigned char)(v / div); } } __global__ void downsampleRGBASurfKernel(surface_t dst, const read_only image2d_t src, unsigned dstWidth, unsigned dstHeight) { const unsigned x = get_global_id_x(); const unsigned y = get_global_id_y(); if (x < dstWidth && y < dstHeight) { float4 val = read_texture_vs(src, make_float2(x * 2.0f + 1.0f, y * 2.0f + 1.0f)); surface_write_f(val, dst, x, y); } } __global__ void downsampleRGBAKernel(global_mem unsigned char* __restrict__ dst, unsigned dstPitch, const global_mem unsigned char* __restrict__ src, unsigned srcPitch, unsigned srcWidth, unsigned srcHeight, unsigned factor) { const unsigned dstX = get_global_id_x(); const unsigned dstY = get_global_id_y(); const unsigned srcX = dstX * factor; const unsigned srcY = dstY * factor; if (srcX < srcWidth && srcY < srcHeight) { int32_t r = 0; int32_t g = 0; int32_t b = 0; int32_t a = 0; for (unsigned y = 0; y < factor; ++y) { for (unsigned x = 0; x < factor; ++x) { unsigned offset = (srcY + y) * srcPitch + 4 * (srcX + x); r += src[offset]; g += src[offset + 1]; b += src[offset + 2]; a += src[offset + 3]; } } const int div = factor * factor; dst[dstY * dstPitch + 4 * dstX] = (unsigned char)(r / div); dst[dstY * dstPitch + 4 * dstX + 1] = (unsigned char)(g / div); dst[dstY * dstPitch + 4 * dstX + 2] = (unsigned char)(b / div); unsigned char alpha = 0; // !!dst (always == 1 if this kernel runs) used to promote the constant 4 * 255 to a variable to work around Intel // compiler error see confluence "OpenCL runtime compilation failure cases" if (a == 4 * 255 * !!dst) { alpha = 255; } dst[dstY * dstPitch + 4 * dstX + 3] = alpha; } } __global__ void downsampleYUV422Kernel(global_mem unsigned char* __restrict__ dst, unsigned dstPitch, const global_mem unsigned char* __restrict__ src, unsigned srcPitch, unsigned srcWidth, unsigned srcHeight, unsigned factor) { // each thread accumulates for 2 (horizontally) consecutive destination pixels // coords are in pixels const unsigned dstX = 2 * get_global_id_x(); const unsigned dstY = get_global_id_y(); const unsigned srcX = dstX * factor; const unsigned srcY = dstY * factor; // width and height are in bytes, here 2 bytes / pixel if (srcX < srcWidth && srcY < srcHeight) { int32_t u = 0; int32_t y0 = 0; int32_t v = 0; int32_t y1 = 0; for (unsigned y = 0; y < factor; ++y) { for (unsigned x = 0; x < factor / 2; ++x) { unsigned offset = (srcY + y) * srcPitch + 2 * (srcX + 2 * x); u += src[offset]; y0 += src[offset + 1]; v += src[offset + 2]; y0 += src[offset + 3]; } for (unsigned x = factor / 2; x < factor; ++x) { unsigned offset = (srcY + y) * srcPitch + 2 * (srcX + 2 * x); u += src[offset]; y1 += src[offset + 1]; v += src[offset + 2]; y1 += src[offset + 3]; } } const int div = factor * factor; dst[dstY * dstPitch + 2 * dstX] = (unsigned char)(u / div); dst[dstY * dstPitch + 2 * dstX + 1] = (unsigned char)(y0 / div); dst[dstY * dstPitch + 2 * dstX + 2] = (unsigned char)(v / div); dst[dstY * dstPitch + 2 * dstX + 3] = (unsigned char)(y1 / div); } } // This one is pretty bad because accesses are not aligned on word boundaries. __global__ void downsampleRGBKernel(global_mem unsigned char* __restrict__ dst, unsigned dstPitch, const global_mem unsigned char* __restrict__ src, unsigned srcPitch, unsigned srcWidth, unsigned srcHeight, unsigned factor) { const unsigned dstX = get_global_id_x(); const unsigned dstY = get_global_id_y(); const unsigned srcX = dstX * factor; const unsigned srcY = dstY * factor; if (srcX < srcWidth && srcY < srcHeight) { int32_t r = 0; int32_t g = 0; int32_t b = 0; for (unsigned y = 0; y < factor; ++y) { for (unsigned x = 0; x < factor; ++x) { unsigned offset = (srcY + y) * srcPitch + 3 * (srcX + x); r += src[offset]; g += src[offset + 1]; b += src[offset + 2]; } } const int div = factor * factor; dst[dstY * dstPitch + 3 * dstX] = (unsigned char)(r / div); dst[dstY * dstPitch + 3 * dstX + 1] = (unsigned char)(g / div); dst[dstY * dstPitch + 3 * dstX + 2] = (unsigned char)(b / div); } } // downsamplePlanarImpl_uint8_t #define BufferType uint8_t #include "downsampler.gpu.incl" #undef BufferType // downsamplePlanarImpl_uint16_t #define BufferType uint16_t #include "downsampler.gpu.incl" #undef BufferType __global__ void downsamplePlanarKernel(global_mem unsigned char* __restrict__ dst, unsigned dstPitch, const global_mem unsigned char* __restrict__ src, unsigned srcPitch, unsigned srcWidth, unsigned srcHeight, unsigned factor) { downsamplePlanarImpl_uint8_t(dst, dstPitch, src, srcPitch, srcWidth, srcHeight, factor); } __global__ void downsampleYV12Kernel(global_mem unsigned char* __restrict__ yDst, unsigned yDstPitch, global_mem unsigned char* __restrict__ uDst, unsigned uDstPitch, global_mem unsigned char* __restrict__ vDst, unsigned vDstPitch, const global_mem unsigned char* __restrict__ ySrc, unsigned ySrcPitch, const global_mem unsigned char* __restrict__ uSrc, unsigned uSrcPitch, const global_mem unsigned char* __restrict__ vSrc, unsigned vSrcPitch, unsigned srcWidth, unsigned srcHeight, unsigned factor) { downsamplePlanarImpl_uint8_t(yDst, yDstPitch, ySrc, ySrcPitch, srcWidth, srcHeight, factor); downsamplePlanarImpl_uint8_t(uDst, uDstPitch, uSrc, uSrcPitch, srcWidth / 2, srcHeight / 2, factor); downsamplePlanarImpl_uint8_t(vDst, vDstPitch, vSrc, vSrcPitch, srcWidth / 2, srcHeight / 2, factor); } __device__ void downsampleInterleavedUV(global_mem unsigned char* __restrict__ dst, unsigned dstPitch, const global_mem unsigned char* __restrict__ src, unsigned srcPitch, unsigned srcWidth, unsigned srcHeight, unsigned factor) { // two interleaved dimensions, U and V // each thread manages two pixels of the destination const unsigned dstX = 2 * get_global_id_x(); const unsigned dstY = get_global_id_y(); const unsigned srcX = dstX * factor; const unsigned srcY = dstY * factor; if (srcX < srcWidth && srcY < srcHeight) { int32_t u = 0; int32_t v = 0; for (unsigned y = 0; y < factor; ++y) { for (unsigned x = 0; x < 2 * factor; x += 2) { unsigned uoffset = (srcY + y) * srcPitch + (srcX + x); unsigned voffset = uoffset + 1; u += src[uoffset]; v += src[voffset]; } } const int div = factor * factor; dst[dstY * dstPitch + dstX] = (unsigned char)(u / div); dst[dstY * dstPitch + dstX + 1] = (unsigned char)(v / div); } } __global__ void downsampleNV12Kernel(global_mem unsigned char* __restrict__ yDst, unsigned yDstPitch, global_mem unsigned char* __restrict__ uvDst, unsigned uvDstPitch, const global_mem unsigned char* __restrict__ ySrc, unsigned ySrcPitch, const global_mem unsigned char* __restrict__ uvSrc, unsigned uvSrcPitch, unsigned srcWidth, unsigned srcHeight, unsigned factor) { downsamplePlanarImpl_uint8_t(yDst, yDstPitch, ySrc, ySrcPitch, srcWidth, srcHeight, factor); downsampleInterleavedUV(uvDst, uvDstPitch, uvSrc, uvSrcPitch, srcWidth, srcHeight / 2, factor); } __global__ void downsampleYUV422P10Kernel(global_mem unsigned char* __restrict__ yDst, unsigned yDstPitch, global_mem unsigned char* __restrict__ uDst, unsigned uDstPitch, global_mem unsigned char* __restrict__ vDst, unsigned vDstPitch, const global_mem unsigned char* __restrict__ ySrc, unsigned ySrcPitch, const global_mem unsigned char* __restrict__ uSrc, unsigned uSrcPitch, const global_mem unsigned char* __restrict__ vSrc, unsigned vSrcPitch, unsigned srcWidth, unsigned srcHeight, unsigned factor) { downsamplePlanarImpl_uint16_t((global_mem uint16_t*)yDst, yDstPitch / 2, (const global_mem uint16_t*)ySrc, ySrcPitch / 2, srcWidth, srcHeight, factor); downsamplePlanarImpl_uint16_t((global_mem uint16_t*)uDst, uDstPitch / 2, (const global_mem uint16_t*)uSrc, uSrcPitch / 2, srcWidth / 2, srcHeight, factor); downsamplePlanarImpl_uint16_t((global_mem uint16_t*)vDst, vDstPitch / 2, (const global_mem uint16_t*)vSrc, vSrcPitch / 2, srcWidth / 2, srcHeight, factor); }