// Copyright (c) 2012-2017 VideoStitch SAS // Copyright (c) 2018 stitchEm #include "../gpuKernelDef.h" #include "imageFormat.h" // change from CUDA: RGBDiff is value, not reference static inline float4 YRGBDiffToRGBA(unsigned char y, const int3 rgbDiff) { const int ya = (1192 * (y - 16)) >> 10; return (float4){clamp8(ya + rgbDiff.x) / 255.f, clamp8(ya + rgbDiff.y) / 255.f, clamp8(ya + rgbDiff.z) / 255.f, 1.f}; } #define nv12_surface_write surface_write_f #include "backend/common/image/unpack.gpu" static __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; kernel void unpackKernelRGB(global unsigned char* dst, unsigned pitch, global const unsigned int* src, unsigned width, unsigned height) { unsigned x = (unsigned)get_global_id(0); unsigned y = (unsigned)get_global_id(1); if (x < width && y < height) { unsigned int val = src[y * width + x]; dst[y * pitch + 3 * x] = (unsigned char)Image_RGBA_r(val); dst[y * pitch + 3 * x + 1] = (unsigned char)Image_RGBA_g(val); dst[y * pitch + 3 * x + 2] = (unsigned char)Image_RGBA_b(val); } } kernel void unpackKernelRGBSource(global unsigned char* dst, unsigned pitch, read_only image2d_t src, unsigned width, unsigned height) { unsigned x = (unsigned)get_global_id(0); unsigned y = (unsigned)get_global_id(1); if (x < width && y < height) { float4 val = read_imagef(src, sampler, (int2)(x, y)); dst[y * pitch + 3 * x] = (unsigned char)(val.x * 255.f); dst[y * pitch + 3 * x + 1] = (unsigned char)(val.y * 255.f); dst[y * pitch + 3 * x + 2] = (unsigned char)(val.z * 255.f); } } kernel void unpackKernelGrayscaleSource(global unsigned char* dst, unsigned pitch, read_only image2d_t src, unsigned width, unsigned height) { unsigned x = (unsigned)get_global_id(0); unsigned y = (unsigned)get_global_id(1); if (x < width && y < height) { float4 px = read_imagef(src, sampler, (int2)(x, y)); dst[y * pitch + x] = (unsigned char)clamp8((int)(65.481f * px.x + 128.553f * px.y + 24.966f * px.z + 16.5f)); } } kernel void unpackKernelRGBA(global unsigned char* dst, unsigned pitch, read_only image2d_t src, unsigned width, unsigned height) { unsigned x = (unsigned)get_global_id(0); unsigned y = (unsigned)get_global_id(1); if (x < width && y < height) { float4 val = read_imagef(src, sampler, (int2)(x, y)); dst[y * pitch + 4 * x] = (unsigned char)(val.x * 255.f); dst[y * pitch + 4 * x + 1] = (unsigned char)(val.y * 255.f); dst[y * pitch + 4 * x + 2] = (unsigned char)(val.z * 255.f); dst[y * pitch + 4 * x + 3] = (unsigned char)(val.w * 255.f); } } kernel void unpackKernelF32C1(global float* dst, unsigned pitch, read_only image2d_t src, unsigned width, unsigned height) { unsigned x = (unsigned)get_global_id(0); unsigned y = (unsigned)get_global_id(1); if (x < width && y < height) { float4 val = read_imagef(src, sampler, (int2)(x, y)); dst[y * pitch + x] = val.x; } } kernel void unpackKernelDepthSource(global unsigned char* yDst, unsigned yPitch, global unsigned char* uDst, unsigned uPitch, global unsigned char* vDst, unsigned vPitch, read_only image2d_t src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group unsigned x = 2 * (unsigned)get_global_id(0); unsigned y = 2 * (unsigned)get_global_id(1); int u = 0; int v = 0; if (x < width && y < height) { for (int i = 0; i < 2; i++) { for (int j = 0; j < 2; j++) { float4 depth = read_imagef(src, sampler, (int2)(x + i, y + j)); // convert to millimeters and truncate unsigned int val = (unsigned int)min(convert_uint_sat_rtn(depth.x * 1000.f), (unsigned int)65279); // encode yDst[(y + j) * yPitch + x + i] = (unsigned char)(val / 256); int cu = val % 512; int cv = (val + 384) % 512; if (cu >= 256) { u += (unsigned char)(511 - cu); } else { u += (unsigned char)cu; } if (cv >= 256) { v += (unsigned char)(511 - cv); } else { v += (unsigned char)cv; } } } uDst[(y * uPitch + x) / 2] = (unsigned char)((u + 2) / 4); vDst[(y * vPitch + x) / 2] = (unsigned char)((v + 2) / 4); } } kernel void unpackKernelDepth(global unsigned char* yDst, unsigned yPitch, global unsigned char* uDst, unsigned uPitch, global unsigned char* vDst, unsigned vPitch, global const float* src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group unsigned x = 2 * (unsigned)get_global_id(0); unsigned y = 2 * (unsigned)get_global_id(1); int u = 0; int v = 0; if (x < width && y < height) { for (int i = 0; i < 2; i++) { for (int j = 0; j < 2; j++) { float depth = src[(y + j) * width + x + i]; // convert to millimeters and truncate unsigned int val = (unsigned int)min(convert_uint_sat_rtn(depth * 1000.f), (unsigned int)65279); // encode yDst[(y + j) * yPitch + x + i] = (unsigned char)(val / 256); int cu = val % 512; int cv = (val + 384) % 512; if (cu >= 256) { u += (unsigned char)(511 - cu); } else { u += (unsigned char)cu; } if (cv >= 256) { v += (unsigned char)(511 - cv); } else { v += (unsigned char)cv; } } } uDst[(y * uPitch + x) / 2] = (unsigned char)((u + 2) / 4); vDst[(y * vPitch + x) / 2] = (unsigned char)((v + 2) / 4); } } /** * This kernel converts the buffer from RGBA to planar 12 bits 4:2:0 (YV12) out-of-place. * The conversion is undefined for pixels with 0 alpha. * * Y0 Y1 Y2 Y3 * ... * U0 U1 * ... * V0 V1 * ... */ kernel void unpackKernelYV12(global unsigned char* yDst, unsigned yPitch, global unsigned char* uDst, unsigned uPitch, global unsigned char* vDst, unsigned vPitch, global const unsigned int* src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group unsigned sx = 2 * (unsigned)get_global_id(0); unsigned sy = 2 * (unsigned)get_global_id(1); int u = 0; int v = 0; if (sx < width && sy < height) { { unsigned int val = src[sy * width + sx]; int r = Image_RGBA_r(val); int g = Image_RGBA_g(val); int b = Image_RGBA_b(val); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } if (sx + 1 < width && sy + 1 < height) { // general case { unsigned int val = src[sy * width + sx + 1]; int r = Image_RGBA_r(val); int g = Image_RGBA_g(val); int b = Image_RGBA_b(val); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { unsigned int val = src[(sy + 1) * width + sx]; int r = Image_RGBA_r(val); int g = Image_RGBA_g(val); int b = Image_RGBA_b(val); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { unsigned int val = src[(sy + 1) * width + sx + 1]; int r = Image_RGBA_r(val); int g = Image_RGBA_g(val); int b = Image_RGBA_b(val); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx + 1] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } uDst[(sy * uPitch + sx) / 2] = (unsigned char)(u / 4); vDst[(sy * vPitch + sx) / 2] = (unsigned char)(v / 4); } else { // border case with odd width / height if (sx + 1 < width) { unsigned int val = src[sy * width + sx + 1]; int r = Image_RGBA_r(val); int g = Image_RGBA_g(val); int b = Image_RGBA_b(val); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uDst[(sy * uPitch + sx) / 2] = (unsigned char)(u / 2); vDst[(sy * vPitch + sx) / 2] = (unsigned char)(v / 2); } } } barrier(CLK_GLOBAL_MEM_FENCE); if (sx == (width - 1) && sy + 1 < height) { unsigned int val = src[(sy + 1) * width + sx]; int r = Image_RGBA_r(val); int g = Image_RGBA_g(val); int b = Image_RGBA_b(val); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * width + sx] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uDst[(sy * ((width + 1) / 2) + sx) / 2] = (unsigned char)(u / 2); vDst[(sy * ((width + 1) / 2) + sx) / 2] = (unsigned char)(v / 2); } } kernel void unpackKernelYV12Source(global unsigned char* yDst, unsigned yPitch, global unsigned char* uDst, unsigned uPitch, global unsigned char* vDst, unsigned vPitch, read_only image2d_t src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group unsigned sx = 2 * (unsigned)get_global_id(0); unsigned sy = 2 * (unsigned)get_global_id(1); int u = 0; int v = 0; if (sx < width && sy < height) { { float4 val = read_imagef(src, sampler, (int2)(sx, sy)); int r = (int)(val.x * 255.f); int g = (int)(val.y * 255.f); int b = (int)(val.z * 255.f); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } if (sx + 1 < width && sy + 1 < height) { // general case { float4 val = read_imagef(src, sampler, (int2)(sx + 1, sy)); int r = (int)(val.x * 255.f); int g = (int)(val.y * 255.f); int b = (int)(val.z * 255.f); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { float4 val = read_imagef(src, sampler, (int2)(sx, sy + 1)); int r = (int)(val.x * 255.f); int g = (int)(val.y * 255.f); int b = (int)(val.z * 255.f); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { float4 val = read_imagef(src, sampler, (int2)(sx + 1, sy + 1)); int r = (int)(val.x * 255.f); int g = (int)(val.y * 255.f); int b = (int)(val.z * 255.f); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx + 1] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } uDst[(sy * uPitch + sx) / 2] = (unsigned char)(u / 4); vDst[(sy * vPitch + sx) / 2] = (unsigned char)(v / 4); } else { // border case with odd width / height if (sx + 1 < width) { float4 val = read_imagef(src, sampler, (int2)(sx + 1, sy)); int r = (int)(val.x * 255.f); int g = (int)(val.y * 255.f); int b = (int)(val.z * 255.f); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uDst[(sy * uPitch + sx) / 2] = (unsigned char)(u / 2); vDst[(sy * vPitch + sx) / 2] = (unsigned char)(v / 2); } } } barrier(CLK_GLOBAL_MEM_FENCE); if (sx == (width - 1) && sy + 1 < height) { float4 val = read_imagef(src, sampler, (int2)(sx, sy + 1)); int r = (int)(val.x * 255.f); int g = (int)(val.y * 255.f); int b = (int)(val.z * 255.f); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uDst[(sy * uPitch + sx) / 2] = (unsigned char)(u / 2); vDst[(sy * vPitch + sx) / 2] = (unsigned char)(v / 2); } } /** * This kernel converts the buffer from RGBA to interleaved 12 bits 4:2:0 (NV12) out-of-place. * The conversion is undefined for pixels with 0 alpha. * * Y0 Y1 Y2 Y3 * ... * U0 V0 U1 V1 * ... */ kernel void unpackKernelNV12(global unsigned char* yDst, unsigned yPitch, global unsigned char* uvDst, unsigned uvPitch, global const unsigned int* src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group unsigned sx = 2 * (unsigned)get_global_id(0); unsigned sy = 2 * (unsigned)get_global_id(1); int u = 0; int v = 0; if (sx < width && sy < height) { { unsigned int val = src[sy * width + sx]; int r = Image_RGBA_r(val); int g = Image_RGBA_g(val); int b = Image_RGBA_b(val); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } if (sx + 1 < width && sy + 1 < height) { // general case { unsigned int val = src[sy * width + sx + 1]; int r = Image_RGBA_r(val); int g = Image_RGBA_g(val); int b = Image_RGBA_b(val); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { unsigned int val = src[(sy + 1) * width + sx]; int r = Image_RGBA_r(val); int g = Image_RGBA_g(val); int b = Image_RGBA_b(val); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { unsigned int val = src[(sy + 1) * width + sx + 1]; int r = Image_RGBA_r(val); int g = Image_RGBA_g(val); int b = Image_RGBA_b(val); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx + 1] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } uvDst[(sy * uvPitch / 2) + sx] = (unsigned char)(u / 4); uvDst[(sy * uvPitch / 2) + sx + 1] = (unsigned char)(v / 4); } else { // border case with odd width / height if (sx + 1 < width) { unsigned int val = src[sy * width + sx + 1]; int r = Image_RGBA_r(val); int g = Image_RGBA_g(val); int b = Image_RGBA_b(val); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uvDst[(sy * width) / 2 + sx] = (unsigned char)(u / 2); uvDst[(sy * width) / 2 + sx + 1] = (unsigned char)(v / 2); } } } barrier(CLK_GLOBAL_MEM_FENCE); if (sx == (width - 1) && sy + 1 < height) { unsigned int val = src[(sy + 1) * width + sx]; int r = Image_RGBA_r(val); int g = Image_RGBA_g(val); int b = Image_RGBA_b(val); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * width + sx] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uvDst[(sy * width) / 2 + sx] = (unsigned char)(u / 2); uvDst[(sy * width) / 2 + sx + 1] = (unsigned char)(v / 2); } } kernel void unpackKernelNV12Source(global unsigned char* yDst, unsigned yPitch, global unsigned char* uvDst, unsigned uvPitch, read_only image2d_t src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group unsigned sx = 2 * (unsigned)get_global_id(0); unsigned sy = 2 * (unsigned)get_global_id(1); int u = 0; int v = 0; if (sx < width && sy < height) { { float4 val = read_imagef(src, sampler, (int2)(sx, sy)); int r = (int)(val.x * 255.f); int g = (int)(val.y * 255.f); int b = (int)(val.z * 255.f); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } if (sx + 1 < width && sy + 1 < height) { // general case { float4 val = read_imagef(src, sampler, (int2)(sx + 1, sy)); int r = (int)(val.x * 255.f); int g = (int)(val.y * 255.f); int b = (int)(val.z * 255.f); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { float4 val = read_imagef(src, sampler, (int2)(sx, sy + 1)); int r = (int)(val.x * 255.f); int g = (int)(val.y * 255.f); int b = (int)(val.z * 255.f); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } { float4 val = read_imagef(src, sampler, (int2)(sx + 1, sy + 1)); int r = (int)(val.x * 255.f); int g = (int)(val.y * 255.f); int b = (int)(val.z * 255.f); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx + 1] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; } uvDst[(sy * uvPitch / 2) + sx] = (unsigned char)(u / 4); uvDst[(sy * uvPitch / 2) + sx + 1] = (unsigned char)(v / 4); } else { // border case with odd width / height if (sx + 1 < width) { float4 val = read_imagef(src, sampler, (int2)(sx + 1, sy)); int r = (int)(val.x * 255.f); int g = (int)(val.y * 255.f); int b = (int)(val.z * 255.f); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[sy * yPitch + sx + 1] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uvDst[(sy * width) / 2 + sx] = (unsigned char)(u / 2); uvDst[(sy * width) / 2 + sx + 1] = (unsigned char)(v / 2); } } } barrier(CLK_GLOBAL_MEM_FENCE); if (sx == (width - 1) && sy + 1 < height) { float4 val = read_imagef(src, sampler, (int2)(sx, sy + 1)); int r = (int)(val.x * 255.f); int g = (int)(val.y * 255.f); int b = (int)(val.z * 255.f); int y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; yDst[(sy + 1) * yPitch + sx] = (unsigned char)y; u += ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; v += ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; uvDst[(sy * uvPitch) / 2 + sx] = (unsigned char)(u / 2); uvDst[(sy * uvPitch) / 2 + sx + 1] = (unsigned char)(v / 2); } } /** * This kernel converts the buffer from RGBA to 10 bits planar YUV422 out-of-place. * Pixels are all given full solidness (max alpha). * 10 bits values are padded to 16 bits. */ kernel void unpackYUV422P10Kernel(global unsigned short* yDst, unsigned yPitch, global unsigned short* uDst, unsigned uPitch, global unsigned short* vDst, unsigned vPitch, global const unsigned int* src, unsigned width, unsigned height) { unsigned x = (unsigned)get_global_id(0); unsigned y = (unsigned)get_global_id(1); if (x < width / 2 && y < height) { unsigned int val0 = src[y * width + 2 * x]; int r0 = Image_RGBA_r(val0); int g0 = Image_RGBA_g(val0); int b0 = Image_RGBA_b(val0); unsigned int val1 = src[y * width + 2 * x + 1]; int r1 = Image_RGBA_r(val1); int g1 = Image_RGBA_g(val1); int b1 = Image_RGBA_b(val1); unsigned int u = 0, v = 0; int y0 = (((66 * r0 + 129 * g0 + 25 * b0 + 128) >> 8) + 16) << 2; int y1 = (((66 * r1 + 129 * g1 + 25 * b1 + 128) >> 8) + 16) << 2; u += (((-38 * r0 - 74 * g0 + 112 * b0 + 128) >> 8) + 128) << 2; u += (((-38 * r1 - 74 * g1 + 112 * b1 + 128) >> 8) + 128) << 2; v += (((112 * r0 - 94 * g0 - 18 * b0 + 128) >> 8) + 128) << 2; v += (((112 * r1 - 94 * g1 - 18 * b1 + 128) >> 8) + 128) << 2; yDst[y * yPitch + 2 * x] = (unsigned short)y0; yDst[y * yPitch + 2 * x + 1] = (unsigned short)y1; uDst[y * uPitch + x] = (unsigned short)(u / 2); vDst[y * vPitch + x] = (unsigned short)(v / 2); } } /** * This kernel converts the buffer from planar 12 bits 4:2:0 (YV12) to packed RGBA8888 out-of-place. * All pixels are solid. */ kernel void convertYV12ToRGBAKernel(write_only image2d_t dst, global const unsigned char* src, unsigned width, unsigned height) { // each thread is responsible for a 2x2 pixel group const unsigned sx = (unsigned)get_global_id(0); const unsigned sy = (unsigned)get_global_id(1); global const unsigned char* uSrc = src + width * height; global const unsigned char* vSrc = uSrc + (width * height) / 4; if (sx < width / 2 && sy < height / 2) { const int3 rgbDiff = yuv444ToRGBDiff(uSrc[sy * (width / 2) + sx], vSrc[sy * (width / 2) + sx]); { int2 coords = {2 * sx, 2 * sy}; write_imagef(dst, coords, YRGBDiffToRGBA(src[coords.x + coords.y * width], rgbDiff)); } { int2 coords = {2 * sx + 1, 2 * sy}; write_imagef(dst, coords, YRGBDiffToRGBA(src[coords.x + coords.y * width], rgbDiff)); } { int2 coords = {2 * sx, 2 * sy + 1}; write_imagef(dst, coords, YRGBDiffToRGBA(src[coords.x + coords.y * width], rgbDiff)); } { int2 coords = {2 * sx + 1, 2 * sy + 1}; write_imagef(dst, coords, YRGBDiffToRGBA(src[coords.x + coords.y * width], rgbDiff)); } } } kernel void convertYUY2ToRGBAKernel(write_only image2d_t dst, global const unsigned char* src, unsigned width, unsigned height) { // each thread is responsible for a 2x1 pixel group // Two bytes per pixel. Y0 U Y1 V // Read 2x (y0), 2x+1 (u), 2x+2 (y1) 2x+3 (v) // Write x, x+1 // Repeat for every line const unsigned pitch = width * 2; const unsigned x = 2 * (unsigned)get_global_id(0); const unsigned y = (unsigned)get_global_id(1); if (x < width && y < height) { const unsigned char y0 = src[y * pitch + 2 * x]; // Two bytes per pixel. Y0 U Y1 V const unsigned char u = src[y * pitch + 2 * x + 1]; const unsigned char y1 = src[y * pitch + 2 * x + 2]; const unsigned char v = src[y * pitch + 2 * x + 3]; const int3 rgbDiff = yuv444ToRGBDiff(u, v); { int2 coords = {x, y}; write_imagef(dst, coords, YRGBDiffToRGBA(y0, rgbDiff)); } { int2 coords = {x + 1, y}; write_imagef(dst, coords, YRGBDiffToRGBA(y1, rgbDiff)); } } } kernel void convertRGB210ToRGBAKernel(write_only image2d_t dst, global const unsigned* src, unsigned width, unsigned height) { unsigned x = (unsigned)get_global_id(0); unsigned y = (unsigned)get_global_id(1); if (x < width && y < height) { unsigned v = src[y * width + x]; int2 coords = {x, y}; write_imagef(dst, coords, (float4)(clamp8(Image_RGB210_r(v)) / 255.f, clamp8(Image_RGB210_g(v)) / 255.f, clamp8(Image_RGB210_b(v)) / 255.f, Image_RGB210_a(v) / 255.f)); } } kernel void convertRGBToRGBAKernel(write_only image2d_t dst, global const unsigned char* src, unsigned width, unsigned height) { unsigned x = (unsigned)get_global_id(0); unsigned y = (unsigned)get_global_id(1); if (x < width && y < height) { write_imagef( dst, (int2)(x, y), (float4)(src[y * 3 * width + 3 * x], src[y * 3 * width + 3 * x + 1], src[y * 3 * width + 3 * x + 2], 255.f) / 255.f); } } /** * This kernel converts the buffer from 10 bits planar YUV422 to packed RGBA8888 out-of-place. * Each thread manages 2 pixels. * 10 bits values are padded to 16 bits, and are clamped to 8 bits during conversion * All pixels are solid. */ kernel void convertYUV422P10ToRGBAKernel(write_only image2d_t dst, global const unsigned short* src, unsigned width, unsigned height) { unsigned x = (unsigned)get_global_id(0); unsigned y = (unsigned)get_global_id(1); global const unsigned short* uSrc = src + width * height; global const unsigned short* vSrc = uSrc + width * height / 2; if (x < width / 2 && y < height) { unsigned int y0 = (src[y * width + 2 * x]) >> 2; unsigned int y1 = (src[y * width + 2 * x + 1]) >> 2; unsigned int u = (uSrc[y * (width / 2) + x]) >> 2; unsigned int v = (vSrc[y * (width / 2) + x]) >> 2; const RGBDiff rgbDiff = yuv444ToRGBDiff((unsigned char)u, (unsigned char)v); write_imagef(dst, (int2)((2 * x), y), YRGBDiffToRGBA((unsigned char)y0, rgbDiff)); write_imagef(dst, (int2)((2 * x + 1), y), YRGBDiffToRGBA((unsigned char)y1, rgbDiff)); } } /** * This kernel converts the buffer from 8 bits monochrome Grayscale to packed RGBA8888 out-of-place. * Each thread manages 2 pixels. * All pixels are solid. */ // TODO Not tested kernel void convertGrayscaleToRGBAKernel(write_only image2d_t dst, global const unsigned short* src, unsigned width, unsigned height) { unsigned x = (unsigned)get_global_id(0); unsigned y = (unsigned)get_global_id(1); if (x < width && y < height) { unsigned int lum = *(src + width * y + x); float4 rgbaVal = (float4)(lum, lum, lum, 1.0f); write_imagef(dst, (int2)((4 * x), y), rgbaVal); } }