sampling.gpu 15 KB
// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm

#define BLOCK_SIZE 16

// ------------------- Subsampling

/**
 * Subsample src by a factor of two on each dimension and put it in into dst.
 * src has size (srcWidth * srcHeight), dst has size ((srcWidth + 1)/2 * (srcHeight + 1)/2)
 */
__global__ void subsample22RegularKernel(global_mem unsigned char* __restrict__ dst,
                                         global_mem const unsigned char* __restrict__ src, unsigned srcWidth,
                                         unsigned srcHeight, unsigned dstWidth, unsigned dstHeight) {
  unsigned dstX = get_global_id_x();
  unsigned dstY = get_global_id_y();
  unsigned srcX = dstX * 2;
  unsigned srcY = dstY * 2;
  if (dstX < dstWidth && dstY < dstHeight && srcX + 1 < srcWidth && srcY + 1 < srcHeight) {
    float v = src[srcY * srcWidth + srcX];
    v += src[srcY * srcWidth + srcX + 1];
    v += src[(srcY + 1) * srcWidth + srcX];
    v += src[(srcY + 1) * srcWidth + srcX + 1];
    dst[dstY * dstWidth + dstX] = (unsigned char)(v / 4.0f);
  }
}

__global__ void subsample22RegularKernelFloat2(global_mem float2* __restrict__ dst,
                                               global_mem const float2* __restrict__ src, unsigned srcWidth,
                                               unsigned srcHeight, unsigned dstWidth, unsigned dstHeight) {
  unsigned dstX = get_global_id_x();
  unsigned dstY = get_global_id_y();
  unsigned srcX = dstX * 2;
  unsigned srcY = dstY * 2;
  if (dstX < dstWidth && dstY < dstHeight && srcX + 1 < srcWidth && srcY + 1 < srcHeight) {
    float2 v = src[srcY * srcWidth + srcX];
    v += src[srcY * srcWidth + srcX + 1];
    v += src[(srcY + 1) * srcWidth + srcX];
    v += src[(srcY + 1) * srcWidth + srcX + 1];
    dst[dstY * dstWidth + dstX] = v / 4.0f;
  }
}

/**
 * The pixels for this kernel have a bottom neighbour.
 */
__global__ void subsample22RightBoundaryKernel(global_mem unsigned char* __restrict__ dst,
                                               global_mem const unsigned char* __restrict__ src, unsigned srcWidth,
                                               unsigned srcHeight, unsigned dstWidth, unsigned dstHeight) {
  unsigned dstX = (srcWidth - 1) / 2;
  unsigned dstY = get_global_id_y();
  unsigned srcX = srcWidth - 1;
  unsigned srcY = dstY * 2;
  if (dstX < dstWidth && dstY < dstHeight && srcY + 1 < srcHeight) {
    float v = src[srcY * srcWidth + srcX];
    v += src[(srcY + 1) * srcWidth + srcX];
    dst[dstY * dstWidth + dstX] = (unsigned char)(v / 2.0f);
  }
}

__global__ void subsample22RightBoundaryKernelFloat2(global_mem float2* __restrict__ dst,
                                                     global_mem const float2* __restrict__ src, unsigned srcWidth,
                                                     unsigned srcHeight, unsigned dstWidth, unsigned dstHeight) {
  unsigned dstX = (srcWidth - 1) / 2;
  unsigned dstY = get_global_id_y();
  unsigned srcX = srcWidth - 1;
  unsigned srcY = dstY * 2;
  if (dstX < dstWidth && dstY < dstHeight && srcY + 1 < srcHeight) {
    float2 v = src[srcY * srcWidth + srcX];
    v += src[(srcY + 1) * srcWidth + srcX];
    dst[dstY * dstWidth + dstX] = v / 2.0f;
  }
}

/**
 * The pixels for this kernel have a right neighbour
 */
__global__ void subsample22BottomBoundaryKernel(global_mem unsigned char* __restrict__ dst,
                                                global_mem const unsigned char* __restrict__ src, unsigned srcWidth,
                                                unsigned srcHeight, unsigned dstWidth, unsigned dstHeight) {
  unsigned dstX = get_global_id_x();
  unsigned dstY = (srcHeight - 1) / 2;
  unsigned srcX = dstX * 2;
  unsigned srcY = srcHeight - 1;
  if (dstX < dstWidth && dstY < dstHeight && srcX + 1 < srcWidth) {
    float v = src[srcY * srcWidth + srcX];
    v += src[srcY * srcWidth + srcX + 1];
    dst[dstY * dstWidth + dstX] = (unsigned char)(v / 2.0f);
  }
}

__global__ void subsample22BottomBoundaryKernelFloat2(global_mem float2* __restrict__ dst,
                                                      global_mem const float2* __restrict__ src, unsigned srcWidth,
                                                      unsigned srcHeight, unsigned dstWidth, unsigned dstHeight) {
  unsigned dstX = get_global_id_x();
  unsigned dstY = (srcHeight - 1) / 2;
  unsigned srcX = dstX * 2;
  unsigned srcY = srcHeight - 1;
  if (dstX < dstWidth && dstY < dstHeight && srcX + 1 < srcWidth) {
    float2 v = src[srcY * srcWidth + srcX];
    v += src[srcY * srcWidth + srcX + 1];
    dst[dstY * dstWidth + dstX] = v / 2.0f;
  }
}

__global__ void subsample22MaskRegularKernel(global_mem float2* __restrict__ dst,
                                             global_mem uint32_t* __restrict__ dstMask,
                                             global_mem const float2* __restrict__ src,
                                             global_mem const uint32_t* __restrict__ srcMask, unsigned srcWidth,
                                             unsigned srcHeight, unsigned dstWidth, unsigned dstHeight) {
  unsigned dstX = get_global_id_x();
  unsigned dstY = get_global_id_y();
  unsigned srcX = dstX * 2;
  unsigned srcY = dstY * 2;
  if (dstX < dstWidth && dstY < dstHeight && srcX + 1 < srcWidth && srcY + 1 < srcHeight) {
    // Here, weight is used as a mask

    uint32_t w00 = srcMask[srcY * srcWidth + srcX];
    uint32_t w01 = srcMask[srcY * srcWidth + srcX + 1];
    uint32_t w10 = srcMask[(srcY + 1) * srcWidth + srcX];
    uint32_t w11 = srcMask[(srcY + 1) * srcWidth + srcX + 1];
    if (w00 == w01 && w01 == w10 && w10 == w11) {
      float2 v = src[srcY * srcWidth + srcX];
      v += src[srcY * srcWidth + srcX + 1];
      v += src[(srcY + 1) * srcWidth + srcX];
      v += src[(srcY + 1) * srcWidth + srcX + 1];
      dst[dstY * dstWidth + dstX] = v / 4.0f;
    } else {
      dst[dstY * dstWidth + dstX] = src[srcY * srcWidth + srcX];
    }
    dstMask[dstY * dstWidth + dstX] = w00;
  }
}

/**
 * The pixels for this kernel have a bottom neighbour.
 */
__global__ void subsample22MaskRightBoundaryKernel(global_mem float2* __restrict__ dst,
                                                   global_mem uint32_t* __restrict__ dstMask,
                                                   global_mem const float2* __restrict__ src,
                                                   global_mem const uint32_t* __restrict__ srcMask, unsigned srcWidth,
                                                   unsigned srcHeight, unsigned dstWidth, unsigned dstHeight) {
  unsigned dstX = (srcWidth - 1) / 2;
  unsigned dstY = get_global_id_y();
  unsigned srcX = srcWidth - 1;
  unsigned srcY = dstY * 2;
  if (dstX < dstWidth && dstY < dstHeight && srcY + 1 < srcHeight) {
    uint32_t w00 = srcMask[srcY * srcWidth + srcX];
    uint32_t w10 = srcMask[(srcY + 1) * srcWidth + srcX];
    if (w00 == w10) {
      float2 v = src[srcY * srcWidth + srcX];
      v += src[(srcY + 1) * srcWidth + srcX];
      dst[dstY * dstWidth + dstX] = v / 2.0f;
    } else {
      dst[dstY * dstWidth + dstX] = src[srcY * srcWidth + srcX];
    }
    dstMask[dstY * dstWidth + dstX] = w00;
  }
}

/**
 * The pixels for this kernel have a right neighbour
 */
__global__ void subsample22MaskBottomBoundaryKernel(global_mem float2* __restrict__ dst,
                                                    global_mem uint32_t* __restrict__ dstMask,
                                                    global_mem const float2* __restrict__ src,
                                                    global_mem const uint32_t* __restrict__ srcMask, unsigned srcWidth,
                                                    unsigned srcHeight, unsigned dstWidth, unsigned dstHeight) {
  unsigned dstX = get_global_id_x();
  unsigned dstY = (srcHeight - 1) / 2;
  unsigned srcX = dstX * 2;
  unsigned srcY = srcHeight - 1;
  if (dstX < dstWidth && dstY < dstHeight && srcX + 1 < srcWidth) {
    uint32_t w00 = srcMask[srcY * srcWidth + srcX];
    uint32_t w01 = srcMask[srcY * srcWidth + srcX + 1];
    if (w00 == w01) {
      float2 v = src[srcY * srcWidth + srcX];
      v += src[srcY * srcWidth + srcX + 1];
      dst[dstY * dstWidth + dstX] = v / 2.0f;
    } else {
      dst[dstY * dstWidth + dstX] = src[srcY * srcWidth + srcX];
    }
    dstMask[dstY * dstWidth + dstX] = w00;
  }
}

/**
 * Subsample src by a factor of two on each dimension and put it in into dst.
 * src has size (srcWidth * srcHeight), dst has size ((srcWidth + 1)/2 * (srcHeight + 1)/2)
 */
__global__ void subsample22NearestRegularKernel(global_mem unsigned char* __restrict__ dst,
                                                global_mem const unsigned char* __restrict__ src, unsigned srcWidth,
                                                unsigned srcHeight, unsigned dstWidth, unsigned dstHeight) {
  unsigned dstX = get_global_id_x();
  unsigned dstY = get_global_id_y();
  unsigned srcX = dstX * 2;
  unsigned srcY = dstY * 2;
  if (dstX < dstWidth && dstY < dstHeight && srcX + 1 < srcWidth && srcY + 1 < srcHeight) {
    dst[dstY * dstWidth + dstX] = src[srcY * srcWidth + srcX];
  }
}

/**
 * The pixels for this kernel have a bottom neighbour.
 */
__global__ void subsample22NearestRightBoundaryKernel(global_mem unsigned char* __restrict__ dst,
                                                      global_mem const unsigned char* __restrict__ src,
                                                      unsigned srcWidth, unsigned srcHeight, unsigned dstWidth,
                                                      unsigned dstHeight) {
  unsigned dstX = (srcWidth - 1) / 2;
  unsigned dstY = get_global_id_y();
  unsigned srcX = srcWidth - 1;
  unsigned srcY = dstY * 2;
  if (dstX < dstWidth && dstY < dstHeight && srcY + 1 < srcHeight) {
    dst[dstY * dstWidth + dstX] = src[srcY * srcWidth + srcX];
  }
}

/**
 * The pixels for this kernel have a right neighbour
 */
__global__ void subsample22NearestBottomBoundaryKernel(global_mem unsigned char* __restrict__ dst,
                                                       global_mem const unsigned char* __restrict__ src,
                                                       unsigned srcWidth, unsigned srcHeight, unsigned dstWidth,
                                                       unsigned dstHeight) {
  unsigned dstX = get_global_id_x();
  unsigned dstY = (srcHeight - 1) / 2;
  unsigned srcX = dstX * 2;
  unsigned srcY = srcHeight - 1;
  if (dstX < dstWidth && dstY < dstHeight && srcX + 1 < srcWidth) {
    dst[dstY * dstWidth + dstX] = src[srcY * srcWidth + srcX];
  }
}

/**
 * Same except that the alpha of the upsampled pixel is zero only if the alpha of all samples is 0.
 * There is an interior kernel that handles pixels with neighbours on the left, right and diagonal,
 * and a border kernel for each of the 3 remaining cases.
 */
__global__ void subsample22RGBARegularKernel(global_mem uint32_t* __restrict__ dst,
                                             global_mem const uint32_t* __restrict__ src, unsigned srcWidth,
                                             unsigned srcHeight, unsigned dstWidth) {
  unsigned dstX = get_global_id_x();
  unsigned dstY = get_global_id_y();
  unsigned srcX = dstX * 2;
  unsigned srcY = dstY * 2;
  if (srcX + 1 < srcWidth && srcY + 1 < srcHeight) {
    int32_t a = 0, r = 0, g = 0, b = 0;
    uint32_t v = src[srcY * srcWidth + srcX];
    if (Image_RGBA_a(v)) {
      ++a;
      r += Image_RGBA_r(v);
      g += Image_RGBA_g(v);
      b += Image_RGBA_b(v);
    }
    v = src[srcY * srcWidth + srcX + 1];
    if (Image_RGBA_a(v)) {
      ++a;
      r += Image_RGBA_r(v);
      g += Image_RGBA_g(v);
      b += Image_RGBA_b(v);
    }
    v = src[(srcY + 1) * srcWidth + srcX];
    if (Image_RGBA_a(v)) {
      ++a;
      r += Image_RGBA_r(v);
      g += Image_RGBA_g(v);
      b += Image_RGBA_b(v);
    }
    v = src[(srcY + 1) * srcWidth + srcX + 1];
    if (Image_RGBA_a(v)) {
      ++a;
      r += Image_RGBA_r(v);
      g += Image_RGBA_g(v);
      b += Image_RGBA_b(v);
    }
    if (a) {
      dst[dstY * dstWidth + dstX] = Image_RGBA_pack(r / a, g / a, b / a, 0xff);
    } else {
      dst[dstY * dstWidth + dstX] = 0;
    }
  }
}

/**
 * The pixels for this kernel have a bottom neighbour.
 */
__global__ void subsample22RGBARightBoundaryKernel(global_mem uint32_t* __restrict__ dst,
                                                   global_mem const uint32_t* __restrict__ src, unsigned srcWidth,
                                                   unsigned srcHeight, unsigned dstWidth) {
  unsigned dstX = get_global_id_x() + srcWidth / 2;
  unsigned dstY = get_global_id_y();
  unsigned srcX = dstX * 2;
  unsigned srcY = dstY * 2;
  if (srcY + 1 < srcHeight) {
    int32_t a = 0, r = 0, g = 0, b = 0;
    uint32_t v = src[srcY * srcWidth + srcX];
    if (Image_RGBA_a(v)) {
      ++a;
      r += Image_RGBA_r(v);
      g += Image_RGBA_g(v);
      b += Image_RGBA_b(v);
    }
    v = src[(srcY + 1) * srcWidth + srcX];
    if (Image_RGBA_a(v)) {
      ++a;
      r += Image_RGBA_r(v);
      g += Image_RGBA_g(v);
      b += Image_RGBA_b(v);
    }
    if (a) {
      dst[dstY * dstWidth + dstX] = Image_RGBA_pack(r / a, g / a, b / a, 0xff);
    } else {
      dst[dstY * dstWidth + dstX] = 0;
    }
  }
}

/**
 * The pixels for this kernel have a right neighbour
 */
__global__ void subsample22RGBABottomBoundaryKernel(global_mem uint32_t* __restrict__ dst,
                                                    global_mem const uint32_t* __restrict__ src, unsigned srcWidth,
                                                    unsigned srcHeight, unsigned dstWidth) {
  unsigned dstX = get_global_id_x();
  unsigned dstY = get_global_id_y() + srcHeight / 2;
  unsigned srcX = dstX * 2;
  unsigned srcY = dstY * 2;
  if (srcX + 1 < srcWidth) {
    int32_t a = 0, r = 0, g = 0, b = 0;
    uint32_t v = src[srcY * srcWidth + srcX];
    if (Image_RGBA_a(v)) {
      ++a;
      r += Image_RGBA_r(v);
      g += Image_RGBA_g(v);
      b += Image_RGBA_b(v);
    }
    v = src[srcY * srcWidth + srcX + 1];
    if (Image_RGBA_a(v)) {
      ++a;
      r += Image_RGBA_r(v);
      g += Image_RGBA_g(v);
      b += Image_RGBA_b(v);
    }
    if (a) {
      dst[dstY * dstWidth + dstX] = Image_RGBA_pack(r / a, g / a, b / a, 0xff);
    } else {
      dst[dstY * dstWidth + dstX] = 0;
    }
  }
}

// ---------------- Masks sampling

__global__ void subsampleMask22Kernel(global_mem unsigned char* __restrict__ dst,
                                      global_mem const unsigned char* __restrict__ src, unsigned srcWidth,
                                      unsigned srcHeight, unsigned dstWidth) {
  unsigned dstX = get_global_id_x();
  unsigned dstY = get_global_id_y();
  unsigned srcX = dstX * 2;
  unsigned srcY = dstY * 2;
  if (srcX < srcWidth && srcY < srcHeight) {
    unsigned char result = 0;
    if (src[srcY * srcWidth + srcX] == 1) {
      result = 1;
    }
    if (srcX + 1 < srcWidth && srcY < srcHeight && src[srcY * srcWidth + (srcX + 1)] == 1) {
      result = 1;
    }
    if (srcX < srcWidth && srcY + 1 < srcHeight && src[(srcY + 1) * srcWidth + srcX] == 1) {
      result = 1;
    }
    if (srcX + 1 < srcWidth && srcY + 1 < srcHeight && src[(srcY + 1) * srcWidth + (srcX + 1)] == 1) {
      result = 1;
    }
    dst[dstY * dstWidth + dstX] = result;
  }
}