// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm

#ifndef _BLURKERNEL_SMALL_SUPPORT_H_
#define _BLURKERNEL_SMALL_SUPPORT_H_

#include "backend/common/imageOps.hpp"

namespace VideoStitch {
namespace Image {
/**
 * Assumes 2 * radius < blockDim.x.
 */
#define gaussianBlur1DRGBA210SSKernelInterior POktmLly
template <uint32_t (*unrolledKernel)(const int32_t*)>
__global__ void gaussianBlur1DRGBA210SSKernelInterior(uint32_t* __restrict__ dst, const uint32_t* __restrict__ src,
                                                      int w, int h, const int r) {
  // load to shared mem we must load radius more pixels on the left and right
  const int rowOffset = blockIdx.x * blockDim.x + threadIdx.x;
  // shMem holds unpacked ARGBARGBARGB...
  extern __shared__ int32_t shMem[];
  int32_t* col = shMem + 4 * threadIdx.x;
  int32_t v = 0;
  if (rowOffset < w) {
    v = src[blockIdx.y * w + rowOffset];
  }
  col[0] = RGB210::a(v);
  col[1] = RGB210::r(v);
  col[2] = RGB210::g(v);
  col[3] = RGB210::b(v);
  if (rowOffset + blockDim.x < w && threadIdx.x < 2 * r) {
    v = src[blockIdx.y * w + rowOffset + blockDim.x];
    col[blockDim.x] = RGB210::a(v);
    col[blockDim.x + 1] = RGB210::r(v);
    col[blockDim.x + 2] = RGB210::g(v);
    col[blockDim.x + 3] = RGB210::b(v);
  }
  __syncthreads();

  // Compute convolution.
  if (rowOffset + 2 * r < w) {
    dst[blockIdx.y * w + r + threadIdx.x] = unrolledKernel(col);
  }
}

#define gaussianBlur1DRGBA210SSKernelWrap gepMfedS
template <uint32_t (*unrolledKernel)(const int32_t*)>
__global__ void gaussianBlur1DRGBA210SSKernelWrap(uint32_t* __restrict__ dst, const uint32_t* __restrict__ src, int w,
                                                  int h, const int r) {
  // Load the r pixels before the right boundary
  // Load the r pixels of the right boundary
  // Load the r pixels of the left boundary
  // Load the r pixels after the left bounary
  // If r == 2 the pattern is:
  // src is read by threads:
  //   0 1 2 3 - - - - - - - 4 5 6 7
  // and written to shared mem as:
  //   4 5 6 7 0 1 2 3
  extern __shared__ int32_t shMem[];
  int32_t* col;
  int32_t v;
  if (threadIdx.x < 2 * r) {
    v = src[blockIdx.y * w + threadIdx.x];
    col = shMem + 4 * (2 * r + threadIdx.x);
  } else if (threadIdx.x < 4 * r) {
    v = src[(blockIdx.y + 1) * w - 4 * r + threadIdx.x];
    col = shMem + 4 * (threadIdx.x - 2 * r);
  }
  col[0] = RGB210::a(v);
  col[1] = RGB210::r(v);
  col[2] = RGB210::g(v);
  col[3] = RGB210::b(v);
  __syncthreads();
  // Now threads 4 and 5 will compute blur for pixels 6 and 7, and 6 and 7 for 0 and 1
  if (threadIdx.x >= 2 * r) {
    dst[blockIdx.y * w + threadIdx.x - 3 * r + (threadIdx.x < 3 * r) * w] = unrolledKernel(col);
  }
}

#define gaussianBlur1DRGBA210SSKernelNoWrap fTHeRdki
template <uint32_t (*unrolledKernel)(const int32_t*)>
__global__ void gaussianBlur1DRGBA210SSKernelNoWrap(uint32_t* __restrict__ dst, const uint32_t* __restrict__ src, int w,
                                                    int h, const int r) {
  // Load the r pixels before the right boundary
  // Load the r pixels of the right boundary
  // Load the r pixels of the left boundary
  // Load the r pixels after the left bounary
  // If r == 2 the pattern is:
  // src is read by threads:
  //   0 1 2 3 - - - - - - - 4 5 6 7
  // and written to shared mem as:
  //   - - 0 1 2 3 4 5 6 7 - -
  //   0 0 0 1 2 3 4 5 6 7 7 7
  extern __shared__ int32_t shMem[];
  int32_t* col;
  int32_t v;
  if (threadIdx.x < 2 * r) {
    v = src[blockIdx.y * w + threadIdx.x];
    col = shMem + 4 * (2 * r + threadIdx.x);
  } else if (threadIdx.x < 4 * r) {
    v = src[(blockIdx.y + 1) * w - 4 * r + threadIdx.x];
    col = shMem + 4 * (threadIdx.x - 2 * r);
  }
  col[0] = RGB210::a(v);
  col[1] = RGB210::r(v);
  col[2] = RGB210::g(v);
  col[3] = RGB210::b(v);
  __syncthreads();
  // Now threads 4 and 5 will compute blur for pixels 6 and 7, and 6 and 7 for 0 and 1
  if (threadIdx.x >= 2 * r) {
    dst[blockIdx.y * w + threadIdx.x - 3 * r + (threadIdx.x < 3 * r) * w] = unrolledKernel(col);
  }
}
}  // namespace Image
}  // namespace VideoStitch
#endif