// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm

#include "parallax/simpleFlow.hpp"

#include "./kernels/patchDifferenceFunction.cu"

#include "backend/common/vectorOps.hpp"

#include "backend/cuda/deviceBuffer.hpp"
#include "backend/cuda/deviceStream.hpp"
#include "backend/cuda/core1/kernels/samplingKernel.cu"
#include "gpu/image/sampling.hpp"
#include "gpu/image/imageOps.hpp"
#include "gpu/image/blur.hpp"
#include "gpu/stream.hpp"
#include "cuda/error.hpp"
#include "cuda/util.hpp"
#include "util/imageProcessingGPUUtils.hpp"

namespace VideoStitch {
namespace Core {

#define TILE_WIDTH 16
#define CUDABLOCKSIZE 512
#define SIMPLEFLOW_KERNEL_BLOCK_SIZE_X 16
#define SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y 8

__global__ void forwardFlowKernel(const int flowSize, const int windowSize, const float flowMagnitudeWeight,
                                  const float gradientWeight, const int2 size0, const int2 offset0,
                                  const uint32_t* input0, const float* gradient0, const int2 size1, const int2 offset1,
                                  const uint32_t* input1, const float* gradient1, const float2* inputFlowOffset,
                                  float2* flow, float* confidence) {
  // Check whether we need to calculate the flow
  uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
  uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= size0.x || y >= size0.y) return;

  uint32_t v0 = input0[y * size0.x + x];
  if (Image::RGBA::a(v0) == 0) {  // If current alpha is 0, do nothing
    if (inputFlowOffset) {
      flow[y * size0.x + x] = inputFlowOffset[y * size0.x + x];
      if (confidence) {
        confidence[y * size0.x + x] = 1;
      }
      return;
    }
    flow[y * size0.x + x] = make_float2(INVALID_FLOW_VALUE, INVALID_FLOW_VALUE);
    if (confidence) {
      confidence[y * size0.x + x] = 0;
    }
    return;
  }
  int2 coord1 = make_int2(x + offset0.x - offset1.x, y + offset0.y - offset1.y);
  if (!inRange(coord1, size1)) {
    flow[y * size0.x + x] = make_float2(INVALID_FLOW_VALUE, INVALID_FLOW_VALUE);
    if (confidence) {
      confidence[y * size0.x + x] = 0;
    }
    return;
  }
  uint32_t v1 = input1[coord1.y * size1.x + coord1.x];
  if (Image::RGBA::a(v1) == 0) {
    flow[y * size0.x + x] = make_float2(INVALID_FLOW_VALUE, INVALID_FLOW_VALUE);
    if (confidence) {
      confidence[y * size0.x + x] = 0;
    }
    return;
  }

  float2 flowOffset = make_float2(0, 0);
  if (inputFlowOffset) {
    flowOffset = inputFlowOffset[y * size0.x + x];
  }
  // Try to find the best forward flow here
  int2 sampleCoord = make_int2(x, y);
  float bestCost = MAX_INVALID_COST;
  float2 minFlow = make_float2(flowOffset.x, flowOffset.y);
  float totalCost = 0;
  float totalCount = 0;
  for (int i = -flowSize; i <= flowSize; i++)
    for (int j = -flowSize; j <= flowSize; j++) {
      int2 mapCoord =
          make_int2(flowOffset.x + x + i + offset0.x - offset1.x, flowOffset.y + y + j + offset0.y - offset1.y);
      if (inRange(mapCoord, size1)) {
        float cost = getCost(windowSize, gradientWeight, size0, input0, gradient0, sampleCoord, size1, input1,
                             gradient1, mapCoord) +
                     flowMagnitudeWeight * sqrtf(i * i + j * j) / sqrtf(2 * flowSize * flowSize);
        if (cost < MAX_INVALID_COST) {
          totalCost += cost;
          totalCount++;
        }
        if (cost < bestCost) {
          bestCost = cost;
          minFlow = make_float2(flowOffset.x + i, flowOffset.y + j);
        }
      }
    }

  flow[y * size0.x + x] = minFlow;

  if (confidence) {
    if (bestCost != MAX_INVALID_COST) {
      int2 mapCoord = make_int2(sampleCoord.x + minFlow.x + offset0.x - offset1.x,
                                sampleCoord.y + minFlow.y + offset0.y - offset1.y);
      confidence[y * size0.x + x] = (getCUR(windowSize, gradientWeight, size0, input0, gradient0, sampleCoord, size1,
                                            input1, gradient1, mapCoord));
      // confidence[y * size0.x + x] = totalCost / totalCount - bestCost;
    } else {
      confidence[y * size0.x + x] = 0;
    }
  }
}

__global__ void flowAgreementConfidenceKernel(const int flowSize, const int2 size0, const int2 offset0,
                                              const float2* flow0, const float* confidence0, const int2 size1,
                                              const int2 offset1, const float2* flow1, const float* confidence1,
                                              float* flowAgreementConfidence0) {
  // Check whether we need to calculate the agreement confidence
  uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
  uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= size0.x || y >= size0.y) return;
  const float2 forwardFlow = flow0[y * size0.x + x];
  const float forwardConfidence = confidence0[y * size0.x + x];
  const int2 lookupCoord =
      make_int2(x + forwardFlow.x + offset0.x - offset1.x, y + forwardFlow.y + offset0.y - offset1.y);
  if (!inRange(lookupCoord, size1)) {
    flowAgreementConfidence0[y * size0.x + x] = 0;
    return;
  }
  const float2 backwardFlow = flow1[lookupCoord.y * size1.x + lookupCoord.x];
  const float backwardConfidence = confidence1[lookupCoord.y * size1.x + lookupCoord.x];

  // Check if forward and backward flow agree
  float normalizedAgreementLength =
      length(forwardFlow + backwardFlow) / (length(make_float2(2 * flowSize + 1, 2 * flowSize + 1)));
  flowAgreementConfidence0[y * size0.x + x] =
      powf(fmaxf(1 - normalizedAgreementLength, 0.0), 3)  // * forwardConfidence;
      * sqrtf(backwardConfidence * forwardConfidence);
}

__global__ void confidenceTransformKernel(const int width, const int height, const float threshold, const float gamma,
                                          const float clampedValue, const float* inputConfidence,
                                          float* outputConfidence) {
  uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
  uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= width || y >= height) return;
  float inputValue = inputConfidence[y * width + x];
  if (inputValue < threshold) {
    outputConfidence[y * width + x] = 0;
  } else {
    outputConfidence[y * width + x] = powf(inputValue, gamma);
  }
}

__device__ float getSpacialWeight(const float sigmaSpace, const float x) { return exp(-abs(sigmaSpace) * x * x); }

__global__ void confidenceAwareFlowBlurKernel(const bool extrapolation, const int2 size, const int kernelSize,
                                              const float sigmaSpace, const float sigmaImage,
                                              const float sigmaConfidence, const uint32_t* const inputImage,
                                              const float2* const inputFlow, const float* const inputConfidence,
                                              float2* const outputFlow) {
  uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
  uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= size.x || y >= size.y) return;
  if (extrapolation) {
    float2 inFlow = inputFlow[y * size.x + x];
    if (inFlow.x != INVALID_FLOW_VALUE) {
      outputFlow[y * size.x + x] = inFlow;
      return;
    }
  }
  // check if the current flow is not valid, then just do nothing
  float maxDist = kernelSize * 1.4142;
  uint32_t imageColor;
  if (sigmaImage) {
    imageColor = inputImage[y * size.x + x];
  }
  if (!extrapolation) {
    if (sigmaImage > 0) {
      if (!Image::RGBA::a(imageColor)) {
        outputFlow[y * size.x + x] = make_float2(INVALID_FLOW_VALUE, INVALID_FLOW_VALUE);
        return;
      }
    }
  }
  float sumWeight = 0;
  float2 sumContribution = make_float2(0, 0);

  if (maxDist == 0) maxDist = 1;
  for (int dx = -kernelSize; dx <= kernelSize; dx++) {
    for (int dy = -kernelSize; dy <= kernelSize; dy++) {
      // Here i came across a neighbor, what he is look like
      int2 neighborCoord = make_int2(x + dx, y + dy);
      if (!inRange(neighborCoord, size)) {
        continue;
      }
      float neighborConfidence = 1;
      if (inputConfidence) {
        neighborConfidence = inputConfidence[neighborCoord.y * size.x + neighborCoord.x];
      }
      if (neighborConfidence == 0) {
        continue;
      }

      float2 neighborflowOffset = inputFlow[neighborCoord.y * size.x + neighborCoord.x];
      if (neighborflowOffset.x == INVALID_FLOW_VALUE) {
        continue;
      }
      float weightImage = 1.0f;
      if (sigmaImage) {
        uint32_t imageColorNeighbor = inputImage[neighborCoord.y * size.x + neighborCoord.x];
        if (Image::RGBA::a(imageColorNeighbor) > 0 && Image::RGBA::a(imageColor) > 0) {
          const float sad = abs((float(Image::RGBA::r(imageColorNeighbor)) - Image::RGBA::r(imageColor)) / 255.0) +
                            abs((float(Image::RGBA::g(imageColorNeighbor)) - Image::RGBA::g(imageColor)) / 255.0) +
                            abs((float(Image::RGBA::b(imageColorNeighbor)) - Image::RGBA::b(imageColor)) / 255.0);
          weightImage = exp(-abs(sad * sad * sigmaImage));
        }
      }

      // Now calculate the distance between source and target
      float distSpace = length(make_float2(dx, dy)) / maxDist;
      float weightSpace = exp(-abs(distSpace * distSpace * sigmaSpace));
      // Now i do really look at the neighbor on the other side to see  how think is going on there
      float weight = weightSpace * weightImage * neighborConfidence;
      sumWeight += weight;
      sumContribution += weight * neighborflowOffset;
    }
  }

  // If my confidence is high, i would tend to keep mine, don't care about the neighbor's confidence
  // Here is where to set the weight
  if (sumWeight == 0) {
    outputFlow[y * size.x + x] = make_float2(INVALID_FLOW_VALUE, INVALID_FLOW_VALUE);
  } else {
    outputFlow[y * size.x + x] = sumContribution / sumWeight;
  }
}

__global__ void flowConfidenceKernel(const int windowSize, const float gradientWeight, const int2 size0,
                                     const uint32_t* input0, const float* gradient0, const float2* inputFlow,
                                     const int2 size1, const uint32_t* input1, const float* gradient1,
                                     float* confidence) {
  uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
  uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= size0.x || y >= size0.y) return;
  float2 flow = inputFlow[y * size0.x + x];
  if (flow.x == INVALID_FLOW_VALUE) {
    confidence[y * size0.x + x] = 0;
  } else {
    int2 mapCoord = make_int2(x + flow.x, y + flow.y);
    int2 sampleCoord = make_int2(x, y);
    confidence[y * size0.x + x] =
        getCUR(windowSize, gradientWeight, size0, input0, gradient0, sampleCoord, size1, input1, gradient1, mapCoord);
  }
}

Status SimpleFlow::findForwardFlow(const int flowSize, const int windowSize, const float flowMagnitudeWeight,
                                   const float gradientWeight, const int2 size0, const int2 offset0,
                                   const GPU::Buffer<const uint32_t> inputBuffer0,
                                   const GPU::Buffer<const float> inputGradientBuffer0, const int2 size1,
                                   const int2 offset1, const GPU::Buffer<const uint32_t> inputBuffer1,
                                   const GPU::Buffer<const float> inputGradientBuffer1, GPU::Buffer<float2> flow,
                                   GPU::Buffer<float> confidence, GPU::Stream gpuStream) {
  cudaStream_t stream = gpuStream.get();
  dim3 dimBlock(SIMPLEFLOW_KERNEL_BLOCK_SIZE_X, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y, 1);
  dim3 dimGrid((unsigned)Cuda::ceilDiv(size0.x, SIMPLEFLOW_KERNEL_BLOCK_SIZE_X),
               (unsigned)Cuda::ceilDiv(size0.y, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y), 1);
  forwardFlowKernel<<<dimGrid, dimBlock, 0, stream>>>(flowSize, windowSize, flowMagnitudeWeight, gradientWeight, size0,
                                                      offset0, inputBuffer0.get(), inputGradientBuffer0.get(), size1,
                                                      offset1, inputBuffer1.get(), inputGradientBuffer1.get(), 0,
                                                      flow.get(), confidence.get());

  return CUDA_STATUS;
}

__global__ void offsetCostKernel(const int2 flowOffset, const int flowSize, const float flowMagnitudeWeight,
                                 const float gradientWeight, const int2 size0, const int2 offset0,
                                 const uint32_t* input0, const float* gradient0, const int2 size1, const int2 offset1,
                                 const uint32_t* input1, const float* gradient1, float2* cost) {
  // Check whether we need to calculate the flow
  uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
  uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= size0.x || y >= size0.y) return;

  uint32_t v0 = input0[y * size0.x + x];
  if (Image::RGBA::a(v0) == 0) {  // If current alpha is 0, do nothing
    cost[y * size0.x + x] = make_float2(INVALID_FLOW_VALUE, INVALID_FLOW_VALUE);
    return;
  }
  // Try to find the best forward flow here
  int2 sampleCoord = make_int2(x, y);
  int2 mapCoord = make_int2(flowOffset.x + x + offset0.x - offset1.x, flowOffset.y + y + offset0.y - offset1.y);
  cost[y * size0.x + x] = make_float2(INVALID_FLOW_VALUE, INVALID_FLOW_VALUE);
  if (inRange(mapCoord, size1)) {
    float difCost =
        getCost(0, gradientWeight, size0, input0, gradient0, sampleCoord, size1, input1, gradient1, mapCoord) +
        flowMagnitudeWeight * sqrtf(flowOffset.x * flowOffset.x + flowOffset.y * flowOffset.y) /
            sqrtf(2 * flowSize * flowSize);
    cost[y * size0.x + x] = make_float2(difCost, difCost);
  }
}

Status SimpleFlow::findOffsetCost(const int2 flowOffset, const int flowSize, const float flowMagnitudeWeight,
                                  const float gradientWeight, const int2 size0, const int2 offset0,
                                  const GPU::Buffer<const uint32_t> inputBuffer0,
                                  const GPU::Buffer<const float> inputGradientBuffer0, const int2 size1,
                                  const int2 offset1, const GPU::Buffer<const uint32_t> inputBuffer1,
                                  const GPU::Buffer<const float> inputGradientBuffer1, GPU::Buffer<float2> cost,
                                  GPU::Stream gpuStream) {
  cudaStream_t stream = gpuStream.get();
  dim3 dimBlock(SIMPLEFLOW_KERNEL_BLOCK_SIZE_X, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y, 1);
  dim3 dimGrid((unsigned)Cuda::ceilDiv(size0.x, SIMPLEFLOW_KERNEL_BLOCK_SIZE_X),
               (unsigned)Cuda::ceilDiv(size0.y, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y), 1);
  offsetCostKernel<<<dimGrid, dimBlock, 0, stream>>>(
      flowOffset, flowSize, flowMagnitudeWeight, gradientWeight, size0, offset0, inputBuffer0.get(),
      inputGradientBuffer0.get(), size1, offset1, inputBuffer1.get(), inputGradientBuffer1.get(), cost.get());

  return CUDA_STATUS;
}

__global__ void updateBestCostKernel(const int2 flowOffset, const int2 size0, const float2* cost, float* bestCost,
                                     float2* bestOffset) {
  // Check whether we need to calculate the flow
  uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
  uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= size0.x || y >= size0.y) return;
  if (flowOffset.x == 123456) {
    bestCost[y * size0.x + x] = MAX_INVALID_COST;
    bestOffset[y * size0.x + x] = make_float2(INVALID_FLOW_VALUE, INVALID_FLOW_VALUE);
  }
  if (cost[y * size0.x + x].x >= 0 && cost[y * size0.x + x].x < bestCost[y * size0.x + x]) {
    bestCost[y * size0.x + x] = cost[y * size0.x + x].x;
    bestOffset[y * size0.x + x] = make_float2(flowOffset.x, flowOffset.y);
  }
}

Status SimpleFlow::updateBestCost(const int2 flowOffset, const int2 size0, const GPU::Buffer<const float2> cost,
                                  GPU::Buffer<float> bestCost, GPU::Buffer<float2> bestOffset, GPU::Stream gpuStream) {
  cudaStream_t stream = gpuStream.get();
  dim3 dimBlock(SIMPLEFLOW_KERNEL_BLOCK_SIZE_X, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y, 1);
  dim3 dimGrid((unsigned)Cuda::ceilDiv(size0.x, SIMPLEFLOW_KERNEL_BLOCK_SIZE_X),
               (unsigned)Cuda::ceilDiv(size0.y, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y), 1);
  updateBestCostKernel<<<dimGrid, dimBlock, 0, stream>>>(flowOffset, size0, cost.get(), bestCost.get(),
                                                         bestOffset.get());
  return CUDA_STATUS;
}

Status SimpleFlow::findBackwardAndForwardFlowAgreementConfidence(
    const int flowSize, const int2 size0, const int2 offset0, const GPU::Buffer<const float2> flow0,
    const GPU::Buffer<const float> confidence0, const int2 size1, const int2 offset1,
    const GPU::Buffer<const float2> flow1, const GPU::Buffer<const float> confidence1,
    GPU::Buffer<float> flowAgreementConfidence0, GPU::Stream gpuStream) {
  cudaStream_t stream = gpuStream.get();
  dim3 dimBlock(SIMPLEFLOW_KERNEL_BLOCK_SIZE_X, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y, 1);
  dim3 dimGrid((unsigned)Cuda::ceilDiv(size0.x, SIMPLEFLOW_KERNEL_BLOCK_SIZE_X),
               (unsigned)Cuda::ceilDiv(size0.y, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y), 1);

  flowAgreementConfidenceKernel<<<dimGrid, dimBlock, 0, stream>>>(flowSize, size0, offset0, flow0.get(),
                                                                  confidence0.get(), size1, offset1, flow1.get(),
                                                                  confidence1.get(), flowAgreementConfidence0.get());

  return CUDA_STATUS;
}

Status SimpleFlow::performConfidenceTransform(const int width, const int height, const float threshold,
                                              const float gamma, const float clampedValue,
                                              const GPU::Buffer<const float> inputConfidence,
                                              GPU::Buffer<float> outputConfidence, GPU::Stream gpuStream) {
  cudaStream_t stream = gpuStream.get();
  dim3 dimBlock(SIMPLEFLOW_KERNEL_BLOCK_SIZE_X, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y, 1);
  dim3 dimGrid((unsigned)Cuda::ceilDiv(width, SIMPLEFLOW_KERNEL_BLOCK_SIZE_X),
               (unsigned)Cuda::ceilDiv(height, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y), 1);
  confidenceTransformKernel<<<dimGrid, dimBlock, 0, stream>>>(width, height, threshold, gamma, clampedValue,
                                                              inputConfidence.get(), outputConfidence.get());

  return CUDA_STATUS;
}

Status SimpleFlow::findConfidence(const int windowSize, const float gradientWeight, const int2 size0,
                                  const GPU::Buffer<const uint32_t> input0, const GPU::Buffer<const float> gradient0,
                                  GPU::Buffer<const float2> forwardFlow0, const int2 size1,
                                  const GPU::Buffer<const uint32_t> input1, const GPU::Buffer<const float> gradient1,
                                  GPU::Buffer<float> confidence, GPU::Stream gpuStream) {
  cudaStream_t stream = gpuStream.get();
  dim3 dimBlock(SIMPLEFLOW_KERNEL_BLOCK_SIZE_X, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y, 1);
  dim3 dimGrid((unsigned)Cuda::ceilDiv(size0.x, SIMPLEFLOW_KERNEL_BLOCK_SIZE_X),
               (unsigned)Cuda::ceilDiv(size0.y, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y), 1);
  flowConfidenceKernel<<<dimGrid, dimBlock, 0, stream>>>(windowSize, gradientWeight, size0, input0.get(),
                                                         gradient0.get(), forwardFlow0.get(), size1, input1.get(),
                                                         gradient1.get(), confidence.get());

  return CUDA_STATUS;
}

Status SimpleFlow::performConfidenceAwareFlowInterpolation(const bool extrapolation, const int2 size,
                                                           const int kernelSize, const float sigmaSpace,
                                                           const float sigmaImage, const float sigmaConfidence,
                                                           const GPU::Buffer<const uint32_t> inputImage,
                                                           const GPU::Buffer<const float2> inputFlow,
                                                           const GPU::Buffer<const float> inputConfidence,
                                                           GPU::Buffer<float2> outputFlow, GPU::Stream gpuStream) {
  cudaStream_t stream = gpuStream.get();
  dim3 dimBlock(SIMPLEFLOW_KERNEL_BLOCK_SIZE_X, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y, 1);
  dim3 dimGrid((unsigned)Cuda::ceilDiv(size.x, SIMPLEFLOW_KERNEL_BLOCK_SIZE_X),
               (unsigned)Cuda::ceilDiv(size.y, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y), 1);
  if (inputConfidence.wasAllocated()) {
    confidenceAwareFlowBlurKernel<<<dimGrid, dimBlock, 0, stream>>>(
        extrapolation, size, kernelSize, sigmaSpace, sigmaImage, sigmaConfidence, inputImage.get(), inputFlow.get(),
        inputConfidence.get(), outputFlow.get());
  } else {
    confidenceAwareFlowBlurKernel<<<dimGrid, dimBlock, 0, stream>>>(extrapolation, size, kernelSize, sigmaSpace,
                                                                    sigmaImage, sigmaConfidence, inputImage.get(),
                                                                    inputFlow.get(), nullptr, outputFlow.get());
  }
  return CUDA_STATUS;
}

__global__ void temporalAwareFlowBlurKernel(const bool extrapolation, const int frameId, const int frameCount,
                                            const int2 size, const int kernelSize, const float sigmaSpace,
                                            const float sigmaImage, const float sigmaTime, const float* const frames,
                                            const uint32_t* const inputImages, const float2* const inputFlows,
                                            const float* const inputConfidences, float2* const outputFlow) {
  uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
  uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= size.x || y >= size.y) return;
  int baseOffset = frameId * size.x * size.y;
  if (extrapolation) {
    float2 inFlow = inputFlows[baseOffset + y * size.x + x];
    if (inFlow.x != INVALID_FLOW_VALUE) {
      outputFlow[y * size.x + x] = inFlow;
      return;
    }
  }
  // check if the current flow is not valid, then just do nothing
  float maxDist = kernelSize * 1.4142;
  uint32_t imageColor;
  if (sigmaImage) {
    imageColor = inputImages[baseOffset + y * size.x + x];
  }
  if (!extrapolation) {
    if (sigmaImage > 0) {
      if (!Image::RGBA::a(imageColor)) {
        outputFlow[y * size.x + x] = make_float2(INVALID_FLOW_VALUE, INVALID_FLOW_VALUE);
        return;
      }
    }
  }
  float sumWeight = 0;
  float2 sumContribution = make_float2(0, 0);

  if (maxDist == 0) maxDist = 1;
  for (int t = 0; t < frameCount; t++)
    if (frames[t] >= 0) {
      for (int dx = -kernelSize; dx <= kernelSize; dx++)
        for (int dy = -kernelSize; dy <= kernelSize; dy++) {
          const int offset = t * size.x * size.y;
          // Here i came across a neighbor, what he is look like
          int2 neighborCoord = make_int2(x + dx, y + dy);
          if (!inRange(neighborCoord, size)) {
            continue;
          }
          float neighborConfidence = 1;
          if (inputConfidences) {
            neighborConfidence = inputConfidences[offset + neighborCoord.y * size.x + neighborCoord.x];
          }
          if (neighborConfidence == 0) {
            continue;
          }
          float2 neighborflowOffset = inputFlows[offset + neighborCoord.y * size.x + neighborCoord.x];
          if (neighborflowOffset.x == INVALID_FLOW_VALUE) {
            continue;
          }
          float weightImage = 1.0f;
          if (sigmaImage) {
            uint32_t imageColorNeighbor = inputImages[offset + neighborCoord.y * size.x + neighborCoord.x];
            if (Image::RGBA::a(imageColorNeighbor) > 0 && Image::RGBA::a(imageColor) > 0) {
              const float sad = abs((float(Image::RGBA::r(imageColorNeighbor)) - Image::RGBA::r(imageColor)) / 255.0) +
                                abs((float(Image::RGBA::g(imageColorNeighbor)) - Image::RGBA::g(imageColor)) / 255.0) +
                                abs((float(Image::RGBA::b(imageColorNeighbor)) - Image::RGBA::b(imageColor)) / 255.0);
              weightImage = exp(-abs(sad * sad * sigmaImage));
            }
          }
          // Now calculate the distance of time
          float distTime = float(frames[t] - frames[frameId]) / frameCount;
          float weightTime = exp(-abs(distTime * distTime * sigmaTime));
          // Now calculate the distance between source and target
          float distSpace = length(make_float2(dx, dy)) / maxDist;
          float weightSpace = exp(-abs(distSpace * distSpace * sigmaSpace));
          // Now i do really look at the neighbor on the other side to see  how think is going on there
          float weight = weightSpace * weightImage * weightTime * neighborConfidence;
          sumWeight += weight;
          sumContribution += weight * neighborflowOffset;
        }
    }
  // If my confidence is high, i would tend to keep mine, don't care about the neighbor's confidence
  // Here is where to set the weight
  if (sumWeight == 0) {
    outputFlow[y * size.x + x] = make_float2(INVALID_FLOW_VALUE, INVALID_FLOW_VALUE);
  } else {
    outputFlow[y * size.x + x] = sumContribution / sumWeight;
  }
}

Status SimpleFlow::performTemporalAwareFlowInterpolation(
    const bool extrapolation, const frameid_t frameId, const int2 size, const int kernelSize, const float sigmaSpace,
    const float sigmaImage, const float sigmaTime, const GPU::Buffer<const float> frames,
    const GPU::Buffer<const uint32_t> inputImages, const GPU::Buffer<const float2> inputFlows,
    const GPU::Buffer<const float> inputConfidences, GPU::Buffer<float2> outputFlow, GPU::Stream gpuStream) {
  cudaStream_t stream = gpuStream.get();
  dim3 dimBlock(SIMPLEFLOW_KERNEL_BLOCK_SIZE_X, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y, 1);
  dim3 dimGrid((unsigned)Cuda::ceilDiv(size.x, SIMPLEFLOW_KERNEL_BLOCK_SIZE_X),
               (unsigned)Cuda::ceilDiv(size.y, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y), 1);
  const int frameCount = (int)frames.numElements();
  temporalAwareFlowBlurKernel<<<dimGrid, dimBlock, 0, stream>>>(
      extrapolation, frameId, frameCount, size, kernelSize, sigmaSpace, sigmaImage, sigmaTime, frames.get(),
      inputImages.get(), inputFlows.get(), inputConfidences.get(), outputFlow.get());
  return CUDA_STATUS;
}

Status SimpleFlow::performFlowJittering(const int jitterSize, const int windowSize, const float flowMagnitudeWeight,
                                        const float gradientWeight, const int2 size0, const int2 offset0,
                                        const GPU::Buffer<const uint32_t> inputBuffer0,
                                        const GPU::Buffer<const float> inputGradientBuffer0, const int2 size1,
                                        const int2 offset1, const GPU::Buffer<const uint32_t> inputBuffer1,
                                        const GPU::Buffer<const float> inputGradientBuffer1,
                                        const GPU::Buffer<const float2> inputFlow, GPU::Buffer<float2> outputFlow,
                                        GPU::Stream gpuStream) {
  cudaStream_t stream = gpuStream.get();
  dim3 dimBlock(SIMPLEFLOW_KERNEL_BLOCK_SIZE_X, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y, 1);
  dim3 dimGrid((unsigned)Cuda::ceilDiv(size0.x, SIMPLEFLOW_KERNEL_BLOCK_SIZE_X),
               (unsigned)Cuda::ceilDiv(size0.y, SIMPLEFLOW_KERNEL_BLOCK_SIZE_Y), 1);

  forwardFlowKernel<<<dimGrid, dimBlock, 0, stream>>>(jitterSize, windowSize, flowMagnitudeWeight, gradientWeight,
                                                      size0, offset0, inputBuffer0.get(), inputGradientBuffer0.get(),
                                                      size1, offset1, inputBuffer1.get(), inputGradientBuffer1.get(),
                                                      inputFlow.get(), outputFlow.get(), 0);
  return CUDA_STATUS;
}

}  // namespace Core
}  // namespace VideoStitch