// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm

#pragma once

#include "libvideostitch/status.hpp"

#include <cuda_runtime.h>

#include <cassert>

namespace VideoStitch {
namespace Cuda {

struct Stream {
  Stream() : s(0) {}
  explicit Stream(cudaStream_t s) : s(s) {}

  cudaStream_t s;
};

/**
 * Returns ceil(v / d) computed in 64-bit integer arithmetic.
 *
 * The result is the mathematical ceiling for every sign combination of
 * @a v and @a d (e.g. ceilDiv(7, -2) == -3 and ceilDiv(-7, -2) == 4).
 *
 * @param v Dividend.
 * @param d Divisor; converted to int64_t before dividing. Must be nonzero.
 * @return The smallest integer that is >= v / d.
 */
template <typename IntT>
inline int64_t ceilDiv(int64_t v, IntT d) {
  const int64_t divisor = static_cast<int64_t>(d);
  assert(divisor != 0);  // integer division by zero is undefined behavior

  const int64_t quotient = v / divisor;   // truncates toward zero
  const int64_t remainder = v % divisor;  // carries the sign of v when nonzero

  // Truncation already equals the ceiling when the exact quotient is negative.
  // Only a positive, inexact quotient must be bumped: that is when the
  // remainder is nonzero and v (hence the remainder) has the same sign as d.
  const bool roundUp = (remainder != 0) && ((remainder < 0) == (divisor < 0));
  return quotient + static_cast<int64_t>(roundUp);
}

/**
 * Given a flat buffer, compute a 2D grid of threads launchable by CUDA.
 * This makes it possible to launch grids with more than 65k blocks, which cannot be 1D with version < 3
 * (NOTE(review): presumably "version" means the device compute capability; confirm).
 * @param size Size of the flat buffer.
 * @param blockSize Number of threads in a block.
 * @return The grid dimensions, as a dim3.
 *
 * The grid may contain more blocks than strictly needed to cover the buffer.
 * Worst case number of uselessly launched blocks is:
 *    n(k) = 2 n(k-1) + 2^k  =>  n(k) = (k + 1) 2^k
 * with k the number of divisions by 2.
 * At the same time, the number of launched blocks is around:
 *    MAXGRIDDIM * 2^k
 * So the proportion of useless blocks is:
 *    (k + 1) 2^k / (MAXGRIDDIM * 2^k) = (k + 1) / MAXGRIDDIM
 * which remains reasonable. If we go to incredibly large images, one idea would be to
 * factor the block count into primes and build from the bottom up to avoid running empty blocks:
 *   l = prime_factors(ceilDiv(size))
 *   dim1 = 1
 *   while (dim1 * l.back() < MAXGRIDDIM) {
 *     dim1 *= l.back();
 *     l.pop();
 *   }
 */
dim3 compute2DGridForFlatBuffer(int64_t size, unsigned blockSize);

}  // namespace Cuda
}  // namespace VideoStitch