// Copyright (c) 2012-2017 VideoStitch SAS
// Copyright (c) 2018 stitchEm

#include "memory.hpp"

#include "libvideostitch/logging.hpp"
#include "backend/common/allocStats.hpp"

#include <cuda_runtime.h>

#include "error.hpp"

#include <cassert>
#include <iomanip>
#include <iostream>
#include <map>
#include <string>
#include <mutex>

// If the following is defined, all cudaMallocXxxVS functions bin memory allocations into named pools for memory
// debugging. In that case, a global lock is used, which makes it possible to share pools between threads without
// caring about thread-safety.
#define USE_VS_MALLOC_POOLS

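// Make sure a max() macro (e.g. as defined by <windows.h>) is not in effect below.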
#undef max

namespace VideoStitch {
namespace Cuda {

#ifdef USE_VS_MALLOC_POOLS
namespace {
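// Global allocation trackers shared by all threads (see the note on USE_VS_MALLOC_POOLS above).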
AllocStatsMap deviceStats("Device");
AllocStatsMap hostStats("Host");
}  // namespace
#endif  // USE_VS_MALLOC_POOLS

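// Total number of bytes currently tracked in the device pools (0 when pooling is disabled).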
std::size_t getDevicePoolCurrentSize(void) {
#ifdef USE_VS_MALLOC_POOLS
  return deviceStats.bytesUsed();
#else
  return 0;
#endif
}

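// Per-device breakdown of the bytes currently tracked in the device pools (empty when pooling is disabled).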
std::vector<std::size_t> getDevicePoolCurrentSizeByDevices(void) {
#ifdef USE_VS_MALLOC_POOLS
  return deviceStats.bytesUsedByDevices();
#else
  return std::vector<std::size_t>();
#endif
}

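// Total number of bytes of pinned host memory currently tracked in the host pools (0 when pooling is disabled).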
std::size_t getHostPoolCurrentSize(void) {
#ifdef USE_VS_MALLOC_POOLS
  return hostStats.bytesUsed();
#else
  return 0;
#endif
}

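// Per-device breakdown of the pinned host memory currently tracked in the host pools (empty when pooling is disabled).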
std::vector<std::size_t> getHostPoolCurrentSizeByDevices(void) {
#ifdef USE_VS_MALLOC_POOLS
  return hostStats.bytesUsedByDevices();
#else
  return std::vector<std::size_t>();
#endif
}

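// Dumps the current device pool statistics to stdout (no-op when pooling is disabled).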
void printDevicePool() {
#ifdef USE_VS_MALLOC_POOLS
  deviceStats.print(std::cout);
#endif
}

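// Dumps the current host pool statistics to stdout (no-op when pooling is disabled).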
void printHostPool() {
#ifdef USE_VS_MALLOC_POOLS
  hostStats.print(std::cout);
#endif
}

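// In debug builds, PRINT_FILELINE appends the allocation call site to the error log and FILELINE_ARGS names the
// corresponding parameters; in release builds the parameters are kept (so the signatures do not change) but unused.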
#ifndef NDEBUG
#define PRINT_FILELINE                                                                                 \
  if (file) {                                                                                          \
    Logger::get(VideoStitch::Logger::Error) << " (at " << file << ", l. " << line << ")" << std::endl; \
  };
#define FILELINE_ARGS const char *file, int line
#else
#define PRINT_FILELINE
#define FILELINE_ARGS const char* /*file*/, int /*line*/
#endif

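// Allocates `size` bytes of device memory into `*buf` and, when USE_VS_MALLOC_POOLS is defined, records the
// allocation in the device pool named `name`. Callers normally go through the cudaMallocXxxVS wrappers declared in
// memory.hpp, which typically supply __FILE__/__LINE__; a direct call would look roughly like the following sketch
// ("warpBuffer" is just an illustrative pool name):
//
//   void* devBuffer = nullptr;
//   if (Cuda::__mallocVS(&devBuffer, bytes, "warpBuffer", 0, __FILE__, __LINE__).ok()) {
//     // ... use devBuffer ...
//     Cuda::freeVS(devBuffer);
//   }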
Status __mallocVS(void** buf, size_t size, const char* name, unsigned /*flagsUnused*/, FILELINE_ARGS) {
  if (!CUDA_ERROR(cudaMalloc(buf, size)).ok()) {
    Logger::get(VideoStitch::Logger::Error) << "Could not allocate " << size << " bytes of GPU memory.";
    PRINT_FILELINE
    Logger::get(VideoStitch::Logger::Error) << std::endl;
    return {Origin::GPU, ErrType::OutOfResources, "Could not allocate GPU memory"};
  } else {
#ifdef USE_VS_MALLOC_POOLS
    deviceStats.addPtr(name, *buf, size);
#endif
    return Status::OK();
  }
}

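// Drops the pointer from the device pool statistics and releases it with cudaFree. A null pointer is accepted and
// ignored.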
Status freeVS(void* buf) {
#ifdef USE_VS_MALLOC_POOLS
  deviceStats.deletePtr(buf);
#endif
  if (buf) {
    return CUDA_ERROR(cudaFree(buf));
  } else {
    return Status::OK();
  }
}

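// Allocates `size` bytes of pinned (page-locked) host memory via cudaHostAlloc with the given flags and records the
// allocation in the host pool named `name`. Zero-sized requests are rejected as an implementation error.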
Status __mallocHostVS(void** buf, size_t size, const char* name, unsigned flags, FILELINE_ARGS) {
  if (size == 0) {
    *buf = nullptr;
    return {Origin::GPU, ErrType::ImplementationError, "Cannot allocate 0-size buffer"};
  }

  if (!CUDA_ERROR(cudaHostAlloc(buf, size, flags)).ok()) {
    Logger::get(VideoStitch::Logger::Error) << "Could not allocate " << size << " bytes of pinned CPU memory.";
    PRINT_FILELINE
    Logger::get(VideoStitch::Logger::Error) << std::endl;
    return {Origin::GPU, ErrType::OutOfResources, "Could not allocate pinned memory"};
  } else {
#ifdef USE_VS_MALLOC_POOLS
    hostStats.addPtr(name, *buf, size);
#endif
    return Status::OK();
  }
}

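// Drops the pointer from the host pool statistics and releases it with cudaFreeHost. A null pointer is accepted and
// ignored.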
Status freeHostVS(void* buf) {
#ifdef USE_VS_MALLOC_POOLS
  hostStats.deletePtr(buf);
#endif
  if (buf) {
    return CUDA_ERROR(cudaFreeHost(buf));
  } else {
    return Status::OK();
  }
}

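// Allocates a width x height CUDA array with the given channel format. CUDA arrays are not tracked in the allocation
// pools, so the `name` parameter is currently unused.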
Status __mallocArrayVS(struct cudaArray** array, const struct cudaChannelFormatDesc* desc, size_t width, size_t height,
                       unsigned int flags, const char* /*name*/, FILELINE_ARGS) {
  if (!CUDA_ERROR(cudaMallocArray(array, desc, width, height, flags)).ok()) {
    Logger::get(VideoStitch::Logger::Error)
        << "Could not allocate CUDA array of size " << width << " x " << height << ".";
    PRINT_FILELINE
    Logger::get(VideoStitch::Logger::Error) << std::endl;
    return {Origin::GPU, ErrType::OutOfResources, "Could not allocate CUDA array"};
  } else {
    return Status::OK();
  }
}

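// Releases a CUDA array allocated by __mallocArrayVS. A null array is accepted and ignored.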
Status freeArrayVS(struct cudaArray* array) {
  if (array) {
    return CUDA_ERROR(cudaFreeArray(array));
  } else {
    return Status::OK();
  }
}

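// Debug helper: logs the requested size and the call site, then forwards to cudaMalloc. The allocation is not
// tracked in the pools.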
Status __mallocPrint(void** p, size_t size, const char* file, const int line) {
  VideoStitch::Logger::get(VideoStitch::Logger::Debug)
      << "Alloc " << size << " CUDA bytes (" << size / (1024 * 1024) << " MB) at " << file << ":" << line << std::endl;
  return CUDA_ERROR(cudaMalloc(p, size));
}
}  // namespace Cuda
}  // namespace VideoStitch