// Copyright (c) 2012-2017 VideoStitch SAS // Copyright (c) 2018 stitchEm #include "frameBuffer.hpp" #include "common/container.hpp" #include "gpu/image/downsampler.hpp" #include "image/unpack.hpp" #include "gpu/buffer.hpp" #include "gpu/hostBuffer.hpp" #include "gpu/memcpy.hpp" #include "gpu/util.hpp" #include "libvideostitch/logging.hpp" #include "libvideostitch/profile.hpp" #include <algorithm> #include <iostream> #include <thread> // TODO_OPENCL_IMPL // backend dependent code needs to be moved to the backend itself #ifdef VS_OPENCL #include <backend/cl/deviceBuffer.hpp> #else #include <backend/cuda/deviceBuffer.hpp> #include <backend/cuda/deviceStream.hpp> #endif namespace VideoStitch { namespace Core { /** * The product of the stitching process is a panorama in full-scale, RGBA210 pixel format on the GPU. * * Yet the user callbacks can consume: * - non-RGBA210 panorama (eg. RGBA8888 !) * - downsampled panorama (eg. panorama stitched in 4K but streamed in 2K on the network) * - host-based buffer (eg. callbacks which are not OpenGL renderers and hardware encoders) * * The conversion process is as follow: * |panorama| -- colorspace conversion --> |1| -- downsampling --> |2| -- upload to host --> |final panorama| * * The design enforces that by covering all operations needed through: * - a map colorspace -> FormatBuffer inside the FrameBuffer * - a map downsampling ratio -> DownsamplingBuffer inside the FormatBuffer * * The conversion is done through FrameBuffer::pushVideo(), which will call FormatBuffer::pushVideo() for every * colorspace requested by the user. Then FormatBuffer::pushVideo() will call downsample() then * DownsamplingBuffer::pushVideo() for every downsampling ratio requested by the user. Inside it MemcpyAsync() will be * called in case a host-based panorama has been requested too. */ struct DownsamplingBuffer { DownsamplingBuffer(int32_t w, int32_t h, int ratio, PixelFormat fmt) : ratio(ratio), fmt(fmt), width(w), height(h) {} ~DownsamplingBuffer() { hostBuffers[0].release(); hostBuffers[1].release(); hostBuffers[2].release(); downBuf[0].release(); downBuf[1].release(); downBuf[2].release(); } Status pushVideo(GPU::Buffer2D* data, GPU::Stream stream) { if (ratio > 1) { Image::downsample(fmt, data, downBuf, stream); } else { downBuf[0].swap(data[0]); downBuf[1].swap(data[1]); downBuf[2].swap(data[2]); } // upload to host if (hostBuffers[0].hostPtr()) { PROPAGATE_FAILURE_STATUS(GPU::memcpyAsync(hostBuffers[0].hostPtr(), downBuf[0], stream)); } if (hostBuffers[1].hostPtr()) { PROPAGATE_FAILURE_STATUS(GPU::memcpyAsync(hostBuffers[1].hostPtr(), downBuf[1], stream)); } if (hostBuffers[2].hostPtr()) { PROPAGATE_FAILURE_STATUS(GPU::memcpyAsync(hostBuffers[2].hostPtr(), downBuf[2], stream)); } ready = true; return Status::OK(); } #define ALLOCATE_DOWNBUFFER(host, dev, width, height, writer) \ { \ if (writer->getExpectedOutputBufferType() == Host) { \ auto potHostBuffer = \ GPU::HostBuffer<unsigned char>::allocate(width * height, "Panorama Frame", GPUHostAllocPinned); \ PROPAGATE_FAILURE_STATUS(potHostBuffer.status()); \ host.release(); \ host = potHostBuffer.value(); \ } \ auto potDevBuffer = GPU::Buffer2D::allocate(width, height, "Panorama Frame"); \ PROPAGATE_FAILURE_STATUS(potDevBuffer.status()); \ dev.release(); \ dev = potDevBuffer.value(); \ } template <typename Writer> Status registerWriter(std::shared_ptr<Writer> writer) { switch (fmt) { case RGBA: case BGRU: case F32_C1: ALLOCATE_DOWNBUFFER(hostBuffers[0], downBuf[0], width * 4, height, writer) break; case RGB: case BGR: ALLOCATE_DOWNBUFFER(hostBuffers[0], downBuf[0], width * 3, height, writer) break; case UYVY: case YUY2: case Grayscale16: ALLOCATE_DOWNBUFFER(hostBuffers[0], downBuf[0], width * 2, height, writer) break; case YV12: case DEPTH: ALLOCATE_DOWNBUFFER(hostBuffers[0], downBuf[0], width, height, writer) ALLOCATE_DOWNBUFFER(hostBuffers[1], downBuf[1], width / 2, height / 2, writer) ALLOCATE_DOWNBUFFER(hostBuffers[2], downBuf[2], width / 2, height / 2, writer) break; case NV12: ALLOCATE_DOWNBUFFER(hostBuffers[0], downBuf[0], width, height, writer) ALLOCATE_DOWNBUFFER(hostBuffers[1], downBuf[1], width, height / 2, writer) break; case YUV422P10: ALLOCATE_DOWNBUFFER(hostBuffers[0], downBuf[0], width * 2, height, writer) ALLOCATE_DOWNBUFFER(hostBuffers[1], downBuf[1], width, height, writer) ALLOCATE_DOWNBUFFER(hostBuffers[2], downBuf[2], width, height, writer) break; default: assert(false); return {Origin::Stitcher, ErrType::ImplementationError, "Unsupported colorspace for downsampling"}; } return Status::OK(); } Frame getFrame(AddressSpace addr) { Frame frame = {{nullptr, nullptr, nullptr}, {0, 0, 0}, width, height, -1, Unknown}; switch (addr) { case Host: frame.planes[0] = hostBuffers[0].hostPtr(); frame.pitches[0] = hostBuffers[0].byteSize() / frame.height; frame.planes[1] = hostBuffers[1].hostPtr(); frame.pitches[1] = hostBuffers[1].byteSize() / frame.height; frame.planes[2] = hostBuffers[2].hostPtr(); frame.pitches[2] = hostBuffers[2].byteSize() / frame.height; break; case Device: frame.planes[0] = downBuf[0].devicePtr(); frame.pitches[0] = downBuf[0].getPitch(); frame.planes[1] = downBuf[1].devicePtr(); frame.pitches[1] = downBuf[1].getPitch(); frame.planes[2] = downBuf[2].devicePtr(); frame.pitches[2] = downBuf[2].getPitch(); break; } return frame; } // This marker is used to avoid a race condition when a frame has been completely scheduled // for processing, but a new writer is registered while the GPU is being synchronized. // In that case, the frame buffer might miss eg. a new colorspace while the StitchOutput // expects to find it for the newly registered callback. // Thus, getBuffer might return null in this specific circumstance bool ready = false; int ratio; PixelFormat fmt; GPU::Buffer2D downBuf[3]; GPU::HostBuffer<unsigned char> hostBuffers[3]; int32_t width, height; private: DownsamplingBuffer(const DownsamplingBuffer&); DownsamplingBuffer& operator=(const DownsamplingBuffer&); }; // ---------------------- Colorspace conversion struct FormatBuffer { FormatBuffer(PixelFormat fmt, int64_t width, int64_t height) : pxFmt(fmt), width(width), height(height) {} ~FormatBuffer() { colorconvBuf[0].release(); colorconvBuf[1].release(); colorconvBuf[2].release(); deleteAllValues(downsamplers); } /** * Reads back the given device buffer and put in in the format buffer. */ template <typename Surface> Status pushVideo(Surface& data, GPU::Stream stream) { // convert colorspace switch (pxFmt) { case VideoStitch::PixelFormat::RGBA: PROPAGATE_FAILURE_STATUS(Image::unpackRGBA(colorconvBuf[0], data, width, height, stream)); break; case VideoStitch::PixelFormat::F32_C1: PROPAGATE_FAILURE_STATUS(Image::unpackF32C1(colorconvBuf[0], data, width, height, stream)); break; case VideoStitch::PixelFormat::Grayscale16: PROPAGATE_FAILURE_STATUS(Image::unpackGrayscale16(colorconvBuf[0], data, width, height, stream)); break; case VideoStitch::PixelFormat::DEPTH: PROPAGATE_FAILURE_STATUS( Image::unpackDepth(colorconvBuf[0], colorconvBuf[1], colorconvBuf[2], data, width, height, stream)); break; case VideoStitch::PixelFormat::RGB: { PROPAGATE_FAILURE_STATUS(Image::unpackRGB(colorconvBuf[0], data, width, height, stream)); break; } case VideoStitch::PixelFormat::YV12: { PROPAGATE_FAILURE_STATUS( Image::unpackYV12(colorconvBuf[0], colorconvBuf[1], colorconvBuf[2], data, width, height, stream)); break; } case VideoStitch::PixelFormat::NV12: { PROPAGATE_FAILURE_STATUS(Image::unpackNV12(colorconvBuf[0], colorconvBuf[1], data, width, height, stream)); break; } case VideoStitch::PixelFormat::UYVY: { PROPAGATE_FAILURE_STATUS(Image::unpackUYVY(colorconvBuf[0], data, width, height, stream)); break; } case VideoStitch::PixelFormat::YUY2: { PROPAGATE_FAILURE_STATUS(Image::unpackYUY2(colorconvBuf[0], data, width, height, stream)); break; } case VideoStitch::PixelFormat::YUV422P10: { PROPAGATE_FAILURE_STATUS( Image::unpackYUV422P10(colorconvBuf[0], colorconvBuf[1], colorconvBuf[2], data, width, height, stream)); break; } case VideoStitch::PixelFormat::Grayscale: return {Origin::Stitcher, ErrType::UnsupportedAction, "Stitching frames to grayscale writers not implemented"}; case VideoStitch::PixelFormat::BGRU: return {Origin::Stitcher, ErrType::UnsupportedAction, "Stitching frames to BGRU writers not implemented"}; case VideoStitch::PixelFormat::BGR: return {Origin::Stitcher, ErrType::UnsupportedAction, "Stitching frames to BGR writers not implemented"}; default: assert(false); } // downsample the result for (auto it = downsamplers.rbegin(); it != downsamplers.rend(); ++it) { PROPAGATE_FAILURE_STATUS(it->second->pushVideo(colorconvBuf, stream)); } return Status::OK(); } #define ALLOCATE_IMAGE(img, w, h) \ { \ PotentialValue<GPU::Buffer2D> pot = GPU::Buffer2D::allocate(w, h, "Colorspace conversion"); \ if (pot.ok()) { \ img.release(); \ img = pot.value(); \ } else { \ return pot.status(); \ } \ } template <typename Writer> Status registerWriter(std::shared_ptr<Writer> writer) { // allocate memory for colorspace conversion if (colorconvBuf[0].getWidth() == 0) { switch (pxFmt) { case RGBA: case BGRU: case F32_C1: ALLOCATE_IMAGE(colorconvBuf[0], width * 4, height) break; case RGB: case BGR: ALLOCATE_IMAGE(colorconvBuf[0], width * 3, height) break; case UYVY: case YUY2: case Grayscale16: ALLOCATE_IMAGE(colorconvBuf[0], width * 2, height) break; case YUV422P10: ALLOCATE_IMAGE(colorconvBuf[0], width * 2, height) ALLOCATE_IMAGE(colorconvBuf[1], width, height) ALLOCATE_IMAGE(colorconvBuf[2], width, height) break; case YV12: case DEPTH: ALLOCATE_IMAGE(colorconvBuf[0], width, height) ALLOCATE_IMAGE(colorconvBuf[1], width / 2, height / 2) ALLOCATE_IMAGE(colorconvBuf[2], width / 2, height / 2) break; case NV12: ALLOCATE_IMAGE(colorconvBuf[0], width, height) ALLOCATE_IMAGE(colorconvBuf[1], width, height / 2) break; default: assert(false); return {Origin::Stitcher, ErrType::ImplementationError, "Unsupported colorspace for output"}; } } // forward the writer to the target downsampler const size_t downsamplingRatio = width / writer->getPanoWidth(); DownsamplingBuffer*& db = downsamplers[downsamplingRatio]; if (db == nullptr) { db = new DownsamplingBuffer(writer->getWidth(), writer->getHeight(), (int)downsamplingRatio, writer->getPixelFormat()); } return downsamplers[downsamplingRatio]->registerWriter(writer); } Frame getFrame(AddressSpace addr, size_t ratio) { Frame f = downsamplers[ratio]->getFrame(addr); f.fmt = pxFmt; return f; } private: PixelFormat pxFmt; size_t width, height; // in pixels GPU::Buffer2D colorconvBuf[3]; std::map<size_t, DownsamplingBuffer*> downsamplers; }; // ---------------------- Frame buffer implementation template <typename Writer, typename Surface> Potential<FrameBuffer<Writer, Surface>> FrameBuffer<Writer, Surface>::create(std::shared_ptr<Surface> s) { return new FrameBuffer(s); } template <typename Writer, typename Surface> Potential<FrameBuffer<Writer, Surface>> FrameBuffer<Writer, Surface>::create(std::shared_ptr<Surface> s, std::shared_ptr<Writer> writer) { Potential<FrameBuffer> ret = create(s); if (!ret.ok()) { return ret; } FAIL_RETURN(ret->registerWriter(writer)); return ret; } template <typename Writer, typename Surface> Potential<FrameBuffer<Writer, Surface>> FrameBuffer<Writer, Surface>::create( std::shared_ptr<Surface> s, const std::vector<std::shared_ptr<Writer>>& writers) { Potential<FrameBuffer> ret = create(s); if (!ret.ok()) { return ret; } for (auto writer : writers) { FAIL_RETURN(ret->registerWriter(writer)); } return ret; } template <typename Writer, typename Surface> FrameBuffer<Writer, Surface>::FrameBuffer(std::shared_ptr<Surface> s) : date(-1), surf(s) {} template <typename Writer, typename Surface> FrameBuffer<Writer, Surface>::FrameBuffer(FrameBuffer<Writer, Surface>&& other) : date(-1), surf(other.surf) { { std::unique_lock<std::mutex> lock(other.mutex); std::swap(formatBuffers, other.formatBuffers); } } template <typename Writer, typename Surface> FrameBuffer<Writer, Surface>& FrameBuffer<Writer, Surface>::operator=(FrameBuffer<Writer, Surface>&& other) { if (this != &other) { std::unique_lock<std::mutex> lock(mutex); std::unique_lock<std::mutex> otherLock(other.mutex); std::swap(formatBuffers, other.formatBuffers); this->surf = other.surf; } return *this; } template <typename Writer, typename Surface> FrameBuffer<Writer, Surface>::~FrameBuffer() { deleteAllValues(formatBuffers); } template <typename Writer, typename Surface> Frame FrameBuffer<Writer, Surface>::getFrame(PixelFormat fmt, AddressSpace addr, size_t ratio) { return formatBuffers[fmt]->getFrame(addr, ratio); } template <typename Writer, typename Surface> Status FrameBuffer<Writer, Surface>::registerWriter(std::shared_ptr<Writer> writer) { // This should have been caught in Writer::create. assert((int)surf->getWidth() % (int)writer->getPanoWidth() == 0); std::unique_lock<std::mutex> lock(mutex); // Get or create the format buffer for the writer's format. FormatBuffer*& formatBuffer = formatBuffers[writer->getPixelFormat()]; // const int downsamplingFactor = (int)surf->getWidth() / (int)writer->getPanoWidth(); if (formatBuffer == nullptr) { // TODO: we should be able to simplify things to get rid of paddingTop in FormatBuffer. // Use the base padding (without downsampling). formatBuffer = new FormatBuffer(writer->getPixelFormat(), surf->getWidth(), surf->getHeight()); } return formatBuffer->registerWriter(writer); } // ---------------------- Implementations Potential<SourceFrameBuffer> SourceFrameBuffer::create(std::shared_ptr<SourceSurface> s) { return new SourceFrameBuffer(s); } Potential<SourceFrameBuffer> SourceFrameBuffer::create(std::shared_ptr<SourceSurface> s, std::shared_ptr<Output::VideoWriter> writer) { Potential<SourceFrameBuffer> ret = create(s); if (!ret.ok()) { return ret; } FAIL_RETURN(ret->registerWriter(writer)); return ret; } Potential<SourceFrameBuffer> SourceFrameBuffer::create( std::shared_ptr<SourceSurface> s, const std::vector<std::shared_ptr<Output::VideoWriter>>& writers) { Potential<SourceFrameBuffer> ret = create(s); if (!ret.ok()) { return ret; } for (auto writer : writers) { FAIL_RETURN(ret->registerWriter(writer)); } return ret; } Status SourceFrameBuffer::pushVideo() { std::unique_lock<std::mutex> lock(mutex); for (auto it = formatBuffers.begin(); it != formatBuffers.end(); ++it) { it->second->pushVideo(*surf->pimpl->surface, surf->pimpl->stream); } surf->pimpl->release(); return Status::OK(); } Status SourceFrameBuffer::pushOpenGLVideo() { return Status::OK(); } template <typename Writer> Potential<StitchFrameBuffer<Writer>> StitchFrameBuffer<Writer>::create(std::shared_ptr<PanoSurface> s) { return new StitchFrameBuffer(s); } template <typename Writer> Potential<StitchFrameBuffer<Writer>> StitchFrameBuffer<Writer>::create(std::shared_ptr<PanoSurface> s, std::shared_ptr<Writer> writer) { Potential<StitchFrameBuffer> ret = create(s); if (!ret.ok()) { return ret; } FAIL_RETURN(ret->registerWriter(writer)); return ret; } template <typename Writer> Potential<StitchFrameBuffer<Writer>> StitchFrameBuffer<Writer>::create( std::shared_ptr<PanoSurface> s, const std::vector<std::shared_ptr<Writer>>& writers) { Potential<StitchFrameBuffer> ret = create(s); if (!ret.ok()) { return ret; } for (auto writer : writers) { FAIL_RETURN(ret->registerWriter(writer)); } return ret; } template <typename Writer> Status StitchFrameBuffer<Writer>::pushVideo() { std::unique_lock<std::mutex> lock(this->mutex); this->surf->pimpl->flatten(); for (auto it = this->formatBuffers.begin(); it != this->formatBuffers.end(); ++it) { it->second->pushVideo(this->surf->pimpl->buffer, this->surf->pimpl->stream); } return Status::OK(); } template <typename Writer> Status StitchFrameBuffer<Writer>::pushOpenGLVideo() { this->oglSurf->pimpl->acquireWriter(); this->oglSurf->pimpl->acquire(); std::unique_lock<std::mutex> lock(this->mutex); for (auto it = this->formatBuffers.begin(); it != this->formatBuffers.end(); ++it) { it->second->pushVideo(this->oglSurf->pimpl->buffer, this->oglSurf->pimpl->stream); } this->oglSurf->pimpl->releaseWriter(); this->oglSurf->pimpl->release(); return Status::OK(); } // explicit instantiations template class FrameBuffer<Output::VideoWriter, SourceSurface>; template class FrameBuffer<Output::VideoWriter, PanoSurface>; template class FrameBuffer<Output::StereoWriter, PanoSurface>; template class StitchFrameBuffer<Output::VideoWriter>; template class StitchFrameBuffer<Output::StereoWriter>; } // namespace Core } // namespace VideoStitch