// Copyright (c) 2012-2017 VideoStitch SAS // Copyright (c) 2018 stitchEm #include "gpu/stream.hpp" #include "deviceStream.hpp" #include "deviceEvent.hpp" #include "cuda/error.hpp" #if (_MSC_VER && _MSC_VER < 1900) #include <mutex> #endif #include <cuda_runtime.h> namespace VideoStitch { namespace GPU { Stream::Stream() : pimpl(nullptr) {} Stream::Stream(cudaStream_t cudaStream) : Stream() { delete pimpl; pimpl = new DeviceStream(cudaStream); } void Stream::destroyDeprecatedCUDAWrapper() { delete pimpl; pimpl = nullptr; } #if (_MSC_VER && _MSC_VER < 1900) // C++11 magic statics supported in Visual Studio 2015 and later static std::mutex defaultStreamInitMutex; #endif Stream Stream::getDefault() { #if (_MSC_VER && _MSC_VER < 1900) std::lock_guard<std::mutex> initLock(defaultStreamInitMutex); #endif // CUDA default Stream is NULL, keep it that way for compatibility // might in the future just create any stream and merge implementation // with OpenCL cudaStream_t s = NULL; static Stream defaultStream = Stream(s); return defaultStream; } PotentialValue<Stream> Stream::create() { cudaStream_t cuStream; auto status = CUDA_ERROR(cudaStreamCreateWithFlags(&cuStream, cudaStreamNonBlocking)); if (status.ok()) { auto stream = Stream(); delete stream.pimpl; stream.pimpl = new DeviceStream(cuStream); return PotentialValue<Stream>(stream); } return PotentialValue<Stream>(status); } Status Stream::destroy() { assert(*this != getDefault()); if (!pimpl) { return {Origin::GPU, ErrType::ImplementationError, "Trying to destroy an uninitialized GPU Stream"}; } auto status = CUDA_ERROR(cudaStreamDestroy(*pimpl)); delete pimpl; pimpl = nullptr; return status; } PotentialValue<Event> Stream::recordEvent() const { cudaEvent_t event; FAIL_RETURN(CUDA_ERROR(cudaEventCreate(&event, cudaEventBlockingSync | cudaEventDisableTiming))); FAIL_RETURN(CUDA_ERROR(cudaEventRecord(event, get()))); return Event::DeviceEvent::create(event); } Status Stream::synchronize() const { // Source tab on Vahana, on the stitching box (at least on the surface prototype, which is much better at GPU // occupancy), was yielding a 200% CPU usage, keeping 2 cores busy for nothing. The software was busy-waiting for the // GPU to complete its work, burning a lot of CPU in the process. Forcing waiting on event instead of the whole stream // seems like a hack, but the CPU usage dropped to 15%. Events have a fine granularity with regard to interaction with // the OS scheduler. Created with the BlockingSync flag, they will wait instead of spinning. This theoretically // increases the latency (think the order of a context-switch duration), but the trade-off is the CPU is completely // free to do something else in the meantime. cudaEvent_t event; FAIL_RETURN(CUDA_ERROR(cudaEventCreate(&event, cudaEventBlockingSync | cudaEventDisableTiming))); FAIL_RETURN(CUDA_ERROR(cudaEventRecord(event, get()))); FAIL_RETURN(CUDA_ERROR(cudaEventSynchronize(event))); return CUDA_ERROR(cudaEventDestroy(event)); } Status Stream::flush() const { return Status::OK(); } Status Stream::waitOnEvent(Event event) const { return CUDA_ERROR(cudaStreamWaitEvent(get(), *(event.get().event), 0)); } const Stream::DeviceStream& Stream::get() const { // Every Stream needs to be initialized properly assert(pimpl); return *pimpl; } bool Stream::operator==(const Stream& other) const { if (pimpl && other.pimpl) { return *pimpl == *other.pimpl; } return !pimpl && !other.pimpl; } } // namespace GPU } // namespace VideoStitch