From dd157c0c5e55c7980576f8df972f41e036a1abdb Mon Sep 17 00:00:00 2001 From: ns6089 <61738816+ns6089@users.noreply.github.com> Date: Sat, 19 Aug 2023 12:40:31 +0300 Subject: [PATCH] Initial implementation of yuv444 encoding --- src/nvenc/nvenc_base.cpp | 25 +- src/nvenc/nvenc_base.h | 10 +- src/nvenc/nvenc_d3d11.cpp | 54 --- src/nvenc/nvenc_d3d11.h | 21 +- src/nvenc/nvenc_d3d11_native.cpp | 67 ++++ src/nvenc/nvenc_d3d11_native.h | 28 ++ src/nvenc/nvenc_d3d11_on_cuda.cpp | 245 ++++++++++++ src/nvenc/nvenc_d3d11_on_cuda.h | 56 +++ src/nvenc/nvenc_utils.cpp | 12 + src/nvhttp.cpp | 10 +- src/platform/common.h | 6 + src/platform/windows/display_vram.cpp | 348 ++++++++++++++---- src/rtsp.cpp | 2 + src/video.cpp | 130 +++++-- src/video.h | 18 +- src/video_colorspace.cpp | 24 ++ src/video_colorspace.h | 2 + .../convert_yuv444_packed_ayuv_ps.hlsl | 3 + .../convert_yuv444_packed_ayuv_ps_linear.hlsl | 3 + .../directx/convert_yuv444_packed_vs.hlsl | 10 + .../convert_yuv444_packed_y410_ps.hlsl | 4 + .../convert_yuv444_packed_y410_ps_linear.hlsl | 4 + ...4_packed_y410_ps_perceptual_quantizer.hlsl | 4 + .../directx/convert_yuv444_planar_ps.hlsl | 4 + .../convert_yuv444_planar_ps_linear.hlsl | 4 + ...yuv444_planar_ps_perceptual_quantizer.hlsl | 4 + .../directx/convert_yuv444_planar_vs.hlsl | 36 ++ .../shaders/directx/include/base_vs.hlsl | 2 +- .../directx/include/base_vs_types.hlsl | 5 + .../include/convert_yuv444_ps_base.hlsl | 45 +++ third-party/moonlight-common-c | 2 +- 31 files changed, 996 insertions(+), 192 deletions(-) create mode 100644 src/nvenc/nvenc_d3d11_native.cpp create mode 100644 src/nvenc/nvenc_d3d11_native.h create mode 100644 src/nvenc/nvenc_d3d11_on_cuda.cpp create mode 100644 src/nvenc/nvenc_d3d11_on_cuda.h create mode 100644 src_assets/windows/assets/shaders/directx/convert_yuv444_packed_ayuv_ps.hlsl create mode 100644 src_assets/windows/assets/shaders/directx/convert_yuv444_packed_ayuv_ps_linear.hlsl create mode 100644 
src_assets/windows/assets/shaders/directx/convert_yuv444_packed_vs.hlsl create mode 100644 src_assets/windows/assets/shaders/directx/convert_yuv444_packed_y410_ps.hlsl create mode 100644 src_assets/windows/assets/shaders/directx/convert_yuv444_packed_y410_ps_linear.hlsl create mode 100644 src_assets/windows/assets/shaders/directx/convert_yuv444_packed_y410_ps_perceptual_quantizer.hlsl create mode 100644 src_assets/windows/assets/shaders/directx/convert_yuv444_planar_ps.hlsl create mode 100644 src_assets/windows/assets/shaders/directx/convert_yuv444_planar_ps_linear.hlsl create mode 100644 src_assets/windows/assets/shaders/directx/convert_yuv444_planar_ps_perceptual_quantizer.hlsl create mode 100644 src_assets/windows/assets/shaders/directx/convert_yuv444_planar_vs.hlsl create mode 100644 src_assets/windows/assets/shaders/directx/include/convert_yuv444_ps_base.hlsl diff --git a/src/nvenc/nvenc_base.cpp b/src/nvenc/nvenc_base.cpp index b9eba5a04df..5d6d555ee61 100644 --- a/src/nvenc/nvenc_base.cpp +++ b/src/nvenc/nvenc_base.cpp @@ -81,9 +81,8 @@ namespace { namespace nvenc { - nvenc_base::nvenc_base(NV_ENC_DEVICE_TYPE device_type, void *device): - device_type(device_type), - device(device) { + nvenc_base::nvenc_base(NV_ENC_DEVICE_TYPE device_type): + device_type(device_type) { } nvenc_base::~nvenc_base() { @@ -172,7 +171,7 @@ namespace nvenc { }; auto buffer_is_yuv444 = [&]() { - return buffer_format == NV_ENC_BUFFER_FORMAT_YUV444 || buffer_format == NV_ENC_BUFFER_FORMAT_YUV444_10BIT; + return buffer_format == NV_ENC_BUFFER_FORMAT_AYUV || buffer_format == NV_ENC_BUFFER_FORMAT_YUV444_10BIT; }; { @@ -278,7 +277,7 @@ namespace nvenc { } }; - auto fill_h264_hevc_vui = [&colorspace](auto &vui_config) { + auto fill_h264_hevc_vui = [&](auto &vui_config) { vui_config.videoSignalTypePresentFlag = 1; vui_config.videoFormat = NV_ENC_VUI_VIDEO_FORMAT_UNSPECIFIED; vui_config.videoFullRangeFlag = colorspace.full_range; @@ -286,7 +285,7 @@ namespace nvenc { 
vui_config.colourPrimaries = colorspace.primaries; vui_config.transferCharacteristics = colorspace.tranfer_function; vui_config.colourMatrix = colorspace.matrix; - vui_config.chromaSampleLocationFlag = 1; + vui_config.chromaSampleLocationFlag = buffer_is_yuv444() ? 0 : 1; vui_config.chromaSampleLocationTop = 0; vui_config.chromaSampleLocationBot = 0; }; @@ -327,7 +326,9 @@ namespace nvenc { auto &format_config = enc_config.encodeCodecConfig.av1Config; format_config.repeatSeqHdr = 1; format_config.idrPeriod = NVENC_INFINITE_GOPLENGTH; - format_config.chromaFormatIDC = 1; // YUV444 not supported by NVENC yet + if (buffer_is_yuv444()) { + format_config.chromaFormatIDC = 3; + } format_config.enableBitstreamPadding = config.insert_filler_data; if (buffer_is_10bit()) { format_config.inputPixelBitDepthMinus8 = 2; @@ -337,7 +338,7 @@ namespace nvenc { format_config.transferCharacteristics = colorspace.tranfer_function; format_config.matrixCoefficients = colorspace.matrix; format_config.colorRange = colorspace.full_range; - format_config.chromaSamplePosition = 1; + format_config.chromaSamplePosition = buffer_is_yuv444() ? 
0 : 1; set_ref_frames(format_config.maxNumRefFramesInDPB, format_config.numFwdRefs, 8); set_minqp_if_enabled(config.min_qp_av1); @@ -386,6 +387,7 @@ namespace nvenc { { std::string extra; if (init_params.enableEncodeAsync) extra += " async"; + if (buffer_is_yuv444()) extra += " yuv444"; if (buffer_is_10bit()) extra += " 10-bit"; if (enc_config.rcParams.multiPass != NV_ENC_MULTI_PASS_DISABLED) extra += " two-pass"; if (config.vbv_percentage_increase > 0 && get_encoder_cap(NV_ENC_CAPS_SUPPORT_CUSTOM_VBV_BUF_SIZE)) extra += " vbv+" + std::to_string(config.vbv_percentage_increase); @@ -435,6 +437,11 @@ namespace nvenc { assert(registered_input_buffer); assert(output_bitstream); + if (!synchronize_input_buffer()) { + BOOST_LOG(error) << "NvEnc: failed to synchronize input buffer"; + return {}; + } + NV_ENC_MAP_INPUT_RESOURCE mapped_input_buffer = { min_struct_version(NV_ENC_MAP_INPUT_RESOURCE_VER) }; mapped_input_buffer.registeredResource = registered_input_buffer; @@ -589,10 +596,12 @@ namespace nvenc { last_error_string.clear(); if (status != NV_ENC_SUCCESS) { + /* This API function gives broken strings more often than not if (nvenc && encoder) { last_error_string = nvenc->nvEncGetLastErrorString(encoder); if (!last_error_string.empty()) last_error_string += " "; } + */ last_error_string += status_string(status); return true; } diff --git a/src/nvenc/nvenc_base.h b/src/nvenc/nvenc_base.h index 2d012ef8da8..98f2c0e3652 100644 --- a/src/nvenc/nvenc_base.h +++ b/src/nvenc/nvenc_base.h @@ -13,7 +13,7 @@ namespace nvenc { class nvenc_base { public: - nvenc_base(NV_ENC_DEVICE_TYPE device_type, void *device); + nvenc_base(NV_ENC_DEVICE_TYPE device_type); virtual ~nvenc_base(); nvenc_base(const nvenc_base &) = delete; @@ -39,6 +39,9 @@ namespace nvenc { virtual bool create_and_register_input_buffer() = 0; + virtual bool + synchronize_input_buffer() { return true; } + virtual bool wait_for_async_event(uint32_t timeout_ms) { return false; } @@ -57,9 +60,6 @@ namespace nvenc { 
min_struct_version(uint32_t version, uint32_t v11_struct_version = 0, uint32_t v12_struct_version = 0); const NV_ENC_DEVICE_TYPE device_type; - void *const device; - - std::unique_ptr nvenc; void *encoder = nullptr; @@ -72,6 +72,8 @@ namespace nvenc { } encoder_params; // Derived classes set these variables + void *device = nullptr; + std::shared_ptr nvenc; NV_ENC_REGISTERED_PTR registered_input_buffer = nullptr; void *async_event_handle = nullptr; diff --git a/src/nvenc/nvenc_d3d11.cpp b/src/nvenc/nvenc_d3d11.cpp index cb33a1801af..301b6f064b8 100644 --- a/src/nvenc/nvenc_d3d11.cpp +++ b/src/nvenc/nvenc_d3d11.cpp @@ -1,31 +1,15 @@ -#include "src/logging.h" - #ifdef _WIN32 #include "nvenc_d3d11.h" - #include "nvenc_utils.h" - namespace nvenc { - nvenc_d3d11::nvenc_d3d11(ID3D11Device *d3d_device): - nvenc_base(NV_ENC_DEVICE_TYPE_DIRECTX, d3d_device), - d3d_device(d3d_device) { - } - nvenc_d3d11::~nvenc_d3d11() { - if (encoder) destroy_encoder(); - if (dll) { FreeLibrary(dll); dll = NULL; } } - ID3D11Texture2D * - nvenc_d3d11::get_input_texture() { - return d3d_input_texture.GetInterfacePtr(); - } - bool nvenc_d3d11::init_library() { if (dll) return true; @@ -64,43 +48,5 @@ namespace nvenc { return false; } - bool - nvenc_d3d11::create_and_register_input_buffer() { - if (!d3d_input_texture) { - D3D11_TEXTURE2D_DESC desc = {}; - desc.Width = encoder_params.width; - desc.Height = encoder_params.height; - desc.MipLevels = 1; - desc.ArraySize = 1; - desc.Format = dxgi_format_from_nvenc_format(encoder_params.buffer_format); - desc.SampleDesc.Count = 1; - desc.Usage = D3D11_USAGE_DEFAULT; - desc.BindFlags = D3D11_BIND_RENDER_TARGET; - if (d3d_device->CreateTexture2D(&desc, nullptr, &d3d_input_texture) != S_OK) { - BOOST_LOG(error) << "NvEnc: couldn't create input texture"; - return false; - } - } - - if (!registered_input_buffer) { - NV_ENC_REGISTER_RESOURCE register_resource = { min_struct_version(NV_ENC_REGISTER_RESOURCE_VER, 3, 4) }; - register_resource.resourceType = 
NV_ENC_INPUT_RESOURCE_TYPE_DIRECTX; - register_resource.width = encoder_params.width; - register_resource.height = encoder_params.height; - register_resource.resourceToRegister = d3d_input_texture.GetInterfacePtr(); - register_resource.bufferFormat = encoder_params.buffer_format; - register_resource.bufferUsage = NV_ENC_INPUT_IMAGE; - - if (nvenc_failed(nvenc->nvEncRegisterResource(encoder, ®ister_resource))) { - BOOST_LOG(error) << "NvEncRegisterResource failed: " << last_error_string; - return false; - } - - registered_input_buffer = register_resource.registeredResource; - } - - return true; - } - } // namespace nvenc #endif diff --git a/src/nvenc/nvenc_d3d11.h b/src/nvenc/nvenc_d3d11.h index ef1b8d4c232..a3d9d5a82e1 100644 --- a/src/nvenc/nvenc_d3d11.h +++ b/src/nvenc/nvenc_d3d11.h @@ -10,25 +10,24 @@ namespace nvenc { _COM_SMARTPTR_TYPEDEF(ID3D11Device, IID_ID3D11Device); _COM_SMARTPTR_TYPEDEF(ID3D11Texture2D, IID_ID3D11Texture2D); + _COM_SMARTPTR_TYPEDEF(IDXGIDevice, IID_IDXGIDevice); + _COM_SMARTPTR_TYPEDEF(IDXGIAdapter, IID_IDXGIAdapter); - class nvenc_d3d11 final: public nvenc_base { + class nvenc_d3d11: public nvenc_base { public: - nvenc_d3d11(ID3D11Device *d3d_device); - ~nvenc_d3d11(); + nvenc_d3d11(NV_ENC_DEVICE_TYPE device_type): + nvenc_base(device_type) {} - ID3D11Texture2D * - get_input_texture(); + virtual ~nvenc_d3d11(); - private: - bool - init_library() override; + virtual ID3D11Texture2D * + get_input_texture() = 0; + protected: bool - create_and_register_input_buffer() override; + init_library() override; HMODULE dll = NULL; - const ID3D11DevicePtr d3d_device; - ID3D11Texture2DPtr d3d_input_texture; }; } // namespace nvenc diff --git a/src/nvenc/nvenc_d3d11_native.cpp b/src/nvenc/nvenc_d3d11_native.cpp new file mode 100644 index 00000000000..158e4ca0296 --- /dev/null +++ b/src/nvenc/nvenc_d3d11_native.cpp @@ -0,0 +1,67 @@ +#ifdef _WIN32 + #include "nvenc_d3d11_native.h" + + #include "nvenc_utils.h" + +namespace nvenc { + + 
nvenc_d3d11_native::nvenc_d3d11_native(ID3D11Device *d3d_device):
+      nvenc_d3d11(NV_ENC_DEVICE_TYPE_DIRECTX),
+      d3d_device(d3d_device) {
+    device = d3d_device;
+  }
+
+  nvenc_d3d11_native::~nvenc_d3d11_native() {
+    if (encoder) destroy_encoder();
+  }
+
+  ID3D11Texture2D *
+  nvenc_d3d11_native::get_input_texture() {
+    return d3d_input_texture.GetInterfacePtr();
+  }
+
+  bool
+  nvenc_d3d11_native::create_and_register_input_buffer() {
+    if (encoder_params.buffer_format == NV_ENC_BUFFER_FORMAT_YUV444_10BIT) {
+      BOOST_LOG(error) << "NvEnc: 10-bit 4:4:4 encoding is incompatible with D3D11 surface formats, use CUDA interop";
+      return false;
+    }
+
+    if (!d3d_input_texture) {
+      D3D11_TEXTURE2D_DESC desc = {};
+      desc.Width = encoder_params.width;
+      desc.Height = encoder_params.height;
+      desc.MipLevels = 1;
+      desc.ArraySize = 1;
+      desc.Format = dxgi_format_from_nvenc_format(encoder_params.buffer_format);
+      desc.SampleDesc.Count = 1;
+      desc.Usage = D3D11_USAGE_DEFAULT;
+      desc.BindFlags = D3D11_BIND_RENDER_TARGET;
+      if (d3d_device->CreateTexture2D(&desc, nullptr, &d3d_input_texture) != S_OK) {
+        BOOST_LOG(error) << "NvEnc: couldn't create input texture";
+        return false;
+      }
+    }
+
+    if (!registered_input_buffer) {
+      NV_ENC_REGISTER_RESOURCE register_resource = { min_struct_version(NV_ENC_REGISTER_RESOURCE_VER, 3, 4) };
+      register_resource.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_DIRECTX;
+      register_resource.width = encoder_params.width;
+      register_resource.height = encoder_params.height;
+      register_resource.resourceToRegister = d3d_input_texture.GetInterfacePtr();
+      register_resource.bufferFormat = encoder_params.buffer_format;
+      register_resource.bufferUsage = NV_ENC_INPUT_IMAGE;
+
+      if (nvenc_failed(nvenc->nvEncRegisterResource(encoder, &register_resource))) {
+        BOOST_LOG(error) << "NvEncRegisterResource failed: " << last_error_string;
+        return false;
+      }
+
+      registered_input_buffer = register_resource.registeredResource;
+    }
+
+    return true;
+  }
+
+}  // namespace nvenc
+#endif
diff --git a/src/nvenc/nvenc_d3d11_native.h b/src/nvenc/nvenc_d3d11_native.h
new file mode 100644
index 00000000000..e7f9f898f73
--- /dev/null
+++ b/src/nvenc/nvenc_d3d11_native.h
@@ -0,0 +1,28 @@
+#pragma once
+#ifdef _WIN32
+
+  #include <comdef.h>
+  #include <d3d11.h>
+
+  #include "nvenc_d3d11.h"
+
+namespace nvenc {
+
+  class nvenc_d3d11_native final: public nvenc_d3d11 {
+  public:
+    nvenc_d3d11_native(ID3D11Device *d3d_device);
+    ~nvenc_d3d11_native();
+
+    ID3D11Texture2D *
+    get_input_texture() override;
+
+  private:
+    bool
+    create_and_register_input_buffer() override;
+
+    const ID3D11DevicePtr d3d_device;
+    ID3D11Texture2DPtr d3d_input_texture;
+  };
+
+}  // namespace nvenc
+#endif
diff --git a/src/nvenc/nvenc_d3d11_on_cuda.cpp b/src/nvenc/nvenc_d3d11_on_cuda.cpp
new file mode 100644
index 00000000000..aaee370f623
--- /dev/null
+++ b/src/nvenc/nvenc_d3d11_on_cuda.cpp
@@ -0,0 +1,245 @@
+#ifdef _WIN32
+  #include "nvenc_d3d11_on_cuda.h"
+
+  #include "nvenc_utils.h"
+
+namespace {
+
+  template <class T>
+  struct push_context {
+    push_context(const T &cuda, CUcontext context):
+        cuda(cuda) {
+      if (cuda.cuCtxPushCurrent(context) == CUDA_SUCCESS) {
+        pushed_context = context;
+      }
+    }
+
+    ~push_context() {
+      if (pushed_context) {
+        cuda.cuCtxPopCurrent(nullptr);
+      }
+    }
+
+    operator bool() const {
+      return pushed_context != nullptr;
+    }
+
+    CUcontext pushed_context = nullptr;
+    const T &cuda;
+  };
+}  // namespace
+
+namespace nvenc {
+
+  nvenc_d3d11_on_cuda::nvenc_d3d11_on_cuda(ID3D11Device *d3d_device):
+      nvenc_d3d11(NV_ENC_DEVICE_TYPE_CUDA),
+      d3d_device(d3d_device) {
+  }
+
+  nvenc_d3d11_on_cuda::~nvenc_d3d11_on_cuda() {
+    if (encoder) destroy_encoder();
+
+    if (cuda_context) {
+      {
+        auto context = push_context(cuda_functions, cuda_context);
+
+        if (cuda_d3d_input_texture) {
+          cuda_functions.cuGraphicsUnregisterResource(cuda_d3d_input_texture);
+          cuda_d3d_input_texture = nullptr;
+        }
+
+        if (cuda_surface) {
+          cuda_functions.cuMemFree(cuda_surface);
+          cuda_surface = 0;
+        }
+      }
+
+      
cuda_functions.cuCtxDestroy(cuda_context);
+      cuda_context = nullptr;
+    }
+
+    if (cuda_functions.dll) {
+      FreeLibrary(cuda_functions.dll);
+      cuda_functions = {};
+    }
+  }
+
+  ID3D11Texture2D *
+  nvenc_d3d11_on_cuda::get_input_texture() {
+    return d3d_input_texture.GetInterfacePtr();
+  }
+
+  bool
+  nvenc_d3d11_on_cuda::init_library() {
+    if (!nvenc_d3d11::init_library()) return false;
+
+    auto dll_name = "nvcuda.dll";
+
+    if ((cuda_functions.dll = LoadLibraryEx(dll_name, NULL, LOAD_LIBRARY_SEARCH_SYSTEM32))) {
+      auto load_function = [&]<typename T>(T &location, auto symbol) -> bool {
+        location = (T) GetProcAddress(cuda_functions.dll, symbol);
+        return location != nullptr;
+      };
+      if (!load_function(cuda_functions.cuInit, "cuInit") ||
+          !load_function(cuda_functions.cuD3D11GetDevice, "cuD3D11GetDevice") ||
+          !load_function(cuda_functions.cuCtxCreate, "cuCtxCreate_v2") ||
+          !load_function(cuda_functions.cuCtxDestroy, "cuCtxDestroy_v2") ||
+          !load_function(cuda_functions.cuCtxPushCurrent, "cuCtxPushCurrent_v2") ||
+          !load_function(cuda_functions.cuCtxPopCurrent, "cuCtxPopCurrent_v2") ||
+          !load_function(cuda_functions.cuMemAllocPitch, "cuMemAllocPitch_v2") ||
+          !load_function(cuda_functions.cuMemFree, "cuMemFree_v2") ||
+          !load_function(cuda_functions.cuGraphicsD3D11RegisterResource, "cuGraphicsD3D11RegisterResource") ||
+          !load_function(cuda_functions.cuGraphicsUnregisterResource, "cuGraphicsUnregisterResource") ||
+          !load_function(cuda_functions.cuGraphicsMapResources, "cuGraphicsMapResources") ||
+          !load_function(cuda_functions.cuGraphicsUnmapResources, "cuGraphicsUnmapResources") ||
+          !load_function(cuda_functions.cuGraphicsSubResourceGetMappedArray, "cuGraphicsSubResourceGetMappedArray") ||
+          !load_function(cuda_functions.cuMemcpy2D, "cuMemcpy2D_v2")) {
+        BOOST_LOG(debug) << "Missing CUDA functions in " << dll_name;
+        FreeLibrary(cuda_functions.dll);
+        cuda_functions = {};
+      }
+    }
+    else {
+      BOOST_LOG(debug) << "Couldn't load CUDA dynamic library " << dll_name;
+    }
+
+    // TODO: 
document
+    static bool cuda_initialized = false;
+    if (!cuda_initialized &&
+        cuda_functions.dll &&
+        cuda_functions.cuInit(0) == CUDA_SUCCESS) {
+      cuda_initialized = true;
+    }
+
+    if (cuda_functions.dll && cuda_initialized) {
+      IDXGIDevicePtr dxgi_device;
+      IDXGIAdapterPtr dxgi_adapter;
+      CUdevice cuda_device;
+      if (d3d_device &&
+          SUCCEEDED(d3d_device->QueryInterface(IID_PPV_ARGS(&dxgi_device))) &&
+          SUCCEEDED(dxgi_device->GetAdapter(&dxgi_adapter)) &&
+          cuda_functions.cuD3D11GetDevice(&cuda_device, dxgi_adapter) == CUDA_SUCCESS &&
+          cuda_functions.cuCtxCreate(&cuda_context, 0, cuda_device) == CUDA_SUCCESS) {
+        device = cuda_context;
+      }
+    }
+
+    return device != nullptr;
+  }
+
+  bool
+  nvenc_d3d11_on_cuda::create_and_register_input_buffer() {
+    if (encoder_params.buffer_format != NV_ENC_BUFFER_FORMAT_YUV444_10BIT) {
+      BOOST_LOG(error) << "NvEnc: CUDA interop is expected to be used only for 10-bit 4:4:4 encoding";
+      return false;
+    }
+
+    if (!d3d_input_texture) {
+      D3D11_TEXTURE2D_DESC desc = {};
+      desc.Width = encoder_params.width;
+      desc.Height = encoder_params.height * 3;  // Planar YUV
+      desc.MipLevels = 1;
+      desc.ArraySize = 1;
+      desc.Format = dxgi_format_from_nvenc_format(encoder_params.buffer_format);
+      desc.SampleDesc.Count = 1;
+      desc.Usage = D3D11_USAGE_DEFAULT;
+      desc.BindFlags = D3D11_BIND_RENDER_TARGET;
+      if (d3d_device->CreateTexture2D(&desc, nullptr, &d3d_input_texture) != S_OK) {
+        BOOST_LOG(error) << "NvEnc: couldn't create input texture";
+        return false;
+      }
+    }
+
+    if (!cuda_d3d_input_texture) {
+      auto context = push_context(cuda_functions, cuda_context);
+      if (!context) return false;
+
+      if (cuda_functions.cuGraphicsD3D11RegisterResource(
+            &cuda_d3d_input_texture,
+            d3d_input_texture,
+            CU_GRAPHICS_REGISTER_FLAGS_NONE) != CUDA_SUCCESS) {
+        BOOST_LOG(error) << "cuGraphicsD3D11RegisterResource() failed";
+        return false;
+      }
+    }
+
+    if (!cuda_surface) {
+      auto context = push_context(cuda_functions, cuda_context);
+      if (!context) return false;
+
+      if (cuda_functions.cuMemAllocPitch(
+            &cuda_surface,
+            &cuda_surface_pitch,
+            // Planar 16-bit YUV
+            encoder_params.width * 2,
+            encoder_params.height * 3, 16) != CUDA_SUCCESS) {
+        BOOST_LOG(error) << "cuMemAllocPitch() failed";
+        // No explicit cuCtxPopCurrent() here: the push_context guard pops on scope exit, a second pop would remove a context we never pushed
+        return false;
+      };
+    }
+
+    if (!registered_input_buffer) {
+      NV_ENC_REGISTER_RESOURCE register_resource = { min_struct_version(NV_ENC_REGISTER_RESOURCE_VER, 3, 4) };
+      register_resource.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
+      register_resource.width = encoder_params.width;
+      register_resource.height = encoder_params.height;
+      register_resource.pitch = cuda_surface_pitch;
+      register_resource.resourceToRegister = (void *) cuda_surface;
+      register_resource.bufferFormat = encoder_params.buffer_format;
+      register_resource.bufferUsage = NV_ENC_INPUT_IMAGE;
+
+      if (nvenc_failed(nvenc->nvEncRegisterResource(encoder, &register_resource))) {
+        BOOST_LOG(error) << "NvEncRegisterResource failed: " << last_error_string;
+        return false;
+      }
+
+      registered_input_buffer = register_resource.registeredResource;
+    }
+
+    return true;
+  }
+
+  bool
+  nvenc_d3d11_on_cuda::synchronize_input_buffer() {
+    auto context = push_context(cuda_functions, cuda_context);
+    if (!context) return false;
+
+    if (cuda_functions.cuGraphicsMapResources(1, &cuda_d3d_input_texture, 0) != CUDA_SUCCESS) {
+      BOOST_LOG(error) << "cuGraphicsMapResources failed";
+      return false;
+    }
+
+    bool ok = true;
+    CUarray input_texture_array;
+    if (cuda_functions.cuGraphicsSubResourceGetMappedArray(&input_texture_array, cuda_d3d_input_texture, 0, 0) != CUDA_SUCCESS) {
+      BOOST_LOG(error) << "cuGraphicsSubResourceGetMappedArray failed";
+      ok = false;
+    }
+
+    if (ok) {
+      CUDA_MEMCPY2D copy_params = {};
+      copy_params.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+      copy_params.srcArray = input_texture_array;
+      copy_params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+      copy_params.dstDevice = cuda_surface;
+      copy_params.dstPitch = cuda_surface_pitch;
+      // 
Planar 16-bit YUV
+      copy_params.WidthInBytes = encoder_params.width * 2;
+      copy_params.Height = encoder_params.height * 3;
+      if (cuda_functions.cuMemcpy2D(&copy_params) != CUDA_SUCCESS) {
+        BOOST_LOG(error) << "cuMemcpy2D failed";
+        ok = false;
+      }
+    }
+
+    if (cuda_functions.cuGraphicsUnmapResources(1, &cuda_d3d_input_texture, 0) != CUDA_SUCCESS) {
+      BOOST_LOG(error) << "cuGraphicsUnmapResources failed";
+      ok = false;
+    };
+
+    return ok;
+  }
+
+}  // namespace nvenc
+#endif
diff --git a/src/nvenc/nvenc_d3d11_on_cuda.h b/src/nvenc/nvenc_d3d11_on_cuda.h
new file mode 100644
index 00000000000..c218070c6bd
--- /dev/null
+++ b/src/nvenc/nvenc_d3d11_on_cuda.h
@@ -0,0 +1,56 @@
+#pragma once
+#ifdef _WIN32
+
+  #include "nvenc_d3d11.h"
+
+  #include <ffnvcodec/dynlink_cuda.h>
+
+namespace nvenc {
+
+  class nvenc_d3d11_on_cuda final: public nvenc_d3d11 {
+  public:
+    nvenc_d3d11_on_cuda(ID3D11Device *d3d_device);
+    ~nvenc_d3d11_on_cuda();
+
+    ID3D11Texture2D *
+    get_input_texture() override;
+
+  private:
+    bool
+    init_library() override;
+
+    bool
+    create_and_register_input_buffer() override;
+
+    bool
+    synchronize_input_buffer() override;
+
+    // The NVENC library handle is the protected `dll` member of nvenc_d3d11; the CUDA library handle is cuda_functions.dll
+    const ID3D11DevicePtr d3d_device;
+    ID3D11Texture2DPtr d3d_input_texture;
+
+    struct {
+      tcuInit *cuInit;
+      tcuD3D11GetDevice *cuD3D11GetDevice;
+      tcuCtxCreate_v2 *cuCtxCreate;
+      tcuCtxDestroy_v2 *cuCtxDestroy;
+      tcuCtxPushCurrent_v2 *cuCtxPushCurrent;
+      tcuCtxPopCurrent_v2 *cuCtxPopCurrent;
+      tcuMemAllocPitch_v2 *cuMemAllocPitch;
+      tcuMemFree_v2 *cuMemFree;
+      tcuGraphicsD3D11RegisterResource *cuGraphicsD3D11RegisterResource;
+      tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
+      tcuGraphicsMapResources *cuGraphicsMapResources;
+      tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
+      tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
+      tcuMemcpy2D_v2 *cuMemcpy2D;
+      HMODULE dll;
+    } cuda_functions = {};
+    CUcontext cuda_context = nullptr;
+    CUgraphicsResource cuda_d3d_input_texture = nullptr;
+    CUdeviceptr 
cuda_surface = 0; + size_t cuda_surface_pitch = 0; + }; + +} // namespace nvenc +#endif diff --git a/src/nvenc/nvenc_utils.cpp b/src/nvenc/nvenc_utils.cpp index 1b8b7ec9f10..3199c9972a0 100644 --- a/src/nvenc/nvenc_utils.cpp +++ b/src/nvenc/nvenc_utils.cpp @@ -14,6 +14,12 @@ namespace nvenc { case NV_ENC_BUFFER_FORMAT_NV12: return DXGI_FORMAT_NV12; + case NV_ENC_BUFFER_FORMAT_AYUV: + return DXGI_FORMAT_AYUV; + + case NV_ENC_BUFFER_FORMAT_YUV444_10BIT: + return DXGI_FORMAT_R16_UINT; + default: return DXGI_FORMAT_UNKNOWN; } @@ -29,6 +35,12 @@ namespace nvenc { case platf::pix_fmt_e::p010: return NV_ENC_BUFFER_FORMAT_YUV420_10BIT; + case platf::pix_fmt_e::ayuv: + return NV_ENC_BUFFER_FORMAT_AYUV; + + case platf::pix_fmt_e::yuv444p16: + return NV_ENC_BUFFER_FORMAT_YUV444_10BIT; + default: return NV_ENC_BUFFER_FORMAT_UNDEFINED; } diff --git a/src/nvhttp.cpp b/src/nvhttp.cpp index 70ca9bc79be..61c4e21d6a1 100644 --- a/src/nvhttp.cpp +++ b/src/nvhttp.cpp @@ -697,12 +697,20 @@ namespace nvhttp { if (video::active_hevc_mode >= 3) { codec_mode_flags |= SCM_HEVC_MAIN10; } + if (video::active_hevc_mode >= 4) { + codec_mode_flags |= SCM_HEVC_REXT8_444; + codec_mode_flags |= SCM_HEVC_REXT10_444; + } if (video::active_av1_mode >= 2) { codec_mode_flags |= SCM_AV1_MAIN8; } if (video::active_av1_mode >= 3) { codec_mode_flags |= SCM_AV1_MAIN10; } + if (video::active_av1_mode >= 4) { + codec_mode_flags |= SCM_AV1_HIGH8_444; + codec_mode_flags |= SCM_AV1_HIGH10_444; + } tree.put("root.ServerCodecModeSupport", codec_mode_flags); pt::ptree display_nodes; @@ -763,7 +771,7 @@ namespace nvhttp { for (auto &proc : proc::proc.get_apps()) { pt::ptree app; - app.put("IsHdrSupported"s, video::active_hevc_mode == 3 ? 1 : 0); + app.put("IsHdrSupported"s, video::active_hevc_mode >= 3 ? 
1 : 0); app.put("AppTitle"s, proc.name); app.put("ID", proc.id); diff --git a/src/platform/common.h b/src/platform/common.h index 007f7ece61b..04345f8af20 100644 --- a/src/platform/common.h +++ b/src/platform/common.h @@ -200,6 +200,9 @@ namespace platf { yuv420p10, nv12, p010, + ayuv, + yuv444p16, + y410, unknown }; @@ -214,6 +217,9 @@ namespace platf { _CONVERT(yuv420p10); _CONVERT(nv12); _CONVERT(p010); + _CONVERT(ayuv); + _CONVERT(yuv444p16); + _CONVERT(y410); _CONVERT(unknown); } #undef _CONVERT diff --git a/src/platform/windows/display_vram.cpp b/src/platform/windows/display_vram.cpp index 4aa1800ba45..da3e93d1be0 100644 --- a/src/platform/windows/display_vram.cpp +++ b/src/platform/windows/display_vram.cpp @@ -17,7 +17,8 @@ extern "C" { #include "src/config.h" #include "src/logging.h" #include "src/nvenc/nvenc_config.h" -#include "src/nvenc/nvenc_d3d11.h" +#include "src/nvenc/nvenc_d3d11_native.h" +#include "src/nvenc/nvenc_d3d11_on_cuda.h" #include "src/nvenc/nvenc_utils.h" #include "src/video.h" @@ -110,6 +111,16 @@ namespace platf::dxgi { blob_t convert_yuv420_planar_y_ps_linear_hlsl; blob_t convert_yuv420_planar_y_ps_perceptual_quantizer_hlsl; blob_t convert_yuv420_planar_y_vs_hlsl; + blob_t convert_yuv444_packed_ayuv_ps_hlsl; + blob_t convert_yuv444_packed_ayuv_ps_linear_hlsl; + blob_t convert_yuv444_packed_vs_hlsl; + blob_t convert_yuv444_planar_ps_hlsl; + blob_t convert_yuv444_planar_ps_linear_hlsl; + blob_t convert_yuv444_planar_ps_perceptual_quantizer_hlsl; + blob_t convert_yuv444_packed_y410_ps_hlsl; + blob_t convert_yuv444_packed_y410_ps_linear_hlsl; + blob_t convert_yuv444_packed_y410_ps_perceptual_quantizer_hlsl; + blob_t convert_yuv444_planar_vs_hlsl; blob_t cursor_ps_hlsl; blob_t cursor_ps_normalize_white_hlsl; blob_t cursor_vs_hlsl; @@ -402,18 +413,62 @@ namespace platf::dxgi { return -1; } - device_ctx->OMSetRenderTargets(1, &nv12_Y_rt, nullptr); - device_ctx->VSSetShader(scene_vs.get(), nullptr, 0); - device_ctx->PSSetShader(img.format == 
DXGI_FORMAT_R16G16B16A16_FLOAT ? convert_Y_fp16_ps.get() : convert_Y_ps.get(), nullptr, 0); - device_ctx->RSSetViewports(1, &outY_view); - device_ctx->PSSetShaderResources(0, 1, &img_ctx.encoder_input_res); - device_ctx->Draw(3, 0); + if (format == DXGI_FORMAT_AYUV || format == DXGI_FORMAT_Y410) { + device_ctx->OMSetRenderTargets(1, &yuv444_rt, nullptr); + device_ctx->VSSetShader(convert_YUV_vs.get(), nullptr, 0); + device_ctx->PSSetShader(img.format == DXGI_FORMAT_R16G16B16A16_FLOAT ? convert_YUV_fp16_ps.get() : convert_YUV_ps.get(), nullptr, 0); + + if (!rt_cleared) { + device_ctx->RSSetViewports(1, &outY_view_for_clear); + auto black = create_black_texture_for_render_target_clear(); + if (black) { + device_ctx->PSSetShaderResources(0, 1, &black); + device_ctx->Draw(3, 0); + } + rt_cleared = true; + } - device_ctx->OMSetRenderTargets(1, &nv12_UV_rt, nullptr); - device_ctx->VSSetShader(convert_UV_vs.get(), nullptr, 0); - device_ctx->PSSetShader(img.format == DXGI_FORMAT_R16G16B16A16_FLOAT ? convert_UV_fp16_ps.get() : convert_UV_ps.get(), nullptr, 0); - device_ctx->RSSetViewports(1, &outUV_view); - device_ctx->Draw(3, 0); + device_ctx->RSSetViewports(1, &outY_view); + device_ctx->PSSetShaderResources(0, 1, &img_ctx.encoder_input_res); + device_ctx->Draw(3, 0); + } + else if (format == DXGI_FORMAT_R16_UINT) { + device_ctx->OMSetRenderTargets(1, &yuv444_rt, nullptr); + device_ctx->VSSetShader(convert_YUV_vs.get(), nullptr, 0); + device_ctx->PSSetShader(img.format == DXGI_FORMAT_R16G16B16A16_FLOAT ? 
convert_YUV_fp16_ps.get() : convert_YUV_ps.get(), nullptr, 0); + + if (!rt_cleared) { + assert(outYUV_views_for_clear.size() == 3); + device_ctx->RSSetViewports(outYUV_views_for_clear.size(), outYUV_views_for_clear.data()); + auto black = create_black_texture_for_render_target_clear(); + if (black) { + device_ctx->PSSetShaderResources(0, 1, &black); + device_ctx->Draw(9, 0); // vertex shader spreads 9 vertices across 3 viewports + } + rt_cleared = true; + } + + assert(outYUV_views.size() == 3); + device_ctx->RSSetViewports(outYUV_views.size(), outYUV_views.data()); + device_ctx->PSSetShaderResources(0, 1, &img_ctx.encoder_input_res); + device_ctx->Draw(9, 0); // vertex shader spreads 9 vertices across 3 viewports + } + else { + assert(rt_cleared); + + device_ctx->OMSetRenderTargets(1, &nv12_Y_rt, nullptr); + device_ctx->VSSetShader(scene_vs.get(), nullptr, 0); + device_ctx->PSSetShader(img.format == DXGI_FORMAT_R16G16B16A16_FLOAT ? convert_Y_fp16_ps.get() : convert_Y_ps.get(), nullptr, 0); + device_ctx->RSSetViewports(1, &outY_view); + device_ctx->PSSetShaderResources(0, 1, &img_ctx.encoder_input_res); + device_ctx->Draw(3, 0); + + device_ctx->OMSetRenderTargets(1, &nv12_UV_rt, nullptr); + device_ctx->VSSetShader(convert_UV_vs.get(), nullptr, 0); + device_ctx->PSSetShader(img.format == DXGI_FORMAT_R16G16B16A16_FLOAT ? 
convert_UV_fp16_ps.get() : convert_UV_ps.get(), nullptr, 0); + device_ctx->RSSetViewports(1, &outUV_view); + device_ctx->Draw(3, 0); + } // Release encoder mutex to allow capture code to reuse this image img_ctx.encoder_mutex->ReleaseSync(0); @@ -434,12 +489,20 @@ namespace platf::dxgi { return; } - auto color_matrix = make_buffer(device.get(), *color_vectors); + auto modified_color_vectors = *color_vectors; + if (format == DXGI_FORMAT_AYUV || + format == DXGI_FORMAT_R16_UINT || + format == DXGI_FORMAT_Y410) { + modified_color_vectors = ::video::new_color_vectors_from_colorspace(colorspace); + } + + auto color_matrix = make_buffer(device.get(), modified_color_vectors); if (!color_matrix) { BOOST_LOG(warning) << "Failed to create color matrix"sv; return; } + device_ctx->VSSetConstantBuffers(3, 1, &color_matrix); device_ctx->PSSetConstantBuffers(0, 1, &color_matrix); this->color_matrix = std::move(color_matrix); } @@ -466,7 +529,14 @@ namespace platf::dxgi { auto offsetY = (out_height - out_height_f) / 2; outY_view = D3D11_VIEWPORT { offsetX, offsetY, out_width_f, out_height_f, 0.0f, 1.0f }; + outY_view_for_clear = D3D11_VIEWPORT { 0, 0, (float) out_width, (float) out_height, 0.0f, 1.0f }; outUV_view = D3D11_VIEWPORT { offsetX / 2, offsetY / 2, out_width_f / 2, out_height_f / 2, 0.0f, 1.0f }; + outYUV_views = { outY_view, outY_view, outY_view }; // Planar YUV + outYUV_views[1].TopLeftY += out_height; // U plane + outYUV_views[2].TopLeftY += 2 * out_height; // V plane + outYUV_views_for_clear = { outY_view_for_clear, outY_view_for_clear, outY_view_for_clear }; + outYUV_views_for_clear[1].TopLeftY += out_height; + outYUV_views_for_clear[2].TopLeftY += 2 * out_height; float subsample_offset_in[16 / sizeof(float)] { 1.0f / (float) out_width_f, 1.0f / (float) out_height_f }; // aligned to 16-byte subsample_offset = make_buffer(device.get(), subsample_offset_in); @@ -488,30 +558,69 @@ namespace platf::dxgi { device_ctx->VSSetConstantBuffers(1, 1, &rotation); } - 
D3D11_RENDER_TARGET_VIEW_DESC nv12_rt_desc { - format == DXGI_FORMAT_P010 ? DXGI_FORMAT_R16_UNORM : DXGI_FORMAT_R8_UNORM, - D3D11_RTV_DIMENSION_TEXTURE2D - }; + if (format == DXGI_FORMAT_AYUV) { + D3D11_RENDER_TARGET_VIEW_DESC rt_desc = {}; + rt_desc.Format = DXGI_FORMAT_R8G8B8A8_UINT; + rt_desc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2D; - auto status = device->CreateRenderTargetView(output_texture.get(), &nv12_rt_desc, &nv12_Y_rt); - if (FAILED(status)) { - BOOST_LOG(error) << "Failed to create render target view [0x"sv << util::hex(status).to_string_view() << ']'; - return -1; + auto status = device->CreateRenderTargetView(output_texture.get(), &rt_desc, &yuv444_rt); + if (FAILED(status)) { + BOOST_LOG(error) << "Failed to create render target view [0x"sv << util::hex(status).to_string_view() << ']'; + return -1; + } + rt_cleared = false; // can't use ClearRenderTargetView(), will clear on first convert() } + else if (format == DXGI_FORMAT_R16_UINT) { + D3D11_RENDER_TARGET_VIEW_DESC rt_desc = {}; + rt_desc.Format = DXGI_FORMAT_R16_UINT; + rt_desc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2D; - nv12_rt_desc.Format = (format == DXGI_FORMAT_P010) ? 
DXGI_FORMAT_R16G16_UNORM : DXGI_FORMAT_R8G8_UNORM; + auto status = device->CreateRenderTargetView(output_texture.get(), &rt_desc, &yuv444_rt); + if (FAILED(status)) { + BOOST_LOG(error) << "Failed to create render target view [0x"sv << util::hex(status).to_string_view() << ']'; + return -1; + } + rt_cleared = false; // can't use ClearRenderTargetView(), will clear on first convert() + } + else if (format == DXGI_FORMAT_Y410) { + D3D11_RENDER_TARGET_VIEW_DESC rt_desc = {}; + rt_desc.Format = DXGI_FORMAT_R10G10B10A2_UINT; + rt_desc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2D; - status = device->CreateRenderTargetView(output_texture.get(), &nv12_rt_desc, &nv12_UV_rt); - if (FAILED(status)) { - BOOST_LOG(error) << "Failed to create render target view [0x"sv << util::hex(status).to_string_view() << ']'; - return -1; + auto status = device->CreateRenderTargetView(output_texture.get(), &rt_desc, &yuv444_rt); + if (FAILED(status)) { + BOOST_LOG(error) << "Failed to create render target view [0x"sv << util::hex(status).to_string_view() << ']'; + return -1; + } + rt_cleared = false; // can't use ClearRenderTargetView(), will clear on first convert() } + else { + D3D11_RENDER_TARGET_VIEW_DESC nv12_rt_desc { + format == DXGI_FORMAT_P010 ? DXGI_FORMAT_R16_UNORM : DXGI_FORMAT_R8_UNORM, + D3D11_RTV_DIMENSION_TEXTURE2D + }; + + auto status = device->CreateRenderTargetView(output_texture.get(), &nv12_rt_desc, &nv12_Y_rt); + if (FAILED(status)) { + BOOST_LOG(error) << "Failed to create render target view [0x"sv << util::hex(status).to_string_view() << ']'; + return -1; + } + + nv12_rt_desc.Format = (format == DXGI_FORMAT_P010) ? 
DXGI_FORMAT_R16G16_UNORM : DXGI_FORMAT_R8G8_UNORM; - // Clear the RTVs to ensure the aspect ratio padding is black - const float y_black[] = { 0.0f, 0.0f, 0.0f, 0.0f }; - device_ctx->ClearRenderTargetView(nv12_Y_rt.get(), y_black); - const float uv_black[] = { 0.5f, 0.5f, 0.5f, 0.5f }; - device_ctx->ClearRenderTargetView(nv12_UV_rt.get(), uv_black); + status = device->CreateRenderTargetView(output_texture.get(), &nv12_rt_desc, &nv12_UV_rt); + if (FAILED(status)) { + BOOST_LOG(error) << "Failed to create render target view [0x"sv << util::hex(status).to_string_view() << ']'; + return -1; + } + + // Clear the RTVs to ensure the aspect ratio padding is black + const float y_black[] = { 0.0f, 0.0f, 0.0f, 0.0f }; + device_ctx->ClearRenderTargetView(nv12_Y_rt.get(), y_black); + const float uv_black[] = { 0.5f, 0.5f, 0.5f, 0.5f }; + device_ctx->ClearRenderTargetView(nv12_UV_rt.get(), uv_black); + rt_cleared = true; + } return 0; } @@ -556,61 +665,91 @@ namespace platf::dxgi { BOOST_LOG(warning) << "Failed to increase encoding GPU thread priority. Please run application as administrator for optimal performance."; } - format = (pix_fmt == pix_fmt_e::nv12 ? 
DXGI_FORMAT_NV12 : DXGI_FORMAT_P010); - status = device->CreateVertexShader(convert_yuv420_planar_y_vs_hlsl->GetBufferPointer(), convert_yuv420_planar_y_vs_hlsl->GetBufferSize(), nullptr, &scene_vs); - if (status) { - BOOST_LOG(error) << "Failed to create scene vertex shader [0x"sv << util::hex(status).to_string_view() << ']'; - return -1; - } + switch (pix_fmt) { + case pix_fmt_e::nv12: + format = DXGI_FORMAT_NV12; + break; - status = device->CreateVertexShader(convert_yuv420_packed_uv_type0_vs_hlsl->GetBufferPointer(), convert_yuv420_packed_uv_type0_vs_hlsl->GetBufferSize(), nullptr, &convert_UV_vs); - if (status) { - BOOST_LOG(error) << "Failed to create convertUV vertex shader [0x"sv << util::hex(status).to_string_view() << ']'; - return -1; + case pix_fmt_e::p010: + format = DXGI_FORMAT_P010; + break; + + case pix_fmt_e::ayuv: + format = DXGI_FORMAT_AYUV; + break; + + case pix_fmt_e::yuv444p16: + format = DXGI_FORMAT_R16_UINT; + break; + + case pix_fmt_e::y410: + format = DXGI_FORMAT_Y410; + break; + + default: + BOOST_LOG(error) << "DXGI backend doesn't support requested pixel format"; + return -1; } +#define create_vertex_shader_helper(x, y) \ + if ((status = device->CreateVertexShader(x->GetBufferPointer(), x->GetBufferSize(), nullptr, &y))) { \ + BOOST_LOG(error) << "Failed to create vertex shader " << #x << " [0x" << util::hex(status).to_string_view() << "]"; \ + return -1; \ + } +#define create_pixel_shader_helper(x, y) \ + if ((status = device->CreatePixelShader(x->GetBufferPointer(), x->GetBufferSize(), nullptr, &y))) { \ + BOOST_LOG(error) << "Failed to create pixel shader " << #x << " [0x" << util::hex(status).to_string_view() << "]"; \ + return -1; \ + } + + create_vertex_shader_helper(convert_yuv420_planar_y_vs_hlsl, scene_vs); + create_vertex_shader_helper(convert_yuv420_packed_uv_type0_vs_hlsl, convert_UV_vs); + create_pixel_shader_helper(convert_yuv420_planar_y_ps_hlsl, convert_Y_ps); + 
create_pixel_shader_helper(convert_yuv420_packed_uv_type0_ps_hlsl, convert_UV_ps); + // If the display is in HDR and we're streaming HDR, we'll be converting scRGB to SMPTE 2084 PQ. if (format == DXGI_FORMAT_P010 && display->is_hdr()) { - status = device->CreatePixelShader(convert_yuv420_planar_y_ps_perceptual_quantizer_hlsl->GetBufferPointer(), convert_yuv420_planar_y_ps_perceptual_quantizer_hlsl->GetBufferSize(), nullptr, &convert_Y_fp16_ps); - if (status) { - BOOST_LOG(error) << "Failed to create convertY pixel shader [0x"sv << util::hex(status).to_string_view() << ']'; - return -1; - } - - status = device->CreatePixelShader(convert_yuv420_packed_uv_type0_ps_perceptual_quantizer_hlsl->GetBufferPointer(), convert_yuv420_packed_uv_type0_ps_perceptual_quantizer_hlsl->GetBufferSize(), nullptr, &convert_UV_fp16_ps); - if (status) { - BOOST_LOG(error) << "Failed to create convertUV pixel shader [0x"sv << util::hex(status).to_string_view() << ']'; - return -1; - } + create_pixel_shader_helper(convert_yuv420_planar_y_ps_perceptual_quantizer_hlsl, convert_Y_fp16_ps); + create_pixel_shader_helper(convert_yuv420_packed_uv_type0_ps_perceptual_quantizer_hlsl, convert_UV_fp16_ps); } else { // If the display is in Advanced Color mode, the desktop format will be scRGB FP16. // scRGB uses linear gamma, so we must use our linear to sRGB conversion shaders. 
- status = device->CreatePixelShader(convert_yuv420_planar_y_ps_linear_hlsl->GetBufferPointer(), convert_yuv420_planar_y_ps_linear_hlsl->GetBufferSize(), nullptr, &convert_Y_fp16_ps); - if (status) { - BOOST_LOG(error) << "Failed to create convertY pixel shader [0x"sv << util::hex(status).to_string_view() << ']'; - return -1; - } + create_pixel_shader_helper(convert_yuv420_planar_y_ps_linear_hlsl, convert_Y_fp16_ps); + create_pixel_shader_helper(convert_yuv420_packed_uv_type0_ps_linear_hlsl, convert_UV_fp16_ps); + } - status = device->CreatePixelShader(convert_yuv420_packed_uv_type0_ps_linear_hlsl->GetBufferPointer(), convert_yuv420_packed_uv_type0_ps_linear_hlsl->GetBufferSize(), nullptr, &convert_UV_fp16_ps); - if (status) { - BOOST_LOG(error) << "Failed to create convertUV pixel shader [0x"sv << util::hex(status).to_string_view() << ']'; - return -1; + if (format == DXGI_FORMAT_R16_UINT) { + // Planar 16-bit YUV 4:4:4, 10 most significant bits store the value + create_vertex_shader_helper(convert_yuv444_planar_vs_hlsl, convert_YUV_vs); + create_pixel_shader_helper(convert_yuv444_planar_ps_hlsl, convert_YUV_ps); + if (display->is_hdr()) { + create_pixel_shader_helper(convert_yuv444_planar_ps_perceptual_quantizer_hlsl, convert_YUV_fp16_ps); + } + else { + create_pixel_shader_helper(convert_yuv444_planar_ps_linear_hlsl, convert_YUV_fp16_ps); } } - - // These shaders consume standard 8-bit sRGB input - status = device->CreatePixelShader(convert_yuv420_planar_y_ps_hlsl->GetBufferPointer(), convert_yuv420_planar_y_ps_hlsl->GetBufferSize(), nullptr, &convert_Y_ps); - if (status) { - BOOST_LOG(error) << "Failed to create convertY pixel shader [0x"sv << util::hex(status).to_string_view() << ']'; - return -1; + else if (format == DXGI_FORMAT_AYUV) { + // Packed 8-bit YUV 4:4:4 + create_vertex_shader_helper(convert_yuv444_packed_vs_hlsl, convert_YUV_vs); + create_pixel_shader_helper(convert_yuv444_packed_ayuv_ps_hlsl, convert_YUV_ps); + 
create_pixel_shader_helper(convert_yuv444_packed_ayuv_ps_linear_hlsl, convert_YUV_fp16_ps); + } + else if (format == DXGI_FORMAT_Y410) { + // Packed 10-bit YUV 4:4:4 + create_vertex_shader_helper(convert_yuv444_packed_vs_hlsl, convert_YUV_vs); + create_pixel_shader_helper(convert_yuv444_packed_y410_ps_hlsl, convert_YUV_ps); + if (display->is_hdr()) { + create_pixel_shader_helper(convert_yuv444_packed_y410_ps_perceptual_quantizer_hlsl, convert_YUV_fp16_ps); + } + else { + create_pixel_shader_helper(convert_yuv444_packed_y410_ps_linear_hlsl, convert_YUV_fp16_ps); + } } - status = device->CreatePixelShader(convert_yuv420_packed_uv_type0_ps_hlsl->GetBufferPointer(), convert_yuv420_packed_uv_type0_ps_hlsl->GetBufferSize(), nullptr, &convert_UV_ps); - if (status) { - BOOST_LOG(error) << "Failed to create convertUV pixel shader [0x"sv << util::hex(status).to_string_view() << ']'; - return -1; - } +#undef create_vertex_shader_helper +#undef create_pixel_shader_helper auto default_color_vectors = ::video::color_vectors_from_colorspace(::video::colorspace_e::rec601, false); if (!default_color_vectors) { @@ -623,6 +762,7 @@ namespace platf::dxgi { BOOST_LOG(error) << "Failed to create color matrix buffer"sv; return -1; } + device_ctx->VSSetConstantBuffers(3, 1, &color_matrix); device_ctx->PSSetConstantBuffers(0, 1, &color_matrix); this->display = std::dynamic_pointer_cast(display); @@ -653,7 +793,7 @@ namespace platf::dxgi { device_ctx->OMSetBlendState(blend_disable.get(), nullptr, 0xFFFFFFFFu); device_ctx->PSSetSamplers(0, 1, &sampler_linear); - device_ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP); + device_ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST); return 0; } @@ -725,6 +865,41 @@ namespace platf::dxgi { return 0; } + shader_res_t + create_black_texture_for_render_target_clear() { + constexpr auto width = 32; + constexpr auto height = 32; + + D3D11_TEXTURE2D_DESC texture_desc = {}; + texture_desc.Width = width; + 
texture_desc.Height = height; + texture_desc.MipLevels = 1; + texture_desc.ArraySize = 1; + texture_desc.SampleDesc.Count = 1; + texture_desc.Usage = D3D11_USAGE_IMMUTABLE; + texture_desc.Format = DXGI_FORMAT_B8G8R8A8_UNORM; + texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + + std::array mem = {}; + D3D11_SUBRESOURCE_DATA texture_data = { mem.data(), 4 * width, 0 }; + + texture2d_t texture; + auto status = device->CreateTexture2D(&texture_desc, &texture_data, &texture); + if (FAILED(status)) { + BOOST_LOG(error) << "Failed to create black texture [0x" << util::hex(status).to_string_view() << "]"; + return {}; + } + + shader_res_t resource_view; + status = device->CreateShaderResourceView(texture.get(), nullptr, &resource_view); + if (FAILED(status)) { + BOOST_LOG(error) << "Failed to create black texture resource view [0x" << util::hex(status).to_string_view() << "]"; + return {}; + } + + return resource_view; + } + ::video::color_t *color_p; buf_t subsample_offset; @@ -733,8 +908,10 @@ namespace platf::dxgi { blend_t blend_disable; sampler_state_t sampler_linear; + render_target_t yuv444_rt; render_target_t nv12_Y_rt; render_target_t nv12_UV_rt; + bool rt_cleared = false; // d3d_img_t::id -> encoder_img_ctx_t // These store the encoder textures for each img_t that passes through @@ -749,10 +926,14 @@ namespace platf::dxgi { ps_t convert_UV_fp16_ps; ps_t convert_Y_ps; ps_t convert_Y_fp16_ps; + vs_t convert_YUV_vs; + ps_t convert_YUV_ps; + ps_t convert_YUV_fp16_ps; vs_t scene_vs; - D3D11_VIEWPORT outY_view; + D3D11_VIEWPORT outY_view, outY_view_for_clear; D3D11_VIEWPORT outUV_view; + std::array outYUV_views, outYUV_views_for_clear; DXGI_FORMAT format; @@ -871,7 +1052,12 @@ namespace platf::dxgi { if (base.init(display, adapter_p, pix_fmt)) return false; - nvenc_d3d = std::make_unique(base.device.get()); + if (pix_fmt == pix_fmt_e::yuv444p16) { + nvenc_d3d = std::make_unique(base.device.get()); + } + else { + nvenc_d3d = std::make_unique(base.device.get()); + } 
nvenc = nvenc_d3d.get(); return true; @@ -1405,7 +1591,7 @@ namespace platf::dxgi { device_ctx->OMSetBlendState(blend_disable.get(), nullptr, 0xFFFFFFFFu); device_ctx->PSSetSamplers(0, 1, &sampler_linear); - device_ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP); + device_ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST); return 0; } @@ -1665,6 +1851,16 @@ namespace platf::dxgi { compile_pixel_shader_helper(convert_yuv420_planar_y_ps_linear); compile_pixel_shader_helper(convert_yuv420_planar_y_ps_perceptual_quantizer); compile_vertex_shader_helper(convert_yuv420_planar_y_vs); + compile_pixel_shader_helper(convert_yuv444_packed_ayuv_ps); + compile_pixel_shader_helper(convert_yuv444_packed_ayuv_ps_linear); + compile_vertex_shader_helper(convert_yuv444_packed_vs); + compile_pixel_shader_helper(convert_yuv444_planar_ps); + compile_pixel_shader_helper(convert_yuv444_planar_ps_linear); + compile_pixel_shader_helper(convert_yuv444_planar_ps_perceptual_quantizer); + compile_pixel_shader_helper(convert_yuv444_packed_y410_ps); + compile_pixel_shader_helper(convert_yuv444_packed_y410_ps_linear); + compile_pixel_shader_helper(convert_yuv444_packed_y410_ps_perceptual_quantizer); + compile_vertex_shader_helper(convert_yuv444_planar_vs); compile_pixel_shader_helper(cursor_ps); compile_pixel_shader_helper(cursor_ps_normalize_white); compile_vertex_shader_helper(cursor_vs); diff --git a/src/rtsp.cpp b/src/rtsp.cpp index 99b9f0de8f8..d9e8b85c80a 100644 --- a/src/rtsp.cpp +++ b/src/rtsp.cpp @@ -978,6 +978,7 @@ namespace rtsp_stream { args.try_emplace("x-nv-aqos.qosTrafficType"sv, "4"sv); args.try_emplace("x-ml-video.configuredBitrateKbps"sv, "0"sv); args.try_emplace("x-ss-general.encryptionEnabled"sv, "0"sv); + args.try_emplace("x-ss-video[0].chromaSamplingType"sv, "0"sv); stream::config_t config; @@ -1013,6 +1014,7 @@ namespace rtsp_stream { config.monitor.encoderCscMode = util::from_view(args.at("x-nv-video[0].encoderCscMode"sv)); 
config.monitor.videoFormat = util::from_view(args.at("x-nv-vqos[0].bitStreamFormat"sv)); config.monitor.dynamicRange = util::from_view(args.at("x-nv-video[0].dynamicRangeMode"sv)); + config.monitor.chromaSamplingType = util::from_view(args.at("x-ss-video[0].chromaSamplingType"sv)); configuredBitrateKbps = util::from_view(args.at("x-ml-video.configuredBitrateKbps"sv)); } diff --git a/src/video.cpp b/src/video.cpp index 12f8cb41729..0c40abf29d8 100644 --- a/src/video.cpp +++ b/src/video.cpp @@ -274,6 +274,7 @@ namespace video { NO_RC_BUF_LIMIT = 1 << 7, // Don't set rc_buffer_size REF_FRAMES_INVALIDATION = 1 << 8, // Support reference frames invalidation ALWAYS_REPROBE = 1 << 9, // This is an encoder of last resort and we want to aggressively probe for a better one + YUV444_SUPPORT = 1 << 10, // Encoder may support 4:4:4 chroma sampling depending on hardware }; class avcodec_encode_session_t: public encode_session_t { @@ -447,7 +448,8 @@ namespace video { "nvenc"sv, std::make_unique( platf::mem_type_e::dxgi, - platf::pix_fmt_e::nv12, platf::pix_fmt_e::p010), + platf::pix_fmt_e::nv12, platf::pix_fmt_e::p010, + platf::pix_fmt_e::ayuv, platf::pix_fmt_e::yuv444p16), { // Common options {}, @@ -484,7 +486,7 @@ namespace video { std::nullopt, // QP rate control fallback "h264_nvenc"s, }, - PARALLEL_ENCODING | REF_FRAMES_INVALIDATION // flags + PARALLEL_ENCODING | REF_FRAMES_INVALIDATION | YUV444_SUPPORT // flags }; #elif !defined(__APPLE__) encoder_t nvenc { @@ -498,6 +500,7 @@ namespace video { AV_PIX_FMT_CUDA, #endif AV_PIX_FMT_NV12, AV_PIX_FMT_P010, + AV_PIX_FMT_NONE, AV_PIX_FMT_NONE, #ifdef _WIN32 dxgi_init_avcodec_hardware_input_buffer #else @@ -581,6 +584,7 @@ namespace video { AV_HWDEVICE_TYPE_D3D11VA, AV_HWDEVICE_TYPE_QSV, AV_PIX_FMT_QSV, AV_PIX_FMT_NV12, AV_PIX_FMT_P010, + AV_PIX_FMT_VUYX, AV_PIX_FMT_XV30, dxgi_init_avcodec_hardware_input_buffer), { // Common options @@ -653,7 +657,7 @@ namespace video { std::nullopt, // QP rate control fallback "h264_qsv"s, }, - 
PARALLEL_ENCODING | CBR_WITH_VBR | RELAXED_COMPLIANCE | NO_RC_BUF_LIMIT + PARALLEL_ENCODING | CBR_WITH_VBR | RELAXED_COMPLIANCE | NO_RC_BUF_LIMIT | YUV444_SUPPORT }; encoder_t amdvce { @@ -662,6 +666,7 @@ namespace video { AV_HWDEVICE_TYPE_D3D11VA, AV_HWDEVICE_TYPE_NONE, AV_PIX_FMT_D3D11, AV_PIX_FMT_NV12, AV_PIX_FMT_P010, + AV_PIX_FMT_NONE, AV_PIX_FMT_NONE, dxgi_init_avcodec_hardware_input_buffer), { // Common options @@ -737,6 +742,7 @@ namespace video { AV_HWDEVICE_TYPE_NONE, AV_HWDEVICE_TYPE_NONE, AV_PIX_FMT_NONE, AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV420P10, + AV_PIX_FMT_NONE, AV_PIX_FMT_NONE, nullptr), { // libsvtav1 takes different presets than libx264/libx265. @@ -802,6 +808,7 @@ namespace video { AV_HWDEVICE_TYPE_VAAPI, AV_HWDEVICE_TYPE_NONE, AV_PIX_FMT_VAAPI, AV_PIX_FMT_NV12, AV_PIX_FMT_P010, + AV_PIX_FMT_NONE, AV_PIX_FMT_NONE, vaapi_init_avcodec_hardware_input_buffer), { // Common options @@ -870,6 +877,7 @@ namespace video { AV_HWDEVICE_TYPE_VIDEOTOOLBOX, AV_HWDEVICE_TYPE_NONE, AV_PIX_FMT_VIDEOTOOLBOX, AV_PIX_FMT_NV12, AV_PIX_FMT_P010, + AV_PIX_FMT_NONE, AV_PIX_FMT_NONE, vt_init_avcodec_hardware_input_buffer), { // Common options @@ -1393,6 +1401,11 @@ namespace video { return nullptr; } + if (config.chromaSamplingType > 0 && !video_format[encoder_t::YUV444]) { + BOOST_LOG(error) << video_format.name << ": YUV 4:4:4 not supported"sv; + return nullptr; + } + auto codec = avcodec_find_encoder_by_name(video_format.name.c_str()); if (!codec) { BOOST_LOG(error) << "Couldn't open ["sv << video_format.name << ']'; @@ -1418,16 +1431,25 @@ namespace video { switch (config.videoFormat) { case 0: - ctx->profile = FF_PROFILE_H264_HIGH; + // 10-bit h264 encoding is not supported by our streaming protocol + assert(!config.dynamicRange); + ctx->profile = (config.chromaSamplingType == 1) ? FF_PROFILE_H264_HIGH_444 : FF_PROFILE_H264_HIGH; break; case 1: - ctx->profile = config.dynamicRange ? 
FF_PROFILE_HEVC_MAIN_10 : FF_PROFILE_HEVC_MAIN; + if (config.chromaSamplingType == 1) { + // HEVC uses the same RExt profile for both 8 and 10 bit YUV 4:4:4 encoding + ctx->profile = FF_PROFILE_HEVC_REXT; + } + else { + ctx->profile = config.dynamicRange ? FF_PROFILE_HEVC_MAIN_10 : FF_PROFILE_HEVC_MAIN; + } break; case 2: // AV1 supports both 8 and 10 bit encoding with the same Main profile - ctx->profile = FF_PROFILE_AV1_MAIN; + // but YUV 4:4:4 sampling requires High profile + ctx->profile = (config.chromaSamplingType == 1) ? FF_PROFILE_AV1_HIGH : FF_PROFILE_AV1_MAIN; break; } @@ -1865,7 +1887,24 @@ namespace video { std::unique_ptr result; auto colorspace = colorspace_from_client_config(config, disp.is_hdr()); - auto pix_fmt = (colorspace.bit_depth == 10) ? encoder.platform_formats->pix_fmt_10bit : encoder.platform_formats->pix_fmt_8bit; + + platf::pix_fmt_e pix_fmt; + if (config.chromaSamplingType == 1) { + // YUV 4:4:4 + if (!(encoder.flags & YUV444_SUPPORT)) { + // Encoder can't support YUV 4:4:4 regardless of hardware capabilities + return {}; + } + pix_fmt = (colorspace.bit_depth == 10) ? + encoder.platform_formats->pix_fmt_yuv444_10bit : + encoder.platform_formats->pix_fmt_yuv444_8bit; + } + else { + // YUV 4:2:0 + pix_fmt = (colorspace.bit_depth == 10) ? 
+ encoder.platform_formats->pix_fmt_10bit : + encoder.platform_formats->pix_fmt_8bit; + } if (dynamic_cast(encoder.platform_formats.get())) { result = disp.make_avcodec_encode_device(pix_fmt); @@ -2290,8 +2329,8 @@ namespace video { encoder.av1.capabilities.set(); // First, test encoder viability - config_t config_max_ref_frames { 1920, 1080, 60, 1000, 1, 1, 1, 0, 0 }; - config_t config_autoselect { 1920, 1080, 60, 1000, 1, 0, 1, 0, 0 }; + config_t config_max_ref_frames { 1920, 1080, 60, 1000, 1, 1, 1, 0, 0, 0 }; + config_t config_autoselect { 1920, 1080, 60, 1000, 1, 0, 1, 0, 0, 0 }; // If the encoder isn't supported at all (not even H.264), bail early reset_display(disp, encoder.platform_formats->dev_type, config::video.output_name, config_autoselect); @@ -2410,35 +2449,48 @@ namespace video { encoder.av1.capabilities.reset(); } - std::vector> configs { - { encoder_t::DYNAMIC_RANGE, { 1920, 1080, 60, 1000, 1, 0, 3, 1, 1 } }, - }; + // Test HDR and YUV444 support + { + const config_t generic_hdr_config = { 1920, 1080, 60, 1000, 1, 0, 3, 1, 1, 0 }; + + auto test_hdr_and_yuv444 = [&](auto &flag_map, auto video_format) { + auto config = generic_hdr_config; + config.videoFormat = video_format; - for (auto &[flag, config] : configs) { - auto h264 = config; - auto hevc = config; - auto av1 = config; + flag_map[encoder_t::DYNAMIC_RANGE] = false; + flag_map[encoder_t::YUV444] = false; - h264.videoFormat = 0; - hevc.videoFormat = 1; - av1.videoFormat = 2; + if (!flag_map[encoder_t::PASSED]) return; + + // Test 4:4:4 HDR first. If 4:4:4 is supported, 4:2:0 should also be supported. 
+ if (encoder.flags & YUV444_SUPPORT) { + config.chromaSamplingType = 1; + if (validate_config(disp, encoder, config) >= 0) { + flag_map[encoder_t::DYNAMIC_RANGE] = true; + flag_map[encoder_t::YUV444] = true; + return; + } + } + + // Test 4:2:0 HDR + config.chromaSamplingType = 0; + if (validate_config(disp, encoder, config) >= 0) { + flag_map[encoder_t::DYNAMIC_RANGE] = true; + } + }; // Reset the display since we're switching from SDR to HDR - reset_display(disp, encoder.platform_formats->dev_type, config::video.output_name, config); + reset_display(disp, encoder.platform_formats->dev_type, config::video.output_name, generic_hdr_config); if (!disp) { return false; } // HDR is not supported with H.264. Don't bother even trying it. - encoder.h264[flag] = flag != encoder_t::DYNAMIC_RANGE && validate_config(disp, encoder, h264) >= 0; + encoder.h264[encoder_t::DYNAMIC_RANGE] = false; + encoder.h264[encoder_t::YUV444] = false; - if (encoder.hevc[encoder_t::PASSED]) { - encoder.hevc[flag] = validate_config(disp, encoder, hevc) >= 0; - } - - if (encoder.av1[encoder_t::PASSED]) { - encoder.av1[flag] = validate_config(disp, encoder, av1) >= 0; - } + test_hdr_and_yuv444(encoder.hevc, 1); + test_hdr_and_yuv444(encoder.av1, 2); } encoder.h264[encoder_t::VUI_PARAMETERS] = encoder.h264[encoder_t::VUI_PARAMETERS] && !config::sunshine.flags[config::flag::FORCE_VIDEO_HEADER_REPLACE]; @@ -2547,8 +2599,8 @@ namespace video { } // Skip it if it doesn't support HDR on the specified codec - if ((active_hevc_mode == 3 && !encoder->hevc[encoder_t::DYNAMIC_RANGE]) || - (active_av1_mode == 3 && !encoder->av1[encoder_t::DYNAMIC_RANGE])) { + if ((active_hevc_mode >= 3 && !encoder->hevc[encoder_t::DYNAMIC_RANGE]) || + (active_av1_mode >= 3 && !encoder->av1[encoder_t::DYNAMIC_RANGE])) { pos++; continue; } @@ -2633,12 +2685,16 @@ namespace video { BOOST_LOG(info) << "Found AV1 encoder: "sv << encoder.av1.name << " ["sv << encoder.name << ']'; } - if (active_hevc_mode == 0) { - active_hevc_mode 
= encoder.hevc[encoder_t::PASSED] ? (encoder.hevc[encoder_t::DYNAMIC_RANGE] ? 3 : 2) : 1; - } + { + auto active_mode_from_flags = [&](const auto &flags_map) { + if (!flags_map[encoder_t::PASSED]) return 1; + if (flags_map[encoder_t::DYNAMIC_RANGE] && flags_map[encoder_t::YUV444]) return 4; + if (flags_map[encoder_t::DYNAMIC_RANGE]) return 3; + return 2; + }; - if (active_av1_mode == 0) { - active_av1_mode = encoder.av1[encoder_t::PASSED] ? (encoder.av1[encoder_t::DYNAMIC_RANGE] ? 3 : 2) : 1; + if (active_hevc_mode == 0) active_hevc_mode = active_mode_from_flags(encoder.hevc); + if (active_av1_mode == 0) active_av1_mode = active_mode_from_flags(encoder.av1); } return 0; @@ -2790,6 +2846,10 @@ namespace video { platf::pix_fmt_e map_pix_fmt(AVPixelFormat fmt) { switch (fmt) { + case AV_PIX_FMT_VUYX: + return platf::pix_fmt_e::ayuv; + case AV_PIX_FMT_XV30: + return platf::pix_fmt_e::y410; case AV_PIX_FMT_YUV420P10: return platf::pix_fmt_e::yuv420p10; case AV_PIX_FMT_YUV420P: diff --git a/src/video.h b/src/video.h index ba80474669f..31288d65fe8 100644 --- a/src/video.h +++ b/src/video.h @@ -39,6 +39,7 @@ namespace video { virtual ~encoder_platform_formats_t() = default; platf::mem_type_e dev_type; platf::pix_fmt_e pix_fmt_8bit, pix_fmt_10bit; + platf::pix_fmt_e pix_fmt_yuv444_8bit, pix_fmt_yuv444_10bit; }; struct encoder_platform_formats_avcodec: encoder_platform_formats_t { @@ -50,21 +51,28 @@ namespace video { const AVPixelFormat &avcodec_dev_pix_fmt, const AVPixelFormat &avcodec_pix_fmt_8bit, const AVPixelFormat &avcodec_pix_fmt_10bit, + const AVPixelFormat &avcodec_pix_fmt_yuv444_8bit, + const AVPixelFormat &avcodec_pix_fmt_yuv444_10bit, const init_buffer_function_t &init_avcodec_hardware_input_buffer_function): avcodec_base_dev_type { avcodec_base_dev_type }, avcodec_derived_dev_type { avcodec_derived_dev_type }, avcodec_dev_pix_fmt { avcodec_dev_pix_fmt }, avcodec_pix_fmt_8bit { avcodec_pix_fmt_8bit }, avcodec_pix_fmt_10bit { avcodec_pix_fmt_10bit }, + 
avcodec_pix_fmt_yuv444_8bit { avcodec_pix_fmt_yuv444_8bit }, + avcodec_pix_fmt_yuv444_10bit { avcodec_pix_fmt_yuv444_10bit }, init_avcodec_hardware_input_buffer { init_avcodec_hardware_input_buffer_function } { dev_type = map_base_dev_type(avcodec_base_dev_type); pix_fmt_8bit = map_pix_fmt(avcodec_pix_fmt_8bit); pix_fmt_10bit = map_pix_fmt(avcodec_pix_fmt_10bit); + pix_fmt_yuv444_8bit = map_pix_fmt(avcodec_pix_fmt_yuv444_8bit); + pix_fmt_yuv444_10bit = map_pix_fmt(avcodec_pix_fmt_yuv444_10bit); } AVHWDeviceType avcodec_base_dev_type, avcodec_derived_dev_type; AVPixelFormat avcodec_dev_pix_fmt; AVPixelFormat avcodec_pix_fmt_8bit, avcodec_pix_fmt_10bit; + AVPixelFormat avcodec_pix_fmt_yuv444_8bit, avcodec_pix_fmt_yuv444_10bit; init_buffer_function_t init_avcodec_hardware_input_buffer; }; @@ -73,10 +81,14 @@ namespace video { encoder_platform_formats_nvenc( const platf::mem_type_e &dev_type, const platf::pix_fmt_e &pix_fmt_8bit, - const platf::pix_fmt_e &pix_fmt_10bit) { + const platf::pix_fmt_e &pix_fmt_10bit, + const platf::pix_fmt_e &pix_fmt_yuv444_8bit, + const platf::pix_fmt_e &pix_fmt_yuv444_10bit) { encoder_platform_formats_t::dev_type = dev_type; encoder_platform_formats_t::pix_fmt_8bit = pix_fmt_8bit; encoder_platform_formats_t::pix_fmt_10bit = pix_fmt_10bit; + encoder_platform_formats_t::pix_fmt_yuv444_8bit = pix_fmt_yuv444_8bit; + encoder_platform_formats_t::pix_fmt_yuv444_10bit = pix_fmt_yuv444_10bit; } }; @@ -87,6 +99,7 @@ namespace video { REF_FRAMES_RESTRICT, // Set maximum reference frames CBR, // Some encoders don't support CBR, if not supported --> attempt constant quantatication parameter instead DYNAMIC_RANGE, // hdr + YUV444, // 4:4:4 VUI_PARAMETERS, // AMD encoder with VAAPI doesn't add VUI parameters to SPS MAX_FLAGS }; @@ -101,6 +114,7 @@ namespace video { _CONVERT(REF_FRAMES_RESTRICT); _CONVERT(CBR); _CONVERT(DYNAMIC_RANGE); + _CONVERT(YUV444); _CONVERT(VUI_PARAMETERS); _CONVERT(MAX_FLAGS); } @@ -312,6 +326,8 @@ namespace video { /* Encoding 
color depth (bit depth): 0 - 8-bit, 1 - 10-bit HDR encoding activates when color depth is higher than 8-bit and the display which is being captured is operating in HDR mode */ int dynamicRange; + + int chromaSamplingType; // 0 - 4:2:0, 1 - 4:4:4 }; extern int active_hevc_mode; diff --git a/src/video_colorspace.cpp b/src/video_colorspace.cpp index 4f5955eed7e..d64d66c05ae 100644 --- a/src/video_colorspace.cpp +++ b/src/video_colorspace.cpp @@ -178,4 +178,28 @@ namespace video { return result; } + color_t + new_color_vectors_from_colorspace(const sunshine_colorspace_t &colorspace) { + color_t color_vectors = *color_vectors_from_colorspace(colorspace); + + color_vectors.color_vec_y[3] = 0; + color_vectors.color_vec_u[3] = 0; + color_vectors.color_vec_v[3] = 0; + + // ITU-T Recommendation H.264 (08/21), Annex E + if (colorspace.full_range) { + color_vectors.range_y[0] = (1 << colorspace.bit_depth) - 1; + color_vectors.range_y[1] = 0.5f; + color_vectors.range_uv[0] = (1 << colorspace.bit_depth) - 1; + color_vectors.range_uv[1] = (1 << (colorspace.bit_depth - 1)) + 0.5f; + } + else { + color_vectors.range_y[0] = (1 << (colorspace.bit_depth - 8)) * 219; + color_vectors.range_y[1] = (1 << (colorspace.bit_depth - 8)) * 16 + 0.5f; + color_vectors.range_uv[0] = (1 << (colorspace.bit_depth - 8)) * 224; + color_vectors.range_uv[1] = (1 << (colorspace.bit_depth - 8)) * 128 + 0.5f; + } + + return color_vectors; + } } // namespace video diff --git a/src/video_colorspace.h b/src/video_colorspace.h index 858914ce6ed..417c49a856f 100644 --- a/src/video_colorspace.h +++ b/src/video_colorspace.h @@ -53,4 +53,6 @@ namespace video { const color_t * color_vectors_from_colorspace(colorspace_e colorspace, bool full_range); + color_t + new_color_vectors_from_colorspace(const sunshine_colorspace_t &colorspace); } // namespace video diff --git a/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_ayuv_ps.hlsl 
b/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_ayuv_ps.hlsl new file mode 100644 index 00000000000..73c45e9b9cd --- /dev/null +++ b/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_ayuv_ps.hlsl @@ -0,0 +1,3 @@ +#include "include/convert_base.hlsl" + +#include "include/convert_yuv444_ps_base.hlsl" diff --git a/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_ayuv_ps_linear.hlsl b/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_ayuv_ps_linear.hlsl new file mode 100644 index 00000000000..820e5128f67 --- /dev/null +++ b/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_ayuv_ps_linear.hlsl @@ -0,0 +1,3 @@ +#include "include/convert_linear_base.hlsl" + +#include "include/convert_yuv444_ps_base.hlsl" diff --git a/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_vs.hlsl b/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_vs.hlsl new file mode 100644 index 00000000000..33e481453ed --- /dev/null +++ b/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_vs.hlsl @@ -0,0 +1,10 @@ +cbuffer rotate_texture_steps_cbuffer : register(b1) { + int rotate_texture_steps; +}; + +#include "include/base_vs.hlsl" + +vertex_t main_vs(uint vertex_id : SV_VertexID) +{ + return generate_fullscreen_triangle_vertex(vertex_id, rotate_texture_steps); +} diff --git a/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_y410_ps.hlsl b/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_y410_ps.hlsl new file mode 100644 index 00000000000..b84c661783b --- /dev/null +++ b/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_y410_ps.hlsl @@ -0,0 +1,4 @@ +#include "include/convert_base.hlsl" + +#define Y410 +#include "include/convert_yuv444_ps_base.hlsl" diff --git a/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_y410_ps_linear.hlsl b/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_y410_ps_linear.hlsl new file mode 100644 
index 00000000000..f7dbbcb6ee8 --- /dev/null +++ b/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_y410_ps_linear.hlsl @@ -0,0 +1,4 @@ +#include "include/convert_linear_base.hlsl" + +#define Y410 +#include "include/convert_yuv444_ps_base.hlsl" diff --git a/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_y410_ps_perceptual_quantizer.hlsl b/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_y410_ps_perceptual_quantizer.hlsl new file mode 100644 index 00000000000..1682be7b632 --- /dev/null +++ b/src_assets/windows/assets/shaders/directx/convert_yuv444_packed_y410_ps_perceptual_quantizer.hlsl @@ -0,0 +1,4 @@ +#include "include/convert_perceptual_quantizer_base.hlsl" + +#define Y410 +#include "include/convert_yuv444_ps_base.hlsl" diff --git a/src_assets/windows/assets/shaders/directx/convert_yuv444_planar_ps.hlsl b/src_assets/windows/assets/shaders/directx/convert_yuv444_planar_ps.hlsl new file mode 100644 index 00000000000..d6cca979e37 --- /dev/null +++ b/src_assets/windows/assets/shaders/directx/convert_yuv444_planar_ps.hlsl @@ -0,0 +1,4 @@ +#include "include/convert_base.hlsl" + +#define PLANAR_VIEWPORTS +#include "include/convert_yuv444_ps_base.hlsl" diff --git a/src_assets/windows/assets/shaders/directx/convert_yuv444_planar_ps_linear.hlsl b/src_assets/windows/assets/shaders/directx/convert_yuv444_planar_ps_linear.hlsl new file mode 100644 index 00000000000..46032651e40 --- /dev/null +++ b/src_assets/windows/assets/shaders/directx/convert_yuv444_planar_ps_linear.hlsl @@ -0,0 +1,4 @@ +#include "include/convert_linear_base.hlsl" + +#define PLANAR_VIEWPORTS +#include "include/convert_yuv444_ps_base.hlsl" diff --git a/src_assets/windows/assets/shaders/directx/convert_yuv444_planar_ps_perceptual_quantizer.hlsl b/src_assets/windows/assets/shaders/directx/convert_yuv444_planar_ps_perceptual_quantizer.hlsl new file mode 100644 index 00000000000..d390e81eb28 --- /dev/null +++ 
b/src_assets/windows/assets/shaders/directx/convert_yuv444_planar_ps_perceptual_quantizer.hlsl @@ -0,0 +1,4 @@ +#include "include/convert_perceptual_quantizer_base.hlsl" /* expected to supply CONVERT_FUNCTION for the shared pixel shader body - TODO confirm */ + +#define PLANAR_VIEWPORTS /* must precede the include below: selects the single-uint planar variant of main_ps */ +#include "include/convert_yuv444_ps_base.hlsl" diff --git a/src_assets/windows/assets/shaders/directx/convert_yuv444_planar_vs.hlsl b/src_assets/windows/assets/shaders/directx/convert_yuv444_planar_vs.hlsl new file mode 100644 index 00000000000..c001627ea7c --- /dev/null +++ b/src_assets/windows/assets/shaders/directx/convert_yuv444_planar_vs.hlsl @@ -0,0 +1,36 @@ +cbuffer rotate_texture_steps_cbuffer : register(b1) { /* constant buffer bound at slot b1 */ + int rotate_texture_steps; /* forwarded unchanged to generate_fullscreen_triangle_vertex */ +}; + +cbuffer color_matrix_cbuffer : register(b3) { /* same field list as color_matrix_cbuffer in convert_yuv444_ps_base.hlsl, declared here for the vertex stage at slot b3 */ + float4 color_vec_y; + float4 color_vec_u; + float4 color_vec_v; + float2 range_y; + float2 range_uv; +}; + +#define PLANAR_VIEWPORTS /* enables the viewport, color_vec and range members of vertex_t that main_vs writes below */ +#include "include/base_vs.hlsl" + +vertex_t main_vs(uint vertex_id : SV_VertexID) /* vertex shader entry point for the planar path: emits one fullscreen triangle per viewport index */ +{ + vertex_t output = generate_fullscreen_triangle_vertex(vertex_id % 3, rotate_texture_steps); /* vertex_id % 3 repeats the same three corners for every group of three vertices */ + + output.viewport = vertex_id / 3; /* each consecutive group of three vertices targets the next viewport index */ + + if (output.viewport == 0) { /* viewport 0 receives the Y coefficients and Y range */ + output.color_vec = color_vec_y; + output.range = range_y; + } + else if (output.viewport == 1) { /* viewport 1 receives the U coefficients and the shared UV range */ + output.color_vec = color_vec_u; + output.range = range_uv; + } + else { /* every other viewport index receives the V coefficients and the shared UV range */ + output.color_vec = color_vec_v; + output.range = range_uv; + } + + return output; +} diff --git a/src_assets/windows/assets/shaders/directx/include/base_vs.hlsl b/src_assets/windows/assets/shaders/directx/include/base_vs.hlsl index c04fad39018..c39e7c6f80b 100644 --- a/src_assets/windows/assets/shaders/directx/include/base_vs.hlsl +++ b/src_assets/windows/assets/shaders/directx/include/base_vs.hlsl @@ -19,7 +19,7 @@ vertex_t generate_fullscreen_triangle_vertex(uint vertex_id, int rotate_texture_ output.viewpoint_pos = float4(-1, 3, 0, 1); tex_coord = float2(0, -1); } - else if (vertex_id == 2) { + else { /* catch-all: any vertex_id not matched by the earlier branches produces this vertex */ output.viewpoint_pos = float4(3, -1, 0, 1); tex_coord = float2(2, 1); } diff --git 
a/src_assets/windows/assets/shaders/directx/include/base_vs_types.hlsl b/src_assets/windows/assets/shaders/directx/include/base_vs_types.hlsl index 9e4b28f18fb..958762d6b68 100644 --- a/src_assets/windows/assets/shaders/directx/include/base_vs_types.hlsl +++ b/src_assets/windows/assets/shaders/directx/include/base_vs_types.hlsl @@ -9,4 +9,9 @@ struct vertex_t #else float2 tex_coord : TEXCOORD; #endif +#ifdef PLANAR_VIEWPORTS /* extra members written by convert_yuv444_planar_vs.hlsl and read by the planar branch of convert_yuv444_ps_base.hlsl */ + uint viewport : SV_ViewportArrayIndex; /* viewport index; the planar vertex shader sets it to vertex_id / 3 */ + nointerpolation float4 color_vec : COLOR0; /* coefficient vector chosen per viewport by the vertex shader; the pixel shader reads only .xyz */ + nointerpolation float2 range : COLOR1; /* x scales and y offsets the dot product in the planar pixel shader */ +#endif }; diff --git a/src_assets/windows/assets/shaders/directx/include/convert_yuv444_ps_base.hlsl b/src_assets/windows/assets/shaders/directx/include/convert_yuv444_ps_base.hlsl new file mode 100644 index 00000000000..d0cf2f53294 --- /dev/null +++ b/src_assets/windows/assets/shaders/directx/include/convert_yuv444_ps_base.hlsl @@ -0,0 +1,45 @@ +Texture2D image : register(t0); /* source texture, sampled once per invocation of main_ps */ +SamplerState def_sampler : register(s0); + +#ifndef PLANAR_VIEWPORTS /* packed variants read the coefficients from this pixel-stage cbuffer; planar variants receive them through vertex_t */ +cbuffer color_matrix_cbuffer : register(b0) { + float4 color_vec_y; + float4 color_vec_u; + float4 color_vec_v; + float2 range_y; + float2 range_uv; +}; +#endif + +#include "include/base_vs_types.hlsl" /* declares vertex_t; PLANAR_VIEWPORTS must already be defined by the including file for the planar members to exist */ + +#ifdef PLANAR_VIEWPORTS +uint main_ps(vertex_t input) : SV_Target /* planar variant: a single unsigned value per invocation */ +#else +uint4 main_ps(vertex_t input) : SV_Target /* packed variant: four unsigned components per invocation */ +#endif +{ + float3 rgb = CONVERT_FUNCTION(image.Sample(def_sampler, input.tex_coord, 0).rgb); /* CONVERT_FUNCTION is not defined in this file; expected from the header each variant includes first - TODO confirm */ + +#ifdef PLANAR_VIEWPORTS + // Planar R16, 10 most significant bits store the value + return uint(dot(input.color_vec.xyz, rgb) * input.range.x + input.range.y) << 6; /* the uint() cast drops the fractional part before the shift; color_vec.w is not used */ +#else + float y = dot(color_vec_y.xyz, rgb); /* only .xyz of each color_vec_* is used in this branch */ + float u = dot(color_vec_u.xyz, rgb); + float v = dot(color_vec_v.xyz, rgb); + +#ifdef Y410 + return uint4(u * range_uv.x + range_uv.y, /* component order for Y410: U, Y, V, then a constant 0 */ + y * range_y.x + range_y.y, + v * range_uv.x + range_uv.y, + 0); +#else + // AYUV + return uint4(v * range_uv.x + range_uv.y, /* component order for AYUV: V, U, Y, then a constant 0 */ + u * range_uv.x + range_uv.y, + y * range_y.x + range_y.y, + 0); +#endif +#endif +} diff --git 
a/third-party/moonlight-common-c b/third-party/moonlight-common-c index cbd0ec1b25e..3010f873f58 160000 --- a/third-party/moonlight-common-c +++ b/third-party/moonlight-common-c @@ -1 +1 @@ -Subproject commit cbd0ec1b25edfb8ee8645fffa49ff95b6e04c70e +Subproject commit 3010f873f589721bb761ea95fb6cc7ff34f4e024