diff --git a/include/vkd3d.h b/include/vkd3d.h index 381968e124..904c5b53e3 100644 --- a/include/vkd3d.h +++ b/include/vkd3d.h @@ -101,6 +101,7 @@ extern "C" { #define VKD3D_CONFIG_FLAG_DISABLE_UAV_COMPRESSION (1ull << 43) #define VKD3D_CONFIG_FLAG_DISABLE_DEPTH_COMPRESSION (1ull << 44) #define VKD3D_CONFIG_FLAG_DISABLE_COLOR_COMPRESSION (1ull << 45) +#define VKD3D_CONFIG_FLAG_DESCRIPTOR_HOISTING (1ull << 46) struct vkd3d_instance; diff --git a/include/vkd3d_shader.h b/include/vkd3d_shader.h index 9066814abf..9481012d6a 100644 --- a/include/vkd3d_shader.h +++ b/include/vkd3d_shader.h @@ -77,6 +77,17 @@ enum vkd3d_shader_meta_flags VKD3D_SHADER_META_FLAG_FORCE_COMPUTE_BARRIER_AFTER_DISPATCH = 1 << 16, }; +/* Tweaked to make the meta struct a nice and round 64 bytes. */ +#define VKD3D_MAX_HOISTED_DESCRIPTORS 15 +#define VKD3D_MAX_HOISTED_DESCRIPTOR_CONSTANT_OFFSET (1 << 7) + +struct vkd3d_shader_meta_hoisted_desc +{ + uint16_t table_index : 6; /* API limit */ + uint16_t constant_offset : 7; /* 128 offset from table should be more than enough ... */ + uint16_t vk_descriptor_type : 3; /* The standard descriptor types we care about fit into 3 bits as-is. */ +}; + struct vkd3d_shader_meta { vkd3d_shader_hash_t hash; @@ -84,8 +95,10 @@ struct vkd3d_shader_meta unsigned int patch_vertex_count; /* Relevant for HS. May be 0, in which case the patch vertex count is not known. */ unsigned int cs_required_wave_size; /* If non-zero, force a specific CS subgroup size. */ uint32_t flags; /* vkd3d_shader_meta_flags */ + struct vkd3d_shader_meta_hoisted_desc hoist_desc[VKD3D_MAX_HOISTED_DESCRIPTORS]; + uint16_t num_hoisted_descriptors; }; -STATIC_ASSERT(sizeof(struct vkd3d_shader_meta) == 32); +STATIC_ASSERT(sizeof(struct vkd3d_shader_meta) == 64); struct vkd3d_shader_code { @@ -220,6 +233,7 @@ enum vkd3d_shader_interface_flag VKD3D_SHADER_INTERFACE_DESCRIPTOR_QA_BUFFER = 0x00000010u, /* In this model, use descriptor_size_cbv_srv_uav as array stride for raw VA buffer. */ VKD3D_SHADER_INTERFACE_RAW_VA_ALIAS_DESCRIPTOR_BUFFER = 0x00000020u, + VKD3D_SHADER_INTERFACE_HOIST_DESCRIPTORS = 0x00000040u, }; struct vkd3d_shader_stage_io_entry @@ -278,6 +292,7 @@ struct vkd3d_shader_interface_info /* Used for either VKD3D_SHADER_INTERFACE_RAW_VA_ALIAS_DESCRIPTOR_BUFFER or local root signatures. 
*/ uint32_t descriptor_size_cbv_srv_uav; uint32_t descriptor_size_sampler; + uint32_t hoist_descriptor_set_index; }; struct vkd3d_shader_descriptor_table diff --git a/libs/vkd3d-shader/dxil.c b/libs/vkd3d-shader/dxil.c index dd2f44234a..73877396c2 100644 --- a/libs/vkd3d-shader/dxil.c +++ b/libs/vkd3d-shader/dxil.c @@ -123,6 +123,7 @@ struct vkd3d_dxil_remap_userdata const struct vkd3d_shader_interface_info *shader_interface_info; const struct vkd3d_shader_interface_local_info *shader_interface_local_info; unsigned int num_root_descriptors; + struct vkd3d_shader_meta *meta; }; struct vkd3d_dxil_remap_info @@ -253,6 +254,32 @@ static dxil_spv_bool dxil_remap(const struct vkd3d_dxil_remap_userdata *remap, return DXIL_SPV_TRUE; } +static void dxil_remap_check_hoist( + const struct vkd3d_shader_interface_info *shader_interface_info, + const struct vkd3d_dxil_remap_userdata *remap, const dxil_spv_d3d_binding *d3d_binding, + dxil_spv_vulkan_binding *vk_binding, + VkDescriptorType vk_descriptor_type) +{ + struct vkd3d_shader_meta_hoisted_desc *hoisted; + + if ((shader_interface_info->flags & VKD3D_SHADER_INTERFACE_HOIST_DESCRIPTORS) && + vk_binding->bindless.use_heap == DXIL_SPV_TRUE && + remap->meta->num_hoisted_descriptors < VKD3D_MAX_HOISTED_DESCRIPTORS && + d3d_binding->range_size == 1 && + vk_binding->bindless.heap_root_offset < VKD3D_MAX_HOISTED_DESCRIPTOR_CONSTANT_OFFSET) + { + hoisted = &remap->meta->hoist_desc[remap->meta->num_hoisted_descriptors]; + hoisted->table_index = vk_binding->root_constant_index + remap->num_root_descriptors * 2 - + shader_interface_info->descriptor_tables.offset / sizeof(uint32_t); + hoisted->constant_offset = vk_binding->bindless.heap_root_offset; + hoisted->vk_descriptor_type = vk_descriptor_type; + vk_binding->bindless.use_heap = DXIL_SPV_FALSE; + vk_binding->set = shader_interface_info->hoist_descriptor_set_index; + vk_binding->binding = remap->meta->num_hoisted_descriptors; + remap->meta->num_hoisted_descriptors++; + } +} + static dxil_spv_bool dxil_srv_remap(void *userdata, const dxil_spv_d3d_binding *d3d_binding, dxil_spv_srv_vulkan_binding *vk_binding) { @@ -275,6 +302,12 @@ static dxil_spv_bool dxil_srv_remap(void *userdata, const dxil_spv_d3d_binding * vk_binding->offset_binding.set = shader_interface_info->offset_buffer_binding->set; vk_binding->offset_binding.binding = shader_interface_info->offset_buffer_binding->binding; } + else + { + dxil_remap_check_hoist(shader_interface_info, remap, d3d_binding, &vk_binding->buffer_binding, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + } + return DXIL_SPV_TRUE; } else @@ -287,16 +320,31 @@ static dxil_spv_bool dxil_srv_remap(void *userdata, const dxil_spv_d3d_binding * } } - return dxil_remap(remap, VKD3D_SHADER_DESCRIPTOR_TYPE_SRV, - d3d_binding, &vk_binding->buffer_binding, resource_flags); + if (!dxil_remap(remap, VKD3D_SHADER_DESCRIPTOR_TYPE_SRV, + d3d_binding, &vk_binding->buffer_binding, resource_flags)) + return DXIL_SPV_FALSE; + + if (!(shader_interface_info->flags & VKD3D_SHADER_INTERFACE_TYPED_OFFSET_BUFFER)) + { + dxil_remap_check_hoist(shader_interface_info, remap, d3d_binding, &vk_binding->buffer_binding, + d3d_binding->kind == DXIL_SPV_RESOURCE_KIND_TYPED_BUFFER ? 
+ VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER : VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE); + } + + return DXIL_SPV_TRUE; } static dxil_spv_bool dxil_sampler_remap(void *userdata, const dxil_spv_d3d_binding *d3d_binding, dxil_spv_vulkan_binding *vk_binding) { const struct vkd3d_dxil_remap_userdata *remap = userdata; - return dxil_remap(remap, VKD3D_SHADER_DESCRIPTOR_TYPE_SAMPLER, - d3d_binding, vk_binding, VKD3D_SHADER_BINDING_FLAG_IMAGE); + + if (!dxil_remap(remap, VKD3D_SHADER_DESCRIPTOR_TYPE_SAMPLER, + d3d_binding, vk_binding, VKD3D_SHADER_BINDING_FLAG_IMAGE)) + return DXIL_SPV_FALSE; + + dxil_remap_check_hoist(remap->shader_interface_info, remap, d3d_binding, vk_binding, VK_DESCRIPTOR_TYPE_SAMPLER); + return DXIL_SPV_TRUE; } static dxil_spv_bool dxil_input_remap(void *userdata, const dxil_spv_d3d_vertex_input *d3d_input, @@ -421,6 +469,12 @@ static dxil_spv_bool dxil_uav_remap(void *userdata, const dxil_spv_uav_d3d_bindi vk_binding->offset_binding.set = shader_interface_info->offset_buffer_binding->set; vk_binding->offset_binding.binding = shader_interface_info->offset_buffer_binding->binding; } + else if (!d3d_binding->has_counter) + { + dxil_remap_check_hoist(shader_interface_info, remap, + &d3d_binding->d3d_binding, &vk_binding->buffer_binding, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + } } else if (!dxil_remap(remap, VKD3D_SHADER_DESCRIPTOR_TYPE_UAV, &d3d_binding->d3d_binding, &vk_binding->buffer_binding, resource_flags)) @@ -453,6 +507,13 @@ static dxil_spv_bool dxil_uav_remap(void *userdata, const dxil_spv_uav_d3d_bindi vk_binding->offset_binding.set = shader_interface_info->offset_buffer_binding->set; vk_binding->offset_binding.binding = shader_interface_info->offset_buffer_binding->binding; } + else if (!d3d_binding->has_counter) + { + dxil_remap_check_hoist(shader_interface_info, remap, + &d3d_binding->d3d_binding, &vk_binding->buffer_binding, + d3d_binding->d3d_binding.kind == DXIL_SPV_RESOURCE_KIND_TYPED_BUFFER ? 
+ VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER : VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); + } } if (d3d_binding->has_counter) @@ -499,9 +560,14 @@ static dxil_spv_bool dxil_cbv_remap(void *userdata, const dxil_spv_d3d_binding * } vk_binding->push_constant = DXIL_SPV_FALSE; - return dxil_remap(remap, VKD3D_SHADER_DESCRIPTOR_TYPE_CBV, + if (!dxil_remap(remap, VKD3D_SHADER_DESCRIPTOR_TYPE_CBV, d3d_binding, &vk_binding->vulkan.uniform_binding, - VKD3D_SHADER_BINDING_FLAG_BUFFER); + VKD3D_SHADER_BINDING_FLAG_BUFFER)) + return DXIL_SPV_FALSE; + + dxil_remap_check_hoist(remap->shader_interface_info, remap, d3d_binding, + &vk_binding->vulkan.uniform_binding, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); + return DXIL_SPV_TRUE; } static void vkd3d_dxil_log_callback(void *userdata, dxil_spv_log_level level, const char *msg) @@ -1011,6 +1077,7 @@ int vkd3d_shader_compile_dxil(const struct vkd3d_shader_code *dxbc, remap_userdata.shader_interface_info = shader_interface_info; remap_userdata.shader_interface_local_info = NULL; remap_userdata.num_root_descriptors = num_root_descriptors; + remap_userdata.meta = &spirv->meta; dxil_spv_converter_set_root_constant_word_count(converter, root_constant_words); dxil_spv_converter_set_root_descriptor_count(converter, num_root_descriptors); @@ -1564,6 +1631,7 @@ int vkd3d_shader_compile_dxil_export(const struct vkd3d_shader_code *dxil, remap_userdata.shader_interface_info = shader_interface_info; remap_userdata.shader_interface_local_info = shader_interface_local_info; remap_userdata.num_root_descriptors = num_root_descriptors; + remap_userdata.meta = &spirv->meta; dxil_spv_converter_set_root_constant_word_count(converter, root_constant_words); dxil_spv_converter_set_root_descriptor_count(converter, num_root_descriptors); diff --git a/libs/vkd3d/cache.c b/libs/vkd3d/cache.c index 4175ed900d..a8c1a75d3f 100644 --- a/libs/vkd3d/cache.c +++ b/libs/vkd3d/cache.c @@ -181,7 +181,7 @@ VkResult vkd3d_create_pipeline_cache(struct d3d12_device *device, return VK_CALL(vkCreatePipelineCache(device->vk_device, &info, NULL, cache)); } -#define VKD3D_CACHE_BLOB_VERSION MAKE_MAGIC('V','K','B',3) +#define VKD3D_CACHE_BLOB_VERSION MAKE_MAGIC('V','K','B',4) enum vkd3d_pipeline_blob_chunk_type { @@ -1314,8 +1314,8 @@ struct vkd3d_serialized_pipeline_toc_entry }; STATIC_ASSERT(sizeof(struct vkd3d_serialized_pipeline_toc_entry) == 16); -#define VKD3D_PIPELINE_LIBRARY_VERSION_TOC MAKE_MAGIC('V','K','L',4) -#define VKD3D_PIPELINE_LIBRARY_VERSION_STREAM MAKE_MAGIC('V','K','S',4) +#define VKD3D_PIPELINE_LIBRARY_VERSION_TOC MAKE_MAGIC('V','K','L',5) +#define VKD3D_PIPELINE_LIBRARY_VERSION_STREAM MAKE_MAGIC('V','K','S',5) struct vkd3d_serialized_pipeline_library_toc { diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c index a041bcf8f0..27c6b428e3 100644 --- a/libs/vkd3d/command.c +++ b/libs/vkd3d/command.c @@ -2698,14 +2698,6 @@ HRESULT d3d12_command_allocator_create(struct d3d12_device *device, return S_OK; } -struct vkd3d_scratch_allocation -{ - VkBuffer buffer; - VkDeviceSize offset; - VkDeviceAddress va; - void *host_ptr; -}; - static bool d3d12_command_allocator_allocate_scratch_memory(struct d3d12_command_allocator *allocator, enum vkd3d_scratch_pool_kind kind, VkDeviceSize size, VkDeviceSize alignment, uint32_t memory_types, @@ -4654,7 +4646,7 @@ static void d3d12_command_list_invalidate_root_parameters(struct d3d12_command_l if (bindings->root_signature->vk_sampler_descriptor_layout) bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_STATIC_SAMPLER_SET; if (bindings->root_signature->hoist_info.num_desc) 
- bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_HOISTED_DESCRIPTORS; + bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_HOISTED_PUSH_DESCRIPTORS; d3d12_command_list_invalidate_push_constants(bindings); @@ -4662,6 +4654,8 @@ static void d3d12_command_list_invalidate_root_parameters(struct d3d12_command_l { struct d3d12_device *device = bindings->root_signature->device; bindings->descriptor_heap_dirty_mask = (1ull << device->bindless_state.set_count) - 1; + if (list->state && list->state->hoist_template.num_hoist_sets) + bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_HOISTED_BUFFER_DESCRIPTORS; } } @@ -5171,6 +5165,25 @@ static HRESULT d3d12_command_list_build_init_commands(struct d3d12_command_list return S_OK; } +static void d3d12_command_list_finish_descriptor_copy_batch(struct d3d12_command_list *list) +{ + if (list->descriptor_copy_batch.num_copies) + { + struct vkd3d_initial_transition *transition; + vkd3d_array_reserve((void**)&list->init_transitions, &list->init_transitions_size, + list->init_transitions_count + 1, sizeof(*list->init_transitions)); + + transition = &list->init_transitions[list->init_transitions_count++]; + transition->type = VKD3D_INITIAL_TRANSITION_DESCRIPTOR_COPY_BATCH; + transition->descriptor_copy_batch.descriptor_buffer_va = list->descriptor_copy_batch.descriptor_buffer.va; + transition->descriptor_copy_batch.host_buffer_va = list->descriptor_copy_batch.host_buffer.va; + transition->descriptor_copy_batch.host_meta_va = list->descriptor_copy_batch.host_meta_buffer.va; + transition->descriptor_copy_batch.num_copies = list->descriptor_copy_batch.num_copies; + + memset(&list->descriptor_copy_batch, 0, sizeof(list->descriptor_copy_batch)); + } +} + static HRESULT STDMETHODCALLTYPE d3d12_command_list_Close(d3d12_command_list_iface *iface) { struct d3d12_command_list *list = impl_from_ID3D12GraphicsCommandList(iface); @@ -5196,6 +5209,7 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_list_Close(d3d12_command_list_ifa d3d12_command_list_end_current_render_pass(list, false); d3d12_command_list_end_transfer_batch(list); d3d12_command_list_flush_rtas_batch(list); + d3d12_command_list_finish_descriptor_copy_batch(list); if (list->predication.enabled_on_command_buffer) VK_CALL(vkCmdEndConditionalRenderingEXT(list->cmd.vk_command_buffer)); @@ -5406,8 +5420,10 @@ static void d3d12_command_list_init_default_descriptor_buffers(struct d3d12_comm { if (d3d12_device_uses_descriptor_buffers(list->device)) { - list->descriptor_heap.buffers.heap_va_resource = list->device->global_descriptor_buffer.resource.va; - list->descriptor_heap.buffers.heap_va_sampler = list->device->global_descriptor_buffer.sampler.va; + list->descriptor_heap.buffers.heap_va[0] = list->device->global_descriptor_buffer.resource.va; + list->descriptor_heap.buffers.heap_va[1] = list->device->global_descriptor_buffer.sampler.va; + list->descriptor_heap.buffers.mapped[0] = NULL; + list->descriptor_heap.buffers.mapped[1] = NULL; list->descriptor_heap.buffers.vk_buffer_resource = list->device->global_descriptor_buffer.resource.vk_buffer; list->descriptor_heap.buffers.heap_dirty = true; } @@ -5499,6 +5515,7 @@ static void d3d12_command_list_reset_internal_state(struct d3d12_command_list *l list->wbi_batch.batch_len = 0; list->query_resolve_count = 0; list->submit_allocator = NULL; + memset(&list->descriptor_copy_batch, 0, sizeof(list->descriptor_copy_batch)); d3d12_command_list_clear_rtas_batch(list); } @@ -5893,6 +5910,67 @@ static void d3d12_command_list_update_descriptor_table_offsets(struct d3d12_comm 
bindings->dirty_flags &= ~VKD3D_PIPELINE_DIRTY_DESCRIPTOR_TABLE_OFFSETS; } +static void d3d12_command_list_update_hoisted_buffer_descriptors(struct d3d12_command_list *list, + struct vkd3d_pipeline_bindings *bindings, + VkPipelineBindPoint vk_bind_point) +{ + const struct d3d12_root_signature *root_signature = bindings->root_signature; + const struct vkd3d_vk_device_procs *vk_procs = &list->device->vk_procs; + const struct d3d12_descriptor_copy_template *copy_template; + struct d3d12_command_list_descriptor_copy_desc *copy_desc; + const struct d3d12_descriptor_copy_template_entry *entry; + struct d3d12_command_list_descriptor_copy_batch *batch; + static const uint32_t vk_buffer_indices[2] = { 2, 2 }; + const struct vkd3d_shader_descriptor_table *table; + uint32_t table_offsets[D3D12_MAX_ROOT_COST]; + unsigned int root_parameter_index; + uint64_t descriptor_table_mask; + VkDeviceSize vk_offsets[2]; + uint16_t base_dst_offset; + unsigned int i; + + batch = &list->descriptor_copy_batch; + copy_template = &list->state->hoist_template; + + copy_desc = batch->host_buffer.host_ptr; + copy_desc += batch->num_copies; + + descriptor_table_mask = copy_template->hoist_root_parameter_index_mask; + + while (descriptor_table_mask) + { + root_parameter_index = vkd3d_bitmask_iter64(&descriptor_table_mask); + table = root_signature_get_descriptor_table(root_signature, root_parameter_index); + table_offsets[table->table_index] = bindings->descriptor_tables[root_parameter_index]; + } + + base_dst_offset = batch->descriptor_buffer_offset / sizeof(uint32_t); + + for (i = 0; i < copy_template->num_entries; i++) + { + entry = &copy_template->entries[i]; + copy_desc->set_index = entry->set_index; + copy_desc->count = entry->count; + copy_desc->src_offset = table_offsets[entry->table_index] + entry->constant_offset; + copy_desc->dst_offset = base_dst_offset + entry->dst_offset_words; + + copy_desc++; + batch->num_copies++; + } + + for (i = 0; i < copy_template->num_hoist_sets; i++) + vk_offsets[i] = batch->descriptor_buffer_offset + copy_template->descriptor_offsets[i]; + + /* This pipeline layout is compatible with the root signature so pushing here will not disturb anything.
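+ * The hoist layout shares the root signature's lower set layouts and push constant range, with the hoist sets appended from first_hoist_set_index onwards, so only those trailing sets change.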
*/ + VK_CALL(vkCmdSetDescriptorBufferOffsetsEXT(list->cmd.vk_command_buffer, vk_bind_point, + copy_template->vk_hoist_descriptor_layout, + copy_template->first_hoist_set_index, copy_template->num_hoist_sets, + vk_buffer_indices, vk_offsets)); + + batch->descriptor_buffer_offset += copy_template->descriptor_allocation_size; + bindings->dirty_flags &= ~VKD3D_PIPELINE_DIRTY_HOISTED_BUFFER_DESCRIPTORS; +} + static void vk_write_descriptor_set_from_root_descriptor(struct d3d12_command_list *list, VkWriteDescriptorSet *vk_descriptor_write, const struct vkd3d_shader_root_parameter *root_parameter, const struct vkd3d_root_descriptor_info *descriptor) @@ -5935,7 +6013,10 @@ static void d3d12_command_list_update_descriptor_buffers(struct d3d12_command_li { const struct vkd3d_vk_device_procs *vk_procs = &list->device->vk_procs; VkDescriptorBufferBindingPushDescriptorBufferHandleEXT buffer_handle; - VkDescriptorBufferBindingInfoEXT global_buffers[2]; + struct d3d12_command_list_descriptor_copy_heap *heaps; + VkDescriptorBufferBindingInfoEXT global_buffers[3]; + uint32_t num_global_buffers; + unsigned int i; if (d3d12_device_uses_descriptor_buffers(list->device) && list->descriptor_heap.buffers.heap_dirty) @@ -5943,7 +6024,7 @@ static void d3d12_command_list_update_descriptor_buffers(struct d3d12_command_li global_buffers[0].sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT; global_buffers[0].pNext = NULL; global_buffers[0].usage = list->device->global_descriptor_buffer.resource.usage; - global_buffers[0].address = list->descriptor_heap.buffers.heap_va_resource; + global_buffers[0].address = list->descriptor_heap.buffers.heap_va[0]; if (global_buffers[0].usage & VK_BUFFER_USAGE_PUSH_DESCRIPTORS_DESCRIPTOR_BUFFER_BIT_EXT) { @@ -5956,10 +6037,51 @@ static void d3d12_command_list_update_descriptor_buffers(struct d3d12_command_li global_buffers[1].sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT; global_buffers[1].pNext = NULL; global_buffers[1].usage = list->device->global_descriptor_buffer.sampler.usage; - global_buffers[1].address = list->descriptor_heap.buffers.heap_va_sampler; + global_buffers[1].address = list->descriptor_heap.buffers.heap_va[1]; + + num_global_buffers = 2; + + if (list->device->bindless_state.flags & VKD3D_BINDLESS_HOIST_DESCRIPTOR_BUFFER) + { + d3d12_command_list_finish_descriptor_copy_batch(list); + + d3d12_command_allocator_allocate_scratch_memory(list->allocator, + VKD3D_SCRATCH_POOL_KIND_DESCRIPTOR_BUFFER, + VKD3D_DESCRIPTOR_COPY_BATCH_DESCRIPTOR_BUFFER_SIZE, + list->device->device_info.descriptor_buffer_properties.descriptorBufferOffsetAlignment, + ~0u, &list->descriptor_copy_batch.descriptor_buffer); + + d3d12_command_allocator_allocate_scratch_memory(list->allocator, + VKD3D_SCRATCH_POOL_KIND_UNIFORM_UPLOAD, + VKD3D_DESCRIPTOR_COPY_BATCH_NUM_COPIES * sizeof(struct d3d12_command_list_descriptor_copy_desc), + 64, ~0u, &list->descriptor_copy_batch.host_buffer); + + d3d12_command_allocator_allocate_scratch_memory(list->allocator, + VKD3D_SCRATCH_POOL_KIND_UNIFORM_UPLOAD, + sizeof(struct d3d12_command_list_descriptor_copy_heap) * VKD3D_MAX_BINDLESS_DESCRIPTOR_SETS, + 64, ~0u, &list->descriptor_copy_batch.host_meta_buffer); + + heaps = list->descriptor_copy_batch.host_meta_buffer.host_ptr; + + for (i = 0; i < list->device->bindless_state.set_count; i++) + { + unsigned int buffer_index = list->device->bindless_state.vk_descriptor_buffer_indices[i]; + heaps[i].base_va = global_buffers[buffer_index].address + list->descriptor_heap.buffers.vk_payload_offsets[i]; 
+ heaps[i].num_descriptors = list->descriptor_heap.buffers.vk_descriptor_count_for_buffer_index[buffer_index]; + heaps[i].stride_words = list->descriptor_heap.buffers.vk_descriptor_stride_words[i]; + } + + global_buffers[num_global_buffers].sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT; + global_buffers[num_global_buffers].pNext = NULL; + global_buffers[num_global_buffers].usage = + VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT; + global_buffers[num_global_buffers].address = list->descriptor_copy_batch.descriptor_buffer.va; + num_global_buffers++; + } VK_CALL(vkCmdBindDescriptorBuffersEXT(list->cmd.vk_command_buffer, - ARRAY_SIZE(global_buffers), global_buffers)); + num_global_buffers, global_buffers)); list->descriptor_heap.buffers.heap_dirty = false; } @@ -5985,7 +6107,7 @@ static void d3d12_command_list_update_descriptor_heaps(struct d3d12_command_list VK_CALL(vkCmdSetDescriptorBufferOffsetsEXT(list->cmd.vk_command_buffer, vk_bind_point, layout, 0, bindless_state->set_count, bindless_state->vk_descriptor_buffer_indices, - list->descriptor_heap.buffers.vk_offsets)); + list->descriptor_heap.buffers.vk_bind_offsets)); bindings->descriptor_heap_dirty_mask = 0; } } @@ -6214,7 +6336,7 @@ static void d3d12_command_list_update_root_descriptors(struct d3d12_command_list } } -static void d3d12_command_list_update_hoisted_descriptors(struct d3d12_command_list *list, +static void d3d12_command_list_update_hoisted_push_descriptors(struct d3d12_command_list *list, struct vkd3d_pipeline_bindings *bindings) { const struct d3d12_root_signature *rs = bindings->root_signature; @@ -6267,7 +6389,27 @@ static void d3d12_command_list_update_hoisted_descriptors(struct d3d12_command_l } } - bindings->dirty_flags &= ~VKD3D_PIPELINE_DIRTY_HOISTED_DESCRIPTORS; + bindings->dirty_flags &= ~VKD3D_PIPELINE_DIRTY_HOISTED_PUSH_DESCRIPTORS; +} + +static void d3d12_command_list_reserve_hoisted_buffer_descriptor(struct d3d12_command_list *list, + struct vkd3d_pipeline_bindings *bindings) +{ + assert(list->state); + assert(list->state->hoist_template.num_hoist_sets); + + /* Check if we have exhausted the current batch. */ + if ((list->state->hoist_template.descriptor_allocation_size + + list->descriptor_copy_batch.descriptor_buffer_offset > + VKD3D_DESCRIPTOR_COPY_BATCH_DESCRIPTOR_BUFFER_SIZE) || + (list->state->hoist_template.num_entries + list->descriptor_copy_batch.num_copies > + VKD3D_DESCRIPTOR_COPY_BATCH_NUM_COPIES)) + { + list->descriptor_heap.buffers.heap_dirty = true; + bindings->descriptor_heap_dirty_mask |= ~0u; + /* d3d12_command_list_update_descriptor_heaps will be called. + * Since heap_dirty is set it will rebind descriptor buffers, and ensure misc descriptor state is marked dirty.
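+ * Rebinding also finishes the current copy batch and allocates fresh scratch buffers, so the exhausted batch offsets and copy counts start over from zero.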
*/ + } } static void d3d12_command_list_update_descriptors(struct d3d12_command_list *list) @@ -6288,6 +6430,9 @@ static void d3d12_command_list_update_descriptors(struct d3d12_command_list *lis vk_bind_point = vk_bind_point_from_pipeline_type(list->active_pipeline_type); + if (bindings->dirty_flags & VKD3D_PIPELINE_DIRTY_HOISTED_BUFFER_DESCRIPTORS) + d3d12_command_list_reserve_hoisted_buffer_descriptor(list, bindings); + if (bindings->descriptor_heap_dirty_mask) d3d12_command_list_update_descriptor_heaps(list, bindings, vk_bind_point, layout); @@ -6295,8 +6440,8 @@ static void d3d12_command_list_update_descriptors(struct d3d12_command_list *lis d3d12_command_list_update_static_samplers(list, bindings, vk_bind_point, layout); /* If we can, hoist descriptors from the descriptor heap into fake root parameters. */ - if (bindings->dirty_flags & VKD3D_PIPELINE_DIRTY_HOISTED_DESCRIPTORS) - d3d12_command_list_update_hoisted_descriptors(list, bindings); + if (bindings->dirty_flags & VKD3D_PIPELINE_DIRTY_HOISTED_PUSH_DESCRIPTORS) + d3d12_command_list_update_hoisted_push_descriptors(list, bindings); if (bind_point_layout->flags & VKD3D_ROOT_SIGNATURE_USE_PUSH_CONSTANT_UNIFORM_BLOCK) { @@ -6322,6 +6467,9 @@ static void d3d12_command_list_update_descriptors(struct d3d12_command_list *lis if (bindings->dirty_flags & VKD3D_PIPELINE_DIRTY_DESCRIPTOR_TABLE_OFFSETS) d3d12_command_list_update_descriptor_table_offsets(list, bindings, layout, push_stages); } + + if (bindings->dirty_flags & VKD3D_PIPELINE_DIRTY_HOISTED_BUFFER_DESCRIPTORS) + d3d12_command_list_update_hoisted_buffer_descriptors(list, bindings, vk_bind_point); } static void d3d12_command_list_update_descriptors_post_indirect_buffer(struct d3d12_command_list *list) @@ -8796,25 +8944,27 @@ static void STDMETHODCALLTYPE d3d12_command_list_SetPipelineState(d3d12_command_ list->state = state; list->rt_state = NULL; - if (!state || list->active_pipeline_type != state->pipeline_type) + if (!state) { - if (state) - { - bindings = d3d12_command_list_get_bindings(list, state->pipeline_type); - if (bindings->root_signature) - { - /* We might have clobbered push constants in the new bind point, - * invalidate all state which can affect push constants. - * We might also change the pipeline layout, in case we switch between mesh and legacy graphics. - * In this scenario, the push constant layout will be incompatible due to stage - * differences, so everything must be rebound. */ - d3d12_command_list_invalidate_root_parameters(list, bindings, true, NULL); - } + list->active_pipeline_type = VKD3D_PIPELINE_TYPE_NONE; + return; + } - list->active_pipeline_type = state->pipeline_type; + bindings = d3d12_command_list_get_bindings(list, state->pipeline_type); + + if (list->active_pipeline_type != state->pipeline_type) + { + if (bindings->root_signature) + { + /* We might have clobbered push constants in the new bind point, + * invalidate all state which can affect push constants. + * We might also change the pipeline layout, in case we switch between mesh and legacy graphics. + * In this scenario, the push constant layout will be incompatible due to stage + * differences, so everything must be rebound. 
*/ + d3d12_command_list_invalidate_root_parameters(list, bindings, true, NULL); } - else - list->active_pipeline_type = VKD3D_PIPELINE_TYPE_NONE; + + list->active_pipeline_type = state->pipeline_type; } if (state->pipeline_type != VKD3D_PIPELINE_TYPE_COMPUTE) @@ -8827,6 +8977,14 @@ static void STDMETHODCALLTYPE d3d12_command_list_SetPipelineState(d3d12_command_ list->dynamic_state.dirty_flags |= VKD3D_DYNAMIC_STATE_STENCIL_WRITE_MASK; } } + + /* TODO: It's possible we can use some kind of compatibility hash of the hoist descriptor + * to avoid hoisting, but changing pipeline without modifying the root table parameters + * is very awkward. */ + if (state->hoist_template.num_hoist_sets) + bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_HOISTED_BUFFER_DESCRIPTORS; + else + bindings->dirty_flags &= ~VKD3D_PIPELINE_DIRTY_HOISTED_BUFFER_DESCRIPTORS; } VkImageLayout vk_image_layout_from_d3d12_resource_state( @@ -9514,7 +9672,7 @@ static void STDMETHODCALLTYPE d3d12_command_list_ExecuteBundle(d3d12_command_lis static void vkd3d_pipeline_bindings_set_dirty_sets(struct vkd3d_pipeline_bindings *bindings, uint64_t dirty_mask) { bindings->descriptor_heap_dirty_mask = dirty_mask; - bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_HOISTED_DESCRIPTORS; + bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_HOISTED_PUSH_DESCRIPTORS; } static void d3d12_command_list_set_descriptor_heaps_buffers(struct d3d12_command_list *list, @@ -9525,8 +9683,8 @@ static void d3d12_command_list_set_descriptor_heaps_buffers(struct d3d12_command struct d3d12_desc_split d; unsigned int i, j; - current_resource_va = list->descriptor_heap.buffers.heap_va_resource; - current_sampler_va = list->descriptor_heap.buffers.heap_va_sampler; + current_resource_va = list->descriptor_heap.buffers.heap_va[0]; + current_sampler_va = list->descriptor_heap.buffers.heap_va[1]; for (i = 0; i < heap_count; i++) { @@ -9538,8 +9696,10 @@ static void d3d12_command_list_set_descriptor_heaps_buffers(struct d3d12_command if (heap->desc.Type == D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV) { - list->descriptor_heap.buffers.heap_va_resource = heap->descriptor_buffer.va; + list->descriptor_heap.buffers.heap_va[0] = heap->descriptor_buffer.va; + list->descriptor_heap.buffers.mapped[0] = heap->descriptor_buffer.host_allocation; list->descriptor_heap.buffers.vk_buffer_resource = heap->descriptor_buffer.vk_buffer; + list->descriptor_heap.buffers.vk_descriptor_count_for_buffer_index[0] = heap->desc.NumDescriptors; if (!d3d12_device_use_embedded_mutable_descriptors(list->device)) { @@ -9550,16 +9710,27 @@ static void d3d12_command_list_set_descriptor_heaps_buffers(struct d3d12_command } else if (heap->desc.Type == D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER) { - list->descriptor_heap.buffers.heap_va_sampler = heap->descriptor_buffer.va; + list->descriptor_heap.buffers.heap_va[1] = heap->descriptor_buffer.va; + list->descriptor_heap.buffers.mapped[1] = heap->descriptor_buffer.host_allocation; + list->descriptor_heap.buffers.vk_descriptor_count_for_buffer_index[1] = heap->desc.NumDescriptors; } for (j = 0; j < bindless_state->set_count; j++) + { if (bindless_state->set_info[j].heap_type == heap->desc.Type) - list->descriptor_heap.buffers.vk_offsets[j] = heap->descriptor_buffer.offsets[set_index++]; + { + list->descriptor_heap.buffers.vk_bind_offsets[j] = heap->descriptor_buffer.offsets[set_index]; + list->descriptor_heap.buffers.vk_payload_offsets[j] = + heap->descriptor_buffer.offsets[set_index] + bindless_state->set_info[j].host_mapping_offset; +
list->descriptor_heap.buffers.vk_descriptor_stride_words[j] = + bindless_state->set_info[j].host_mapping_descriptor_size / sizeof(uint32_t); + set_index++; + } + } } - if (current_resource_va == list->descriptor_heap.buffers.heap_va_resource && - current_sampler_va == list->descriptor_heap.buffers.heap_va_sampler) + if (current_resource_va == list->descriptor_heap.buffers.heap_va[0] && + current_sampler_va == list->descriptor_heap.buffers.heap_va[1]) return; list->descriptor_heap.buffers.heap_dirty = true; @@ -9675,9 +9846,14 @@ static inline void d3d12_command_list_set_descriptor_table_embedded(struct d3d12 if (root_signature) { if (root_signature->descriptor_table_count) + { bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_DESCRIPTOR_TABLE_OFFSETS; + if (list->state && (list->state->hoist_template.hoist_root_parameter_index_mask & (1ull << index))) + bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_HOISTED_BUFFER_DESCRIPTORS; + } + if (root_signature->hoist_info.num_desc) - bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_HOISTED_DESCRIPTORS; + bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_HOISTED_PUSH_DESCRIPTORS; } VKD3D_BREADCRUMB_AUX32(index); @@ -9696,9 +9872,14 @@ static inline void d3d12_command_list_set_descriptor_table(struct d3d12_command_ if (root_signature) { if (root_signature->descriptor_table_count) + { bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_DESCRIPTOR_TABLE_OFFSETS; + if (list->state && (list->state->hoist_template.hoist_root_parameter_index_mask & (1ull << index))) + bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_HOISTED_BUFFER_DESCRIPTORS; + } + if (root_signature->hoist_info.num_desc) - bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_HOISTED_DESCRIPTORS; + bindings->dirty_flags |= VKD3D_PIPELINE_DIRTY_HOISTED_PUSH_DESCRIPTORS; } VKD3D_BREADCRUMB_AUX32(index); @@ -14446,6 +14627,8 @@ static void STDMETHODCALLTYPE d3d12_command_list_SetPipelineState1(d3d12_command } } #endif + + list->compute_bindings.dirty_flags &= ~VKD3D_PIPELINE_DIRTY_HOISTED_BUFFER_DESCRIPTORS; } static VkStridedDeviceAddressRegionKHR convert_strided_range( @@ -15866,17 +16049,7 @@ static void STDMETHODCALLTYPE d3d12_command_queue_ExecuteCommandLists(ID3D12Comm buffer->commandBuffer = command_queue->vkd3d_queue->barrier_command_buffer; } - if (command_list_count == 1 && num_transitions != 0) - { - /* Pilfer directly. 
*/ cmd_list = unsafe_impl_from_ID3D12CommandList(command_lists[0]); sub.execute.transitions = cmd_list->init_transitions; sub.execute.transition_count = cmd_list->init_transitions_count; cmd_list->init_transitions = NULL; cmd_list->init_transitions_count = 0; cmd_list->init_transitions_size = 0; - } - else if (num_transitions != 0) + if (num_transitions != 0) { sub.execute.transitions = vkd3d_malloc(num_transitions * sizeof(*sub.execute.transitions)); sub.execute.transition_count = num_transitions; @@ -16297,9 +16470,13 @@ static void d3d12_command_queue_signal_shared(struct d3d12_command_queue *comman struct d3d12_command_queue_transition_pool { VkCommandBuffer cmd[VKD3D_COMMAND_QUEUE_NUM_TRANSITION_BUFFERS]; + VkCommandBuffer async_cmd[VKD3D_COMMAND_QUEUE_NUM_TRANSITION_BUFFERS]; VkCommandPool pool; + VkCommandPool async_pool; VkSemaphore timeline; uint64_t timeline_value; + VkSemaphore async_timeline; + uint64_t async_timeline_value; VkImageMemoryBarrier2 *barriers; size_t barriers_size; @@ -16329,6 +16506,15 @@ static HRESULT d3d12_command_queue_transition_pool_init(struct d3d12_command_que if ((vr = VK_CALL(vkCreateCommandPool(queue->device->vk_device, &pool_info, NULL, &pool->pool)))) return hresult_from_vk_result(vr); + if (queue->device->memory_transfers.vkd3d_queue->vk_queue != queue->vkd3d_queue->vk_queue) + { + /* When we don't need to perform initializations in a serialized fashion, we can do the work on the + * memory transfer queue out of band instead. */ + pool_info.queueFamilyIndex = queue->device->memory_transfers.vkd3d_queue->vk_family_index; + if ((vr = VK_CALL(vkCreateCommandPool(queue->device->vk_device, &pool_info, NULL, &pool->async_pool)))) + return hresult_from_vk_result(vr); + } + alloc_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; alloc_info.pNext = NULL; alloc_info.commandPool = pool->pool; @@ -16340,11 +16526,20 @@ static HRESULT d3d12_command_queue_transition_pool_init(struct d3d12_command_que if (FAILED(hr = vkd3d_create_timeline_semaphore(queue->device, 0, false, &pool->timeline))) return hr; + if (pool->async_pool) + { + alloc_info.commandPool = pool->async_pool; + if ((vr = VK_CALL(vkAllocateCommandBuffers(queue->device->vk_device, &alloc_info, pool->async_cmd)))) + return hresult_from_vk_result(vr); + if (FAILED(hr = vkd3d_create_timeline_semaphore(queue->device, 0, false, &pool->async_timeline))) + return hr; + } + return S_OK; } static void d3d12_command_queue_transition_pool_wait(struct d3d12_command_queue_transition_pool *pool, - struct d3d12_device *device, uint64_t value) + struct d3d12_device *device, VkSemaphore timeline, uint64_t value) { const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; VkSemaphoreWaitInfo wait_info; @@ -16353,7 +16548,7 @@ static void d3d12_command_queue_transition_pool_wait(struct d3d12_command_queue_ wait_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO; wait_info.pNext = NULL; wait_info.flags = 0; - wait_info.pSemaphores = &pool->timeline; + wait_info.pSemaphores = &timeline; wait_info.semaphoreCount = 1; wait_info.pValues = &value; vr = VK_CALL(vkWaitSemaphores(device->vk_device, &wait_info, ~(uint64_t)0)); @@ -16364,9 +16559,12 @@ static void d3d12_command_queue_transition_pool_deinit(struct d3d12_command_queu struct d3d12_device *device) { const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; - d3d12_command_queue_transition_pool_wait(pool, device, pool->timeline_value); + d3d12_command_queue_transition_pool_wait(pool, device, pool->timeline,
pool->timeline_value); + d3d12_command_queue_transition_pool_wait(pool, device, pool->async_timeline, pool->async_timeline_value); VK_CALL(vkDestroyCommandPool(device->vk_device, pool->pool, NULL)); + VK_CALL(vkDestroyCommandPool(device->vk_device, pool->async_pool, NULL)); VK_CALL(vkDestroySemaphore(device->vk_device, pool->timeline, NULL)); + VK_CALL(vkDestroySemaphore(device->vk_device, pool->async_timeline, NULL)); vkd3d_free(pool->barriers); vkd3d_free((void*)pool->query_heaps); } @@ -16431,24 +16629,87 @@ static void d3d12_command_queue_init_query_heap(struct d3d12_device *device, VkC } } +static void d3d12_command_queue_copy_descriptor_batch(struct d3d12_device *device, + VkCommandBuffer vk_cmd_buffer, const struct vkd3d_descriptor_copy_meta_args *args) +{ + const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; + struct vkd3d_descriptor_copy_info info; + + vkd3d_meta_get_descriptor_copy_pipeline(&device->meta_ops, &info); + VK_CALL(vkCmdBindPipeline(vk_cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, info.vk_pipeline)); + VK_CALL(vkCmdPushConstants(vk_cmd_buffer, info.vk_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(*args), args)); + VK_CALL(vkCmdDispatch(vk_cmd_buffer, vkd3d_compute_workgroup_count(args->num_copies, + vkd3d_meta_get_descriptor_copy_workgroup_size()), 1, 1)); +} + +static void d3d12_command_queue_transition_pool_submit_async(struct d3d12_command_queue_transition_pool *pool, + struct d3d12_device *device, VkCommandBuffer vk_cmd_buffer) +{ + const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; + VkSemaphoreSubmitInfo async_semaphore; + VkCommandBufferSubmitInfo cmd; + VkSubmitInfo2 submit; + VkQueue vk_queue; + VkResult vr; + + vk_queue = vkd3d_queue_acquire(device->memory_transfers.vkd3d_queue); + if (!vk_queue) + { + ERR("Failed to acquire memory transfer queue.\n"); + return; + } + + memset(&cmd, 0, sizeof(cmd)); + cmd.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO; + cmd.commandBuffer = vk_cmd_buffer; + + memset(&async_semaphore, 0, sizeof(async_semaphore)); + async_semaphore.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO; + async_semaphore.semaphore = pool->async_timeline; + async_semaphore.value = pool->async_timeline_value; + + memset(&submit, 0, sizeof(submit)); + submit.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2; + submit.pSignalSemaphoreInfos = &async_semaphore; + submit.signalSemaphoreInfoCount = 1; + submit.commandBufferInfoCount = 1; + submit.pCommandBufferInfos = &cmd; + vr = VK_CALL(vkQueueSubmit2(vk_queue, 1, &submit, NULL)); + vkd3d_queue_release(device->memory_transfers.vkd3d_queue); + if (vr < 0) + ERR("Failed to submit async transition, vr %d.\n", vr); + VKD3D_DEVICE_REPORT_BREADCRUMB_IF(device, vr == VK_ERROR_DEVICE_LOST); +} + static void d3d12_command_queue_transition_pool_build(struct d3d12_command_queue_transition_pool *pool, struct d3d12_device *device, const struct vkd3d_initial_transition *transitions, size_t count, - VkCommandBuffer *vk_cmd_buffer, uint64_t *timeline_value) + VkCommandBuffer *out_vk_cmd_buffer, VkSemaphoreSubmitInfo *out_semaphore) { const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; const struct vkd3d_initial_transition *transition; VkCommandBufferBeginInfo begin_info; + uint64_t *target_timeline_value; + VkCommandBuffer vk_cmd_buffer; + VkSemaphore *target_timeline; + VkMemoryBarrier2 vk_barrier; + bool need_synchronous_init; unsigned int command_index; + bool need_descriptor_copy; VkDependencyInfo dep_info; uint32_t need_transition; size_t i; pool->barriers_count = 0; 
pool->query_heaps_count = 0; + need_descriptor_copy = false; + need_synchronous_init = false; if (!count) { - *vk_cmd_buffer = VK_NULL_HANDLE; + *out_vk_cmd_buffer = VK_NULL_HANDLE; + out_semaphore->semaphore = VK_NULL_HANDLE; + out_semaphore->value = 0; return; } @@ -16466,7 +16727,14 @@ static void d3d12_command_queue_transition_pool_build(struct d3d12_command_queue 0, vkd3d_memory_order_relaxed); if (need_transition && transition->resource.perform_initial_transition) + { + /* If we're initializing committed resources, there's no need to do the transition in a synchronized way. + * Can punt that to async queue. For placed resources, we may have to do it synchronized due to aliasing rules + * and ExecuteCommandLists having implicit sync between them. */ + if (d3d12_resource_may_alias_other_resources(transition->resource.resource)) + need_synchronous_init = true; d3d12_command_queue_transition_pool_add_barrier(pool, transition->resource.resource); + } break; case VKD3D_INITIAL_TRANSITION_TYPE_QUERY_HEAP: @@ -16474,43 +16742,106 @@ static void d3d12_command_queue_transition_pool_build(struct d3d12_command_queue d3d12_command_queue_transition_pool_add_query_heap(pool, transition->query_heap); break; + case VKD3D_INITIAL_TRANSITION_DESCRIPTOR_COPY_BATCH: + /* Deal with it later once we have figured out which command buffer to work on. */ + need_descriptor_copy = true; + break; + default: ERR("Unhandled transition type %u.\n", transition->type); } } - if (!pool->barriers_count && !pool->query_heaps_count) + if (!pool->barriers_count && !pool->query_heaps_count && !need_descriptor_copy) { - *vk_cmd_buffer = VK_NULL_HANDLE; + *out_vk_cmd_buffer = VK_NULL_HANDLE; + out_semaphore->semaphore = VK_NULL_HANDLE; + out_semaphore->value = 0; return; } - pool->timeline_value++; - command_index = pool->timeline_value % VKD3D_COMMAND_QUEUE_NUM_TRANSITION_BUFFERS; + if (!pool->async_pool) + need_synchronous_init = true; + + if (need_synchronous_init) + { + target_timeline = &pool->timeline; + target_timeline_value = &pool->timeline_value; + } + else + { + target_timeline = &pool->async_timeline; + target_timeline_value = &pool->async_timeline_value; + } + + (*target_timeline_value)++; + command_index = *target_timeline_value % VKD3D_COMMAND_QUEUE_NUM_TRANSITION_BUFFERS; - if (pool->timeline_value > VKD3D_COMMAND_QUEUE_NUM_TRANSITION_BUFFERS) - d3d12_command_queue_transition_pool_wait(pool, device, pool->timeline_value - VKD3D_COMMAND_QUEUE_NUM_TRANSITION_BUFFERS); + if (*target_timeline_value > VKD3D_COMMAND_QUEUE_NUM_TRANSITION_BUFFERS) + { + d3d12_command_queue_transition_pool_wait(pool, device, + *target_timeline, *target_timeline_value - VKD3D_COMMAND_QUEUE_NUM_TRANSITION_BUFFERS); + } + + if (need_synchronous_init) + vk_cmd_buffer = pool->cmd[command_index]; + else + vk_cmd_buffer = pool->async_cmd[command_index]; begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; begin_info.pNext = NULL; begin_info.pInheritanceInfo = NULL; begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - VK_CALL(vkResetCommandBuffer(pool->cmd[command_index], 0)); - VK_CALL(vkBeginCommandBuffer(pool->cmd[command_index], &begin_info)); + VK_CALL(vkResetCommandBuffer(vk_cmd_buffer, 0)); + VK_CALL(vkBeginCommandBuffer(vk_cmd_buffer, &begin_info)); memset(&dep_info, 0, sizeof(dep_info)); dep_info.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO; dep_info.imageMemoryBarrierCount = pool->barriers_count; dep_info.pImageMemoryBarriers = pool->barriers; - VK_CALL(vkCmdPipelineBarrier2(pool->cmd[command_index], 
&dep_info)); + if (pool->barriers_count) + VK_CALL(vkCmdPipelineBarrier2(vk_cmd_buffer, &dep_info)); for (i = 0; i < pool->query_heaps_count; i++) - d3d12_command_queue_init_query_heap(device, pool->cmd[command_index], pool->query_heaps[i]); - VK_CALL(vkEndCommandBuffer(pool->cmd[command_index])); + d3d12_command_queue_init_query_heap(device, vk_cmd_buffer, pool->query_heaps[i]); - *vk_cmd_buffer = pool->cmd[command_index]; - *timeline_value = pool->timeline_value; + for (i = 0; i < count; i++) + if (transitions[i].type == VKD3D_INITIAL_TRANSITION_DESCRIPTOR_COPY_BATCH) + d3d12_command_queue_copy_descriptor_batch(device, vk_cmd_buffer, &transitions[i].descriptor_copy_batch); + + if (need_descriptor_copy && need_synchronous_init) + { + memset(&vk_barrier, 0, sizeof(vk_barrier)); + vk_barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2; + vk_barrier.srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT; + vk_barrier.srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT; + vk_barrier.dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; + vk_barrier.dstAccessMask = VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT; + + memset(&dep_info, 0, sizeof(dep_info)); + dep_info.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO; + dep_info.memoryBarrierCount = 1; + dep_info.pMemoryBarriers = &vk_barrier; + + VK_CALL(vkCmdPipelineBarrier2(vk_cmd_buffer, &dep_info)); + } + + VK_CALL(vkEndCommandBuffer(vk_cmd_buffer)); + + if (need_synchronous_init) + { + *out_vk_cmd_buffer = vk_cmd_buffer; + out_semaphore->semaphore = pool->timeline; + out_semaphore->value = pool->timeline_value; + } + else + { + d3d12_command_queue_transition_pool_submit_async(pool, device, vk_cmd_buffer); + *out_vk_cmd_buffer = VK_NULL_HANDLE; + out_semaphore->semaphore = pool->async_timeline; + out_semaphore->value = pool->async_timeline_value; + } } static VkResult d3d12_command_queue_submit_split_locked(struct d3d12_device *device, @@ -16612,6 +16943,13 @@ static void d3d12_command_queue_execute(struct d3d12_command_queue *command_queu else { num_submits = 1; + + /* If we did a transition, but in the async queue, wait for it here. 
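+ * The async path has already submitted the copies to the transfer queue and signals async_timeline, so adding this semaphore wait to the first submit is enough to order them without chaining an extra command buffer.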
*/ + if (transition_semaphore->value) + { + submit_desc[0].waitSemaphoreInfoCount = 1; + submit_desc[0].pWaitSemaphoreInfos = transition_semaphore; + } } if (!(vk_queue = vkd3d_queue_acquire(vkd3d_queue))) @@ -17142,14 +17480,13 @@ static void *d3d12_command_queue_submission_worker_main(void *userdata) memset(&transition_semaphore, 0, sizeof(transition_semaphore)); transition_semaphore.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO; - transition_semaphore.semaphore = pool.timeline; transition_semaphore.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT; d3d12_command_queue_transition_pool_build(&pool, queue->device, submission.execute.transitions, submission.execute.transition_count, &transition_cmd.commandBuffer, - &transition_semaphore.value); + &transition_semaphore); d3d12_command_queue_execute(queue, submission.execute.cmd, submission.execute.cmd_count, diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c index c7a1054f42..f4935cb126 100644 --- a/libs/vkd3d/device.c +++ b/libs/vkd3d/device.c @@ -867,6 +867,7 @@ static const struct vkd3d_debug_option vkd3d_config_options[] = {"disable_uav_compression", VKD3D_CONFIG_FLAG_DISABLE_UAV_COMPRESSION}, {"disable_depth_compression", VKD3D_CONFIG_FLAG_DISABLE_DEPTH_COMPRESSION}, {"disable_color_compression", VKD3D_CONFIG_FLAG_DISABLE_COLOR_COMPRESSION}, + {"descriptor_hoisting", VKD3D_CONFIG_FLAG_DESCRIPTOR_HOISTING}, }; static void vkd3d_config_flags_init_once(void) @@ -2898,6 +2899,7 @@ static HRESULT d3d12_device_create_scratch_buffer(struct d3d12_device *device, e alloc_info.heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS | D3D12_HEAP_FLAG_CREATE_NOT_ZEROED; alloc_info.extra_allocation_flags = VKD3D_ALLOCATION_FLAG_INTERNAL_SCRATCH; alloc_info.vk_memory_priority = vkd3d_convert_to_vk_prio(D3D12_RESIDENCY_PRIORITY_NORMAL); + alloc_info.explicit_global_buffer_usage = 0; if (FAILED(hr = vkd3d_allocate_heap_memory(device, &device->memory_allocator, &alloc_info, &scratch->allocation))) @@ -2935,6 +2937,31 @@ static HRESULT d3d12_device_create_scratch_buffer(struct d3d12_device *device, e alloc_info.heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS | D3D12_HEAP_FLAG_CREATE_NOT_ZEROED; alloc_info.extra_allocation_flags = VKD3D_ALLOCATION_FLAG_INTERNAL_SCRATCH; alloc_info.vk_memory_priority = vkd3d_convert_to_vk_prio(D3D12_RESIDENCY_PRIORITY_NORMAL); + alloc_info.explicit_global_buffer_usage = 0; + + if (FAILED(hr = vkd3d_allocate_heap_memory(device, &device->memory_allocator, + &alloc_info, &scratch->allocation))) + return hr; + } + else if (kind == VKD3D_SCRATCH_POOL_KIND_DESCRIPTOR_BUFFER) + { + struct vkd3d_allocate_heap_memory_info alloc_info; + + /* We only care about memory types for INDIRECT_PREPROCESS. 
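+ * Descriptor buffer scratch always lives in default heap memory, so callers are expected to pass ~0u here.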
*/ + assert(memory_types == ~0u); + + memset(&alloc_info, 0, sizeof(alloc_info)); + alloc_info.heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT; + alloc_info.heap_desc.SizeInBytes = size; + alloc_info.heap_desc.Alignment = D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT; + alloc_info.heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS | D3D12_HEAP_FLAG_CREATE_NOT_ZEROED; + alloc_info.extra_allocation_flags = + VKD3D_ALLOCATION_FLAG_INTERNAL_SCRATCH; + alloc_info.vk_memory_priority = vkd3d_convert_to_vk_prio(D3D12_RESIDENCY_PRIORITY_NORMAL); + alloc_info.explicit_global_buffer_usage = VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; if (FAILED(hr = vkd3d_allocate_heap_memory(device, &device->memory_allocator, &alloc_info, &scratch->allocation))) @@ -8301,11 +8328,11 @@ static HRESULT d3d12_device_init(struct d3d12_device *device, if (FAILED(hr = vkd3d_sampler_state_init(&device->sampler_state, device))) goto out_cleanup_view_map; - if (FAILED(hr = vkd3d_meta_ops_init(&device->meta_ops, device))) + if (FAILED(hr = vkd3d_shader_debug_ring_init(&device->debug_ring, device))) goto out_cleanup_sampler_state; - if (FAILED(hr = vkd3d_shader_debug_ring_init(&device->debug_ring, device))) - goto out_cleanup_meta_ops; + if (FAILED(hr = vkd3d_meta_ops_init(&device->meta_ops, device))) + goto out_cleanup_debug_ring; vkd3d_scratch_pool_init(device); @@ -8313,7 +8340,7 @@ static HRESULT d3d12_device_init(struct d3d12_device *device, vkd3d_breadcrumb_tracer_init_barrier_hashes(&device->breadcrumb_tracer); if (vkd3d_config_flags & VKD3D_CONFIG_FLAG_BREADCRUMBS) if (FAILED(hr = vkd3d_breadcrumb_tracer_init(&device->breadcrumb_tracer, device))) - goto out_cleanup_debug_ring; + goto out_cleanup_meta_ops; #endif if (vkd3d_descriptor_debug_active_qa_checks()) @@ -8360,12 +8387,12 @@ static HRESULT d3d12_device_init(struct d3d12_device *device, #ifdef VKD3D_ENABLE_BREADCRUMBS if (vkd3d_config_flags & VKD3D_CONFIG_FLAG_BREADCRUMBS) vkd3d_breadcrumb_tracer_cleanup(&device->breadcrumb_tracer, device); -out_cleanup_debug_ring: +out_cleanup_meta_ops: vkd3d_breadcrumb_tracer_cleanup_barrier_hashes(&device->breadcrumb_tracer); #endif - vkd3d_shader_debug_ring_cleanup(&device->debug_ring, device); -out_cleanup_meta_ops: vkd3d_meta_ops_cleanup(&device->meta_ops, device); +out_cleanup_debug_ring: + vkd3d_shader_debug_ring_cleanup(&device->debug_ring, device); out_cleanup_sampler_state: vkd3d_sampler_state_cleanup(&device->sampler_state, device); out_cleanup_view_map: diff --git a/libs/vkd3d/heap.c b/libs/vkd3d/heap.c index f7818abbe4..fa971b26d4 100644 --- a/libs/vkd3d/heap.c +++ b/libs/vkd3d/heap.c @@ -280,6 +280,7 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap, struct d3d12_device *dev alloc_info.heap_desc = heap->desc; alloc_info.host_ptr = host_address; alloc_info.extra_allocation_flags = 0; + alloc_info.explicit_global_buffer_usage = 0; if (FAILED(hr = vkd3d_private_store_init(&heap->private_store))) return hr; diff --git a/libs/vkd3d/memory.c b/libs/vkd3d/memory.c index 22528a4108..e817b9e9ec 100644 --- a/libs/vkd3d/memory.c +++ b/libs/vkd3d/memory.c @@ -1766,6 +1766,7 @@ HRESULT vkd3d_allocate_heap_memory(struct d3d12_device *device, struct vkd3d_mem alloc_info.heap_flags = info->heap_desc.Flags; alloc_info.host_ptr = info->host_ptr; alloc_info.vk_memory_priority = info->vk_memory_priority; + alloc_info.explicit_global_buffer_usage = 
info->explicit_global_buffer_usage; alloc_info.flags |= info->extra_allocation_flags; if (!(info->heap_desc.Flags & D3D12_HEAP_FLAG_DENY_BUFFERS)) diff --git a/libs/vkd3d/meson.build b/libs/vkd3d/meson.build index 54955a89e8..8cffb0078e 100644 --- a/libs/vkd3d/meson.build +++ b/libs/vkd3d/meson.build @@ -40,7 +40,9 @@ vkd3d_shaders =[ 'shaders/fs_sampler_feedback_decode_image_mip_used.frag', 'shaders/cs_sampler_feedback_encode_buffer_min_mip.comp', 'shaders/cs_sampler_feedback_encode_image_min_mip.comp', - 'shaders/cs_sampler_feedback_encode_image_mip_used.comp' + 'shaders/cs_sampler_feedback_encode_image_mip_used.comp', + + 'shaders/cs_copy_descriptors.comp', ] vkd3d_src = [ diff --git a/libs/vkd3d/meta.c b/libs/vkd3d/meta.c index 5e948a643f..b641e33b73 100644 --- a/libs/vkd3d/meta.c +++ b/libs/vkd3d/meta.c @@ -1669,6 +1669,52 @@ static void vkd3d_sampler_feedback_ops_cleanup(struct vkd3d_sampler_feedback_res VK_CALL(vkDestroyPipeline(device->vk_device, sampler_feedback_ops->vk_pipelines[i], NULL)); } +static HRESULT vkd3d_descriptor_copy_ops_init(struct vkd3d_descriptor_copy_ops *copy_ops, + struct d3d12_device *device) +{ + struct vkd3d_shader_debug_ring_spec_info debug_ring_info; + VkPushConstantRange push_range; + VkResult vr; + bool debug; + + push_range.offset = 0; + push_range.size = sizeof(struct vkd3d_descriptor_copy_meta_args); + push_range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + if ((vr = vkd3d_meta_create_pipeline_layout(device, + 0, NULL, + 1, &push_range, + &copy_ops->vk_pipeline_layout))) + return hresult_from_vk_result(vr); + + debug = device->debug_ring.active; + + if (debug) + vkd3d_shader_debug_ring_init_spec_constant(device, &debug_ring_info, UINT64_MAX); + + /* Only called in ancillary command buffers. No need to consider descriptor buffers. */ + if ((vr = vkd3d_meta_create_compute_pipeline(device, sizeof(cs_copy_descriptors), + cs_copy_descriptors, copy_ops->vk_pipeline_layout, + debug ?
&debug_ring_info.spec_info : NULL, false, &copy_ops->vk_pipeline))) + return hresult_from_vk_result(vr); + + return S_OK; +} + +static void vkd3d_descriptor_copy_ops_cleanup(struct vkd3d_descriptor_copy_ops *copy_ops, + struct d3d12_device *device) +{ + const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; + VK_CALL(vkDestroyPipelineLayout(device->vk_device, copy_ops->vk_pipeline_layout, NULL)); + VK_CALL(vkDestroyPipeline(device->vk_device, copy_ops->vk_pipeline, NULL)); +} + +void vkd3d_meta_get_descriptor_copy_pipeline(struct vkd3d_meta_ops *meta_ops, + struct vkd3d_descriptor_copy_info *info) +{ + info->vk_pipeline = meta_ops->descriptor_copy.vk_pipeline; + info->vk_pipeline_layout = meta_ops->descriptor_copy.vk_pipeline_layout; +} + HRESULT vkd3d_meta_ops_init(struct vkd3d_meta_ops *meta_ops, struct d3d12_device *device) { HRESULT hr; @@ -1706,8 +1752,13 @@ HRESULT vkd3d_meta_ops_init(struct vkd3d_meta_ops *meta_ops, struct d3d12_device if (FAILED(hr = vkd3d_sampler_feedback_ops_init(&meta_ops->sampler_feedback, device))) goto fail_sampler_feedback; + if (FAILED(hr = vkd3d_descriptor_copy_ops_init(&meta_ops->descriptor_copy, device))) + goto fail_descriptor_copy; + return S_OK; +fail_descriptor_copy: + vkd3d_sampler_feedback_ops_cleanup(&meta_ops->sampler_feedback, device); fail_sampler_feedback: vkd3d_dstorage_ops_cleanup(&meta_ops->dstorage, device); fail_dstorage_ops: @@ -1732,6 +1783,7 @@ HRESULT vkd3d_meta_ops_init(struct vkd3d_meta_ops *meta_ops, struct d3d12_device HRESULT vkd3d_meta_ops_cleanup(struct vkd3d_meta_ops *meta_ops, struct d3d12_device *device) { + vkd3d_descriptor_copy_ops_cleanup(&meta_ops->descriptor_copy, device); vkd3d_sampler_feedback_ops_cleanup(&meta_ops->sampler_feedback, device); vkd3d_dstorage_ops_cleanup(&meta_ops->dstorage, device); vkd3d_multi_dispatch_indirect_ops_cleanup(&meta_ops->multi_dispatch_indirect, device); diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c index 0ec62cee79..e40073476a 100644 --- a/libs/vkd3d/resource.c +++ b/libs/vkd3d/resource.c @@ -3429,6 +3429,7 @@ HRESULT d3d12_resource_create_committed(struct d3d12_device *device, const D3D12 allocate_info.heap_desc.SizeInBytes = align(desc->Width, allocate_info.heap_desc.Alignment); allocate_info.heap_desc.Flags = heap_flags | D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS; allocate_info.vk_memory_priority = object->priority.residency_count ? vkd3d_convert_to_vk_prio(object->priority.d3d12priority) : 0.f; + allocate_info.explicit_global_buffer_usage = 0; /* Be very careful with suballocated buffers.
*/ if ((vkd3d_config_flags & VKD3D_CONFIG_FLAG_ZERO_MEMORY_WORKAROUNDS_COMMITTED_BUFFER_UAV) && diff --git a/libs/vkd3d/shaders/cs_copy_descriptors.comp b/libs/vkd3d/shaders/cs_copy_descriptors.comp new file mode 100644 index 0000000000..f4007e327c --- /dev/null +++ b/libs/vkd3d/shaders/cs_copy_descriptors.comp @@ -0,0 +1,96 @@ +#version 450 +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference_uvec2 : require +#extension GL_GOOGLE_include_directive : require +#include "../../../include/shader-debug/debug_channel.h" + +layout(local_size_x = 32) in; + +layout(buffer_reference_align = 4, buffer_reference, std430) readonly buffer SourceHeap +{ + uint data[]; +}; + +layout(buffer_reference_align = 4, buffer_reference, std430) writeonly buffer DestHeap +{ + uint data[]; +}; + +struct BoundHeap +{ + SourceHeap heap; + uint num_descriptors; + uint stride_words; +}; + +layout(buffer_reference_align = 64, buffer_reference, std430) readonly buffer HostMeta +{ + BoundHeap heaps[]; +}; + +struct DescriptorCopy +{ + uint src_offset; + uint dst_offset_set_index_count; +}; + +layout(buffer_reference_align = 64, buffer_reference, std430) readonly buffer CopyBuffer +{ + DescriptorCopy copies[]; +}; + +layout(push_constant, std430) uniform Registers +{ + DestHeap dst; + CopyBuffer src; + HostMeta meta; + uint num_copies; +} registers; + +void main() +{ + DEBUG_CHANNEL_INIT(gl_GlobalInvocationID); + uint index = gl_GlobalInvocationID.x; + if (index < registers.num_copies) + { + restrict DestHeap dst = registers.dst; + DescriptorCopy src_copy = registers.src.copies[index]; + uint dst_offset = bitfieldExtract(src_copy.dst_offset_set_index_count, 0, 16); + uint set_index = bitfieldExtract(src_copy.dst_offset_set_index_count, 16, 8); + uint word_count = bitfieldExtract(src_copy.dst_offset_set_index_count, 24, 8); + + DEBUG_CHANNEL_MSG(int(dst_offset), int(set_index), int(word_count)); + + BoundHeap heap_meta = registers.meta.heaps[set_index]; + bool in_bounds = src_copy.src_offset < heap_meta.num_descriptors; + + DEBUG_CHANNEL_MSG(int(src_copy.src_offset), int(heap_meta.num_descriptors)); + + if (in_bounds) + { + uint src_offset = src_copy.src_offset * heap_meta.stride_words; + // We know count is not 0. + uint i = 0; + do + { + dst.data[dst_offset] = heap_meta.heap.data[src_offset]; + i++; + dst_offset++; + src_offset++; + } while (i < word_count); + } + else + { + // memset. Technically not a valid null descriptor, but in practice, it's good enough. + // It's UB to access OOB anyways, so we should never hit this path *and* have an application actually access the descriptor. + // If we really care, we can pass down null payloads. 
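+ // As in the copy path above, word_count is known to be non-zero, so the do/while is safe.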
+ uint i = 0; + do + { + dst.data[dst_offset] = 0; + i++; + dst_offset++; + } while (i < word_count); + } + } +} diff --git a/libs/vkd3d/state.c b/libs/vkd3d/state.c index 42aa68f75b..c99db202df 100644 --- a/libs/vkd3d/state.c +++ b/libs/vkd3d/state.c @@ -1518,6 +1518,181 @@ HRESULT d3d12_root_signature_create_local_static_samplers_layout(struct d3d12_root_signature return S_OK; } +static uint32_t vkd3d_get_descriptor_size_for_type(struct d3d12_device *device, VkDescriptorType vk_descriptor_type); + +static HRESULT d3d12_root_signature_create_hoisted_descriptor_set_layout( + const struct d3d12_root_signature *root_signature, + VkShaderStageFlagBits stage, const struct vkd3d_shader_meta_hoisted_desc *descs, unsigned int num_descs, + struct d3d12_descriptor_copy_template *copy_template) +{ + struct d3d12_device *device = root_signature->device; + + VkDescriptorSetLayoutBinding bindings[VKD3D_MAX_HOISTED_DESCRIPTORS]; + const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; + struct d3d12_descriptor_copy_template_entry *entry; + struct vkd3d_shader_descriptor_binding binding; + VkDescriptorSetLayout vk_set_layout; + VkDeviceSize desc_offset; + uint32_t bindless_flags; + unsigned int i; + HRESULT hr; + + for (i = 0; i < num_descs; i++) + { + bindings[i].descriptorType = (VkDescriptorType)descs[i].vk_descriptor_type; + bindings[i].descriptorCount = 1; + bindings[i].binding = i; + bindings[i].pImmutableSamplers = NULL; + bindings[i].stageFlags = stage; + } + + if (FAILED(hr = vkd3d_create_descriptor_set_layout(root_signature->device, 0, + num_descs, bindings, VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT, + &vk_set_layout))) + return hr; + + /* Align this to simplify the algorithm that checks for exhaustion. */ + copy_template->descriptor_allocation_size = align( + copy_template->descriptor_allocation_size, + device->device_info.descriptor_buffer_properties.descriptorBufferOffsetAlignment); + copy_template->descriptor_offsets[copy_template->num_hoist_sets] = copy_template->descriptor_allocation_size; + VK_CALL(vkGetDescriptorSetLayoutSizeEXT(device->vk_device, vk_set_layout, &desc_offset)); + copy_template->descriptor_allocation_size += desc_offset; + + for (i = 0; i < num_descs; i++) + { + entry = &copy_template->entries[copy_template->num_entries++]; + entry->constant_offset = descs[i].constant_offset; + entry->table_index = descs[i].table_index; + + switch (bindings[i].descriptorType) + { + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + bindless_flags = VKD3D_BINDLESS_SET_IMAGE | VKD3D_BINDLESS_SET_SRV; + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + bindless_flags = VKD3D_BINDLESS_SET_IMAGE | VKD3D_BINDLESS_SET_UAV; + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + bindless_flags = VKD3D_BINDLESS_SET_BUFFER | VKD3D_BINDLESS_SET_SRV; + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + bindless_flags = VKD3D_BINDLESS_SET_BUFFER | VKD3D_BINDLESS_SET_UAV; + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + bindless_flags = VKD3D_BINDLESS_SET_SRV | VKD3D_BINDLESS_SET_UAV | VKD3D_BINDLESS_SET_RAW_SSBO; + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + bindless_flags = VKD3D_BINDLESS_SET_CBV; + break; + + case VK_DESCRIPTOR_TYPE_SAMPLER: + bindless_flags = VKD3D_BINDLESS_SET_SAMPLER; + break; + + default: + WARN("Unknown descriptor type %u.\n", bindings[i].descriptorType); + return E_INVALIDARG; + } + + if (vkd3d_bindless_state_find_binding(&device->bindless_state, bindless_flags, &binding)) + { + entry->set_index = binding.set; + entry->count = 
vkd3d_get_descriptor_size_for_type(root_signature->device, bindings[i].descriptorType) / sizeof(uint32_t); + } + else + return E_INVALIDARG; + + VK_CALL(vkGetDescriptorSetLayoutBindingOffsetEXT(device->vk_device, vk_set_layout, i, &desc_offset)); + entry->dst_offset_words = (copy_template->descriptor_offsets[copy_template->num_hoist_sets] + desc_offset) / sizeof(uint32_t); + } + + copy_template->vk_hoist_descriptor_set_layouts[copy_template->num_hoist_sets++] = vk_set_layout; + return S_OK; +} + +HRESULT d3d12_root_signature_create_hoisted_descriptor_layout( + const struct d3d12_root_signature *root_signature, const struct d3d12_bind_point_layout *layout, + VkShaderStageFlagBits first_stage, const struct vkd3d_shader_meta_hoisted_desc *first_set, unsigned int first_set_count, + VkShaderStageFlagBits second_stage, const struct vkd3d_shader_meta_hoisted_desc *second_set, unsigned int second_set_count, + struct d3d12_descriptor_copy_template *copy_template) +{ + VkDescriptorSetLayout set_layouts[VKD3D_MAX_DESCRIPTOR_SETS + VKD3D_MAX_HOIST_SHADER_STAGES]; + unsigned int num_sets; + uint64_t table_mask; + unsigned int i; + HRESULT hr; + + if (first_set_count == 0 && second_set_count == 0) + { + INFO("Hoisting 0 descriptors!\n"); + return S_OK; + } + + copy_template->first_hoist_set_index = layout->num_set_layouts; + + /* Have to create the gap descriptor set layout. */ + if (FAILED(hr = d3d12_root_signature_create_hoisted_descriptor_set_layout(root_signature, + first_stage, first_set, first_set_count, + copy_template))) + return hr; + + if (second_set_count && FAILED(hr = d3d12_root_signature_create_hoisted_descriptor_set_layout(root_signature, + second_stage, second_set, second_set_count, + copy_template))) + return hr; + + for (i = 0; i < layout->num_set_layouts; i++) + set_layouts[i] = root_signature->set_layouts[i]; + + num_sets = layout->num_set_layouts; + + for (i = 0; i < copy_template->num_hoist_sets; i++) + set_layouts[num_sets++] = copy_template->vk_hoist_descriptor_set_layouts[i]; + + if (FAILED(hr = vkd3d_create_pipeline_layout(root_signature->device, + num_sets, set_layouts, + layout->push_constant_range.stageFlags ? 1 : 0, &layout->push_constant_range, + &copy_template->vk_hoist_descriptor_layout))) + return hr; + + table_mask = 0; + for (i = 0; i < copy_template->num_entries; i++) + table_mask |= 1ull << copy_template->entries[i].table_index; + + /* Compute a mask for which root parameter indices we're sensitive to, + * so we don't flush hoisted buffer descriptors more than we need to. + * Use the root parameter index rather than the table index since it's slower to look up + * the table index than to check the root parameter directly. */ + for (i = 0; i < root_signature->parameter_count; i++) + { + if (root_signature->parameters[i].parameter_type == D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE && + (table_mask & (1ull << root_signature->parameters[i].descriptor_table.table_index))) + { + copy_template->hoist_root_parameter_index_mask |= 1ull << i; + } + } + + /* If the first set is empty, don't push an offset to it. + * VVL gets a bit confused when you push a buffer offset to an empty set layout. 
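+ * Instead, drop the empty first set and bind only the second one, reusing its descriptor offset as element 0. 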
*/ + if (first_set_count == 0) + { + assert(copy_template->num_hoist_sets == 2); + copy_template->num_hoist_sets -= 1; + copy_template->first_hoist_set_index += 1; + copy_template->descriptor_offsets[0] = copy_template->descriptor_offsets[1]; + } + + INFO("Hoisting %u descriptors!\n", copy_template->num_entries); + + return S_OK; +} + static HRESULT d3d12_root_signature_init(struct d3d12_root_signature *root_signature, struct d3d12_device *device, const D3D12_ROOT_SIGNATURE_DESC2 *desc) { @@ -2248,6 +2423,7 @@ void d3d12_pipeline_state_dec_ref(struct d3d12_pipeline_state *state) struct d3d12_device *device = state->device; const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; ULONG refcount = InterlockedDecrement(&state->internal_refcount); + unsigned int i; if (!refcount) { @@ -2265,6 +2441,10 @@ void d3d12_pipeline_state_dec_ref(struct d3d12_pipeline_state *state) if (state->root_signature) d3d12_root_signature_dec_ref(state->root_signature); + for (i = 0; i < ARRAY_SIZE(state->hoist_template.vk_hoist_descriptor_set_layouts); i++) + VK_CALL(vkDestroyDescriptorSetLayout(device->vk_device, state->hoist_template.vk_hoist_descriptor_set_layouts[i], NULL)); + VK_CALL(vkDestroyPipelineLayout(device->vk_device, state->hoist_template.vk_hoist_descriptor_layout, NULL)); + if (state->pipeline_type == VKD3D_PIPELINE_TYPE_GRAPHICS || state->pipeline_type == VKD3D_PIPELINE_TYPE_MESH_GRAPHICS) d3d12_pipeline_state_free_cached_desc(&state->graphics.cached_desc); rwlock_destroy(&state->lock); @@ -2435,6 +2615,8 @@ static void d3d12_pipeline_state_init_shader_interface(struct d3d12_pipeline_sta struct vkd3d_shader_interface_info *shader_interface) { const struct d3d12_root_signature *root_signature = state->root_signature; + const struct d3d12_bind_point_layout *layout; + memset(shader_interface, 0, sizeof(*shader_interface)); shader_interface->flags = d3d12_root_signature_get_shader_interface_flags(root_signature, state->pipeline_type); shader_interface->min_ssbo_alignment = d3d12_device_get_ssbo_alignment(device); @@ -2455,6 +2637,22 @@ static void d3d12_pipeline_state_init_shader_interface(struct d3d12_pipeline_sta shader_interface->descriptor_size_sampler = d3d12_device_get_descriptor_handle_increment_size( device, D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER); + if (device->bindless_state.flags & VKD3D_BINDLESS_HOIST_DESCRIPTOR_BUFFER) + { + /* Ignore tess, geom and task, which are all quite rare. + * Only bother with the common cases to keep number of sets down. */ + if (stage == VK_SHADER_STAGE_VERTEX_BIT || stage == VK_SHADER_STAGE_MESH_BIT_EXT || + stage == VK_SHADER_STAGE_FRAGMENT_BIT || stage == VK_SHADER_STAGE_COMPUTE_BIT) + { + layout = d3d12_root_signature_get_layout(root_signature, state->pipeline_type); + shader_interface->flags |= VKD3D_SHADER_INTERFACE_HOIST_DESCRIPTORS; + shader_interface->hoist_descriptor_set_index = layout->num_set_layouts; + /* Make the set only depend on the root signature. 
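+ * Vertex/mesh and compute always take set index num_set_layouts, and fragment unconditionally takes the one after it, so a shader can be compiled without knowing which other stages end up in the pipeline. 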
*/ + if (stage == VK_SHADER_STAGE_FRAGMENT_BIT) + shader_interface->hoist_descriptor_set_index++; + } + } + if (stage == VK_SHADER_STAGE_MESH_BIT_EXT) { shader_interface->stage_output_map = &state->graphics.cached_desc.stage_io_map_ms_ps; @@ -2756,6 +2954,42 @@ static void vkd3d_report_pipeline_creation_feedback_results(const VkPipelineCrea } } +static HRESULT d3d12_pipeline_state_create_hoisted_pipeline_layout(struct d3d12_pipeline_state *state) +{ + const struct vkd3d_shader_meta *metas[2] = { NULL }; + VkShaderStageFlagBits stages[2] = { 0 }; + unsigned int i; + + if (state->pipeline_type == VKD3D_PIPELINE_TYPE_COMPUTE) + { + metas[0] = &state->compute.code.meta; + stages[0] = VK_SHADER_STAGE_COMPUTE_BIT; + } + else + { + for (i = 0; i < state->graphics.stage_count; i++) + { + if (state->graphics.stages[i].stage == VK_SHADER_STAGE_VERTEX_BIT || + state->graphics.stages[i].stage == VK_SHADER_STAGE_MESH_BIT_EXT) + { + stages[0] = state->graphics.stages[i].stage; + metas[0] = &state->graphics.code[i].meta; + } + else if (state->graphics.stages[i].stage == VK_SHADER_STAGE_FRAGMENT_BIT) + { + stages[1] = state->graphics.stages[i].stage; + metas[1] = &state->graphics.code[i].meta; + } + } + } + + return d3d12_root_signature_create_hoisted_descriptor_layout(state->root_signature, + d3d12_root_signature_get_layout(state->root_signature, state->pipeline_type), + stages[0], metas[0] ? metas[0]->hoist_desc : NULL, metas[0] ? metas[0]->num_hoisted_descriptors : 0, + stages[1], metas[1] ? metas[1]->hoist_desc : NULL, metas[1] ? metas[1]->num_hoisted_descriptors : 0, + &state->hoist_template); +} + static HRESULT vkd3d_create_compute_pipeline(struct d3d12_pipeline_state *state, struct d3d12_device *device, const D3D12_SHADER_BYTECODE *code) @@ -2808,7 +3042,14 @@ static HRESULT vkd3d_create_compute_pipeline(struct d3d12_pipeline_state *state, &state->compute.identifier)); } - pipeline_info.layout = state->root_signature->compute.vk_pipeline_layout; + if (FAILED(hr = d3d12_pipeline_state_create_hoisted_pipeline_layout(state))) + return hr; + + if (state->hoist_template.vk_hoist_descriptor_layout) + pipeline_info.layout = state->hoist_template.vk_hoist_descriptor_layout; + else + pipeline_info.layout = state->root_signature->compute.vk_pipeline_layout; + pipeline_info.basePipelineHandle = VK_NULL_HANDLE; pipeline_info.basePipelineIndex = -1; @@ -4718,6 +4959,10 @@ static HRESULT d3d12_pipeline_state_init_graphics_spirv(struct d3d12_pipeline_st /* At this point, we will have valid meta structures set up. * Deduce further PSO information from these structs. */ d3d12_pipeline_state_graphics_handle_meta(state, device); + + if (FAILED(hr = d3d12_pipeline_state_create_hoisted_pipeline_layout(state))) + return hr; + return S_OK; } @@ -4773,6 +5018,10 @@ static HRESULT d3d12_pipeline_state_init_static_pipeline(struct d3d12_pipeline_s graphics->pipeline_layout = state->root_signature->graphics.vk_pipeline_layout; } + /* Override the pipeline layout if we hoist descriptor buffers. 
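+ * The hoist layout reuses the root signature's set layouts and push constant range and only appends trailing sets, so bindings made against the plain layout stay compatible. 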
*/ + if (state->hoist_template.vk_hoist_descriptor_layout) + graphics->pipeline_layout = state->hoist_template.vk_hoist_descriptor_layout; + graphics->pipeline = VK_NULL_HANDLE; graphics->library = VK_NULL_HANDLE; graphics->library_flags = 0; @@ -5992,6 +6241,43 @@ static bool vkd3d_bindless_supports_embedded_packed_metadata(struct d3d12_device vkd3d_bindless_get_mutable_descriptor_type_size(device); } +static bool vkd3d_bindless_descriptor_size_supports_descriptor_buffer_hoisting(uint32_t size) +{ + return size % 4 == 0 && size >= 4; +} + +static bool vkd3d_bindless_supports_descriptor_buffer_hoisting(struct d3d12_device *device, uint32_t flags) +{ + const VkPhysicalDeviceDescriptorBufferPropertiesEXT *props; + if (d3d12_device_uses_descriptor_buffers(device) && + (vkd3d_config_flags & VKD3D_CONFIG_FLAG_DESCRIPTOR_HOISTING) && + (flags & VKD3D_BINDLESS_RAW_SSBO) && + !(flags & VKD3D_SSBO_OFFSET_BUFFER)) + { + props = &device->device_info.descriptor_buffer_properties; + if (!vkd3d_bindless_descriptor_size_supports_descriptor_buffer_hoisting(props->samplerDescriptorSize) || + !vkd3d_bindless_descriptor_size_supports_descriptor_buffer_hoisting(props->sampledImageDescriptorSize) || + !vkd3d_bindless_descriptor_size_supports_descriptor_buffer_hoisting(props->storageImageDescriptorSize) || + !vkd3d_bindless_descriptor_size_supports_descriptor_buffer_hoisting(props->robustStorageBufferDescriptorSize) || + !vkd3d_bindless_descriptor_size_supports_descriptor_buffer_hoisting(props->robustUniformBufferDescriptorSize) || + !vkd3d_bindless_descriptor_size_supports_descriptor_buffer_hoisting(props->robustUniformTexelBufferDescriptorSize) || + !vkd3d_bindless_descriptor_size_supports_descriptor_buffer_hoisting(props->robustStorageTexelBufferDescriptorSize)) + return false; + + if (props->maxDescriptorBufferBindings < 3 || + props->maxResourceDescriptorBufferBindings < 2 || + props->maxSamplerDescriptorBufferBindings < 2 || + !props->bufferlessPushDescriptors) + return false; + + /* If we pass all these tests, we're on an implementation that does not really care if we rebind descriptor buffers, + * because we're not using true descriptor heaps. */ + return true; + } + else + return false; +} + bool vkd3d_bindless_supports_embedded_mutable_type(struct d3d12_device *device, uint32_t flags) { const VkPhysicalDeviceDescriptorBufferPropertiesEXT *props = &device->device_info.descriptor_buffer_properties; @@ -6284,6 +6570,9 @@ static uint32_t vkd3d_bindless_state_get_bindless_flags(struct d3d12_device *dev flags |= VKD3D_BINDLESS_MUTABLE_TYPE_SPLIT_RAW_TYPED; } + if (vkd3d_bindless_supports_descriptor_buffer_hoisting(device, flags)) + flags |= VKD3D_BINDLESS_HOIST_DESCRIPTOR_BUFFER; + return flags; } @@ -6574,42 +6863,6 @@ bool vkd3d_bindless_state_find_binding(const struct vkd3d_bindless_state *bindle return false; } -struct vkd3d_descriptor_binding vkd3d_bindless_state_find_set(const struct vkd3d_bindless_state *bindless_state, uint32_t flags) -{ - struct vkd3d_descriptor_binding binding; - D3D12_DESCRIPTOR_HEAP_TYPE heap_type; - unsigned int i, set_index = 0; - - heap_type = flags & VKD3D_BINDLESS_SET_SAMPLER - ? 
D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER - : D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; - - for (i = 0; i < bindless_state->set_count; i++) - { - const struct vkd3d_bindless_set_info *set_info = &bindless_state->set_info[i]; - - if (set_info->heap_type == heap_type) - { - if ((set_info->flags & flags) == flags) - { - binding.set = set_index; - binding.binding = set_info->binding_index; - - if (flags & VKD3D_BINDLESS_SET_EXTRA_MASK) - binding.binding = vkd3d_bindless_state_get_extra_binding_index(flags, set_info->flags); - return binding; - } - - set_index++; - } - } - - ERR("No set found for flags %#x.", flags); - binding.set = 0; - binding.binding = 0; - return binding; -} - uint32_t vkd3d_bindless_state_find_set_info_index(const struct vkd3d_bindless_state *bindless_state, uint32_t flags) { D3D12_DESCRIPTOR_HEAP_TYPE heap_type; diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 124cd75a35..b4191d9cc1 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -653,6 +653,7 @@ struct vkd3d_allocate_heap_memory_info void *host_ptr; uint32_t extra_allocation_flags; float vk_memory_priority; + VkBufferUsageFlags explicit_global_buffer_usage; }; struct vkd3d_allocate_resource_memory_info @@ -1663,7 +1664,6 @@ struct d3d12_bind_point_layout VkPushConstantRange push_constant_range; }; -#define VKD3D_MAX_HOISTED_DESCRIPTORS 16 struct vkd3d_descriptor_hoist_desc { uint32_t table_index; @@ -1758,6 +1758,16 @@ unsigned int d3d12_root_signature_get_shader_interface_flags(const struct d3d12_ enum vkd3d_pipeline_type pipeline_type); HRESULT d3d12_root_signature_create_local_static_samplers_layout(struct d3d12_root_signature *root_signature, VkDescriptorSetLayout vk_set_layout, VkPipelineLayout *vk_pipeline_layout); + +struct d3d12_descriptor_copy_template; +HRESULT d3d12_root_signature_create_hoisted_descriptor_layout( + const struct d3d12_root_signature *root_signature, const struct d3d12_bind_point_layout *layout, + VkShaderStageFlagBits first_stage, const struct vkd3d_shader_meta_hoisted_desc *first_set, + unsigned int first_set_count, + VkShaderStageFlagBits second_stage, const struct vkd3d_shader_meta_hoisted_desc *second_set, + unsigned int second_set_count, + struct d3d12_descriptor_copy_template *copy_template); + HRESULT vkd3d_create_pipeline_layout(struct d3d12_device *device, unsigned int set_layout_count, const VkDescriptorSetLayout *set_layouts, unsigned int push_constant_count, const VkPushConstantRange *push_constants, @@ -2023,6 +2033,43 @@ struct vkd3d_pipeline_cache_compatibility uint64_t dxbc_blob_hashes[VKD3D_MAX_SHADER_STAGES]; }; +struct d3d12_descriptor_copy_template_entry +{ + uint16_t dst_offset_words; + uint8_t constant_offset; + uint8_t table_index; + uint8_t set_index; + uint8_t count; +}; + +/* Common case is VS + PS, MS + PS, CS. + * Ignore hoisting outside these (tess/geom/task) to keep memory usage and the number of sets low. */ +#define VKD3D_MAX_HOIST_SHADER_STAGES 2 + +struct d3d12_descriptor_copy_template +{ + struct d3d12_descriptor_copy_template_entry entries[VKD3D_MAX_HOIST_SHADER_STAGES * VKD3D_MAX_HOISTED_DESCRIPTORS]; + unsigned int num_entries; + + /* For descriptor hoisting. Each pipeline stage gets its own set. */ + VkDescriptorSetLayout vk_hoist_descriptor_set_layouts[VKD3D_MAX_HOIST_SHADER_STAGES]; + VkPipelineLayout vk_hoist_descriptor_layout; + + /* At draw time, if table offsets are out of date: + * - Allocate descriptor_allocation_size bytes from d3d12_command_list_descriptor_copy_batch::descriptor_buffer. 
+ * - Allocate num_entries copy records from d3d12_command_list_descriptor_copy_batch::host_buffer. + * - vkCmdSetDescriptorBufferOffsetsEXT(offsets = descriptor_offsets + alloc offset, + * first_hoist_set_index, num_hoist_sets). + * - Resolve constant_offset / table_index from command list state into src_offset. + * - Resolve dst_offset_words + alloc offset into dst_offset. + * - Copy set_index / count. */ + VkDeviceSize descriptor_offsets[VKD3D_MAX_HOIST_SHADER_STAGES]; + VkDeviceSize descriptor_allocation_size; + unsigned int num_hoist_sets; + unsigned int first_hoist_set_index; + uint64_t hoist_root_parameter_index_mask; +}; + /* ID3D12PipelineState */ struct d3d12_pipeline_state { @@ -2047,6 +2094,8 @@ struct d3d12_pipeline_state bool pso_is_loaded_from_cached_blob; bool pso_is_fully_dynamic; + struct d3d12_descriptor_copy_template hoist_template; + struct vkd3d_private_store private_store; }; @@ -2313,6 +2362,14 @@ struct vkd3d_scratch_buffer VkDeviceSize offset; }; +struct vkd3d_scratch_allocation +{ + VkBuffer buffer; + VkDeviceSize offset; + VkDeviceAddress va; + void *host_ptr; +}; + #define VKD3D_QUERY_TYPE_INDEX_OCCLUSION (0u) #define VKD3D_QUERY_TYPE_INDEX_PIPELINE_STATISTICS (1u) #define VKD3D_QUERY_TYPE_INDEX_TRANSFORM_FEEDBACK (2u) @@ -2343,6 +2400,7 @@ enum vkd3d_scratch_pool_kind VKD3D_SCRATCH_POOL_KIND_DEVICE_STORAGE = 0, VKD3D_SCRATCH_POOL_KIND_INDIRECT_PREPROCESS, VKD3D_SCRATCH_POOL_KIND_UNIFORM_UPLOAD, + VKD3D_SCRATCH_POOL_KIND_DESCRIPTOR_BUFFER, VKD3D_SCRATCH_POOL_KIND_COUNT }; @@ -2399,9 +2457,10 @@ bool d3d12_command_allocator_allocate_query_from_type_index( enum vkd3d_pipeline_dirty_flag { - VKD3D_PIPELINE_DIRTY_STATIC_SAMPLER_SET = 0x00000001u, - VKD3D_PIPELINE_DIRTY_DESCRIPTOR_TABLE_OFFSETS = 0x00000002u, - VKD3D_PIPELINE_DIRTY_HOISTED_DESCRIPTORS = 0x00000004u, + VKD3D_PIPELINE_DIRTY_STATIC_SAMPLER_SET = 0x00000001u, + VKD3D_PIPELINE_DIRTY_DESCRIPTOR_TABLE_OFFSETS = 0x00000002u, + VKD3D_PIPELINE_DIRTY_HOISTED_PUSH_DESCRIPTORS = 0x00000004u, + VKD3D_PIPELINE_DIRTY_HOISTED_BUFFER_DESCRIPTORS = 0x00000008u, }; struct vkd3d_root_descriptor_info @@ -2485,6 +2544,15 @@ enum vkd3d_initial_transition_type { VKD3D_INITIAL_TRANSITION_TYPE_RESOURCE, VKD3D_INITIAL_TRANSITION_TYPE_QUERY_HEAP, + VKD3D_INITIAL_TRANSITION_DESCRIPTOR_COPY_BATCH, +}; + +struct vkd3d_descriptor_copy_meta_args +{ + VkDeviceAddress descriptor_buffer_va; + VkDeviceAddress host_buffer_va; + VkDeviceAddress host_meta_va; + uint32_t num_copies; }; struct vkd3d_initial_transition @@ -2498,6 +2566,7 @@ struct vkd3d_initial_transition bool perform_initial_transition; } resource; struct d3d12_query_heap *query_heap; + struct vkd3d_descriptor_copy_meta_args descriptor_copy_batch; }; }; @@ -2673,12 +2742,15 @@ union vkd3d_descriptor_heap_state { struct { - VkDeviceAddress heap_va_resource; - VkDeviceAddress heap_va_sampler; + VkDeviceAddress heap_va[2]; + void *mapped[2]; + uint32_t vk_descriptor_count_for_buffer_index[2]; VkBuffer vk_buffer_resource; bool heap_dirty; - VkDeviceSize vk_offsets[VKD3D_MAX_BINDLESS_DESCRIPTOR_SETS]; + VkDeviceSize vk_bind_offsets[VKD3D_MAX_BINDLESS_DESCRIPTOR_SETS]; + uint32_t vk_payload_offsets[VKD3D_MAX_BINDLESS_DESCRIPTOR_SETS]; + uint32_t vk_descriptor_stride_words[VKD3D_MAX_BINDLESS_DESCRIPTOR_SETS]; } buffers; struct @@ -2728,6 +2800,45 @@ struct d3d12_command_list_sequence struct d3d12_command_list_iteration_indirect_meta *indirect_meta; }; +#define VKD3D_DESCRIPTOR_COPY_BATCH_DESCRIPTOR_BUFFER_SIZE (64 * 1024) +#define VKD3D_DESCRIPTOR_COPY_BATCH_NUM_COPIES ((64 * 1024) / 
sizeof(struct d3d12_command_list_descriptor_copy_desc)) + +/* Represents a copy where we do: + * for i in range(count): + * store_u32(descriptor_buffer + (dst_offset + i) * sizeof(uint32_t), + * load_u32(base_va[set_index] + (src_offset + i) * sizeof(uint32_t))); + */ +struct d3d12_command_list_descriptor_copy_desc +{ + uint32_t src_offset; + uint16_t dst_offset; + uint8_t set_index; + uint8_t count; +}; + +struct d3d12_command_list_descriptor_copy_heap +{ + VkDeviceAddress base_va; + uint32_t num_descriptors; + uint32_t stride_words; +}; + +/* A batch is started when the application sets descriptor heaps. */ +struct d3d12_command_list_descriptor_copy_batch +{ + /* Holds VKD3D_DESCRIPTOR_COPY_BATCH_DESCRIPTOR_BUFFER_SIZE bytes worth of descriptors. */ + struct vkd3d_scratch_allocation descriptor_buffer; + /* Holds VKD3D_DESCRIPTOR_COPY_BATCH_NUM_COPIES worth of d3d12_command_list_descriptor_copy_desc. */ + struct vkd3d_scratch_allocation host_buffer; + + /* Holds VKD3D_MAX_BINDLESS_DESCRIPTOR_SETS d3d12_command_list_descriptor_copy_heap members. + * We have to dynamically index, so we cannot use push constants. */ + struct vkd3d_scratch_allocation host_meta_buffer; + + unsigned int descriptor_buffer_offset; + unsigned int num_copies; +}; + struct d3d12_command_list { d3d12_command_list_iface ID3D12GraphicsCommandList_iface; @@ -2753,6 +2864,7 @@ struct d3d12_command_list } index_buffer; struct d3d12_command_list_sequence cmd; + struct d3d12_command_list_descriptor_copy_batch descriptor_copy_batch; struct d3d12_rtv_desc rtvs[D3D12_SIMULTANEOUS_RENDER_TARGET_COUNT]; struct d3d12_rtv_desc dsv; @@ -3564,6 +3676,7 @@ enum vkd3d_bindless_flags VKD3D_BINDLESS_MUTABLE_EMBEDDED_PACKED_METADATA = (1u << 10), VKD3D_FORCE_COMPUTE_ROOT_PARAMETERS_PUSH_UBO = (1u << 11), VKD3D_BINDLESS_MUTABLE_TYPE_SPLIT_RAW_TYPED = (1u << 12), + VKD3D_BINDLESS_HOIST_DESCRIPTOR_BUFFER = (1u << 13), }; #define VKD3D_BINDLESS_SET_MAX_EXTRA_BINDINGS 8 @@ -3644,7 +3757,6 @@ void vkd3d_bindless_state_cleanup(struct vkd3d_bindless_state *bindless_state, struct d3d12_device *device); bool vkd3d_bindless_state_find_binding(const struct vkd3d_bindless_state *bindless_state, uint32_t flags, struct vkd3d_shader_descriptor_binding *binding); -struct vkd3d_descriptor_binding vkd3d_bindless_state_find_set(const struct vkd3d_bindless_state *bindless_state, uint32_t flags); uint32_t vkd3d_bindless_state_find_set_info_index(const struct vkd3d_bindless_state *bindless_state, uint32_t flags); @@ -4052,6 +4164,18 @@ struct vkd3d_sampler_feedback_resolve_ops VkPipeline vk_pipelines[VKD3D_SAMPLER_FEEDBACK_RESOLVE_COUNT]; }; +struct vkd3d_descriptor_copy_info +{ + VkPipeline vk_pipeline; + VkPipelineLayout vk_pipeline_layout; +}; + +struct vkd3d_descriptor_copy_ops +{ + VkPipelineLayout vk_pipeline_layout; + VkPipeline vk_pipeline; +}; + struct vkd3d_meta_ops { struct d3d12_device *device; @@ -4065,6 +4189,7 @@ struct vkd3d_meta_ops struct vkd3d_multi_dispatch_indirect_ops multi_dispatch_indirect; struct vkd3d_dstorage_ops dstorage; struct vkd3d_sampler_feedback_resolve_ops sampler_feedback; + struct vkd3d_descriptor_copy_ops descriptor_copy; }; HRESULT vkd3d_meta_ops_init(struct vkd3d_meta_ops *meta_ops, struct d3d12_device *device); @@ -4119,6 +4244,14 @@ static inline VkExtent3D vkd3d_meta_get_sampler_feedback_workgroup_size(void) return result; } +static inline uint32_t vkd3d_meta_get_descriptor_copy_workgroup_size(void) +{ + return 32; +} + +void vkd3d_meta_get_descriptor_copy_pipeline(struct vkd3d_meta_ops *meta_ops, + struct vkd3d_descriptor_copy_info *info); + enum 
vkd3d_time_domain_flag { VKD3D_TIME_DOMAIN_DEVICE = 0x00000001u, diff --git a/libs/vkd3d/vkd3d_shaders.h b/libs/vkd3d/vkd3d_shaders.h index 4c99b1e7fd..2ec1919e79 100644 --- a/libs/vkd3d/vkd3d_shaders.h +++ b/libs/vkd3d/vkd3d_shaders.h @@ -66,5 +66,6 @@ enum vkd3d_meta_copy_mode #include #include #include +#include <cs_copy_descriptors.h> #endif /* __VKD3D_SPV_SHADERS_H */