From e5ee75396f620f9facf910fb08d51b7634738b33 Mon Sep 17 00:00:00 2001 From: Zalman Stern Date: Thu, 2 Nov 2023 17:27:03 -0700 Subject: [PATCH 01/15] Remove use of dynamic_cast. (#7931) Remove use of dynamic_cast to preserve compiling the Halide compiler without RTTI. --- src/StmtToHTML.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/StmtToHTML.cpp b/src/StmtToHTML.cpp index 7539da2a1064..7c8c9f9c03c7 100644 --- a/src/StmtToHTML.cpp +++ b/src/StmtToHTML.cpp @@ -1306,7 +1306,8 @@ class HTMLCodePrinter : public IRVisitor { int max_block_cost = cost_model.get_max_compute_cost(true); int line_cost = cost_model.get_compute_cost(op, false); int block_cost = cost_model.get_compute_cost(op, true); - if (dynamic_cast(op) || dynamic_cast(op)) { + if ((op != nullptr) && + ((op->node_type == IRNodeType::LetStmt) || op->node_type == IRNodeType::Allocate)) { block_cost = line_cost; } std::string _id = "cc-" + std::to_string(id); @@ -1319,7 +1320,8 @@ class HTMLCodePrinter : public IRVisitor { int max_block_cost = cost_model.get_max_data_movement_cost(true); int line_cost = cost_model.get_data_movement_cost(op, false); int block_cost = cost_model.get_data_movement_cost(op, true); - if (dynamic_cast(op) || dynamic_cast(op)) { + if ((op != nullptr) && + ((op->node_type == IRNodeType::LetStmt) || op->node_type == IRNodeType::Allocate)) { block_cost = line_cost; } std::string _id = "dc-" + std::to_string(id); From e5bf7ab06fd861ecb759c4b7e0bf900fb88e39f2 Mon Sep 17 00:00:00 2001 From: Xuanda Yang Date: Tue, 7 Nov 2023 07:36:56 +0800 Subject: [PATCH 02/15] Add special build for testing serialization via a serialization roundtrip in JIT compilation and fix serialization leaks (#7763) * add back JIT testing, enclosed in #ifdef blocks * fix typo * nits * WITH_SERIALIZATION_JIT->WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING * fix self-reference leaks: now uses weak function ptr in reverse function mappings * Move clang-tidy checks back to Linux Recent changes in 
the GHA runners for macOS don't play well with clang-tidy; rather than sink any more time into debugging it, I'm going to revert the relevant parts of #7746 so that it runs on the less-finicky Linux runners instead. * bogus * Update Generator.cpp * Update Generator.cpp * call copy_to_host before serializing buffers * throw an error if we serialize on-device buffer * Skip specialize_to_gpu * Update Pipeline.cpp * Skip two more tests * use serialize to memory during jit testing * makefile update * makefile fix * skip the tutorial if flatc is not there * fix * fix signature * fix makefile * trigger buildbot --------- Co-authored-by: Steven Johnson --- Makefile | 6 +++++ cmake/HalideTestHelpers.cmake | 7 ++++++ src/CMakeLists.txt | 9 ++++++++ src/Deserialization.cpp | 7 +++++- src/Pipeline.cpp | 23 +++++++++++++++++++ src/Serialization.cpp | 10 +++++--- ..._give_input_buffers_device_allocations.cpp | 5 ++++ test/correctness/leak_device_memory.cpp | 4 ++++ test/correctness/specialize_to_gpu.cpp | 5 ++++ 9 files changed, 72 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 200742525cc5..67a0818b176a 100644 --- a/Makefile +++ b/Makefile @@ -2083,6 +2083,12 @@ tutorial_%: $(BIN_DIR)/tutorial_% $(TMP_DIR)/images/rgb.png $(TMP_DIR)/images/gr cd $(TMP_DIR) ; $(CURDIR)/$< @-echo +# Skip the serialization tutorial, if we didn't build -DWITH_SERIALIZATION +ifeq (,$(shell which flatc)) +tutorial_lesson_23_serialization: + @echo "Skipping tutorial lesson 23 (serialization not enabled) ..." 
+endif + test_mullapudi2016: $(MULLAPUDI2016_TESTS:$(ROOT_DIR)/test/autoschedulers/mullapudi2016/%.cpp=mullapudi2016_%) mullapudi2016_%: $(BIN_DIR)/mullapudi2016_% $(BIN_MULLAPUDI2016) diff --git a/cmake/HalideTestHelpers.cmake b/cmake/HalideTestHelpers.cmake index c23aba75fea6..e938d11d53ec 100644 --- a/cmake/HalideTestHelpers.cmake +++ b/cmake/HalideTestHelpers.cmake @@ -77,6 +77,13 @@ function(add_halide_test TARGET) CXX_VISIBILITY_PRESET hidden VISIBILITY_INLINES_HIDDEN TRUE) + + if (WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING) + if (WITH_SERIALIZATION) + target_compile_definitions(${TARGET} PRIVATE WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING) + endif () + endif () + # Add a meta-target for each group, to allow us to build by group easily foreach (GROUP IN LISTS args_GROUPS) set(META_TARGET build_${GROUP}) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d5d6a8a3832e..74e44de3c163 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -511,6 +511,15 @@ if (WITH_SERIALIZATION) target_compile_definitions(Halide PRIVATE WITH_SERIALIZATION) endif () +# Enable serialization testing by intercepting JIT compilation with a serialization roundtrip; +# This is used only for special builds made specifically for testing, and must be disabled by default. 
+option(WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING "Intercepting JIT compilation with a serialization roundtrip, for test only" OFF) +if (WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING) + if (WITH_SERIALIZATION) + target_compile_definitions(Halide PRIVATE WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING) + endif () +endif () + add_library(Halide::Halide ALIAS Halide) target_link_libraries(Halide PRIVATE Halide::LLVM) diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index 5d3979fc7f52..eda8ad93338b 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -1321,7 +1321,12 @@ void Deserializer::build_reverse_function_mappings(const std::vector & } int count = 0; for (const auto &f : functions) { - this->reverse_function_mappings[count++] = f.get_contents(); + // The reverse function mappings are used in places where only weak references are needed. + FunctionPtr ptr; + ptr.strong = nullptr; + ptr.weak = f.get_contents().group(); + ptr.idx = f.get_contents().idx; + this->reverse_function_mappings[count++] = ptr; } } diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp index 631033404137..c605d2038248 100644 --- a/src/Pipeline.cpp +++ b/src/Pipeline.cpp @@ -581,6 +581,24 @@ void Pipeline::compile_jit(const Target &target_arg) { // Clear all cached info in case there is an error. contents->invalidate_cache(); +#ifdef WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING + std::map external_params; + std::vector data; + serialize_pipeline(*this, data, external_params); + Pipeline deserialized_pipe = deserialize_pipeline(data, external_params); + std::vector outputs; + for (const Func &f : deserialized_pipe.outputs()) { + outputs.push_back(f.function()); + } + // We save the original output functions and requirements, + // and restore them once all lowering is done, + // so that reschedule/reorder storage can be properly handled. 
+ std::vector origin_outputs = contents->outputs; + std::vector origin_requirements = contents->requirements; + contents->outputs = outputs; + contents->requirements = deserialized_pipe.requirements(); +#endif + // Infer an arguments vector infer_arguments(); @@ -596,6 +614,11 @@ void Pipeline::compile_jit(const Target &target_arg) { Module module = compile_to_module(args, generate_function_name(), target).resolve_submodules(); std::map lowered_externs = contents->jit_externs; contents->jit_cache = compile_jit_cache(module, std::move(args), contents->outputs, contents->jit_externs, target); +#ifdef WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING + // Restore the original outputs and requirements. + contents->outputs = origin_outputs; + contents->requirements = origin_requirements; +#endif } Callable Pipeline::compile_to_callable(const std::vector &args_in, const Target &target_arg) { diff --git a/src/Serialization.cpp b/src/Serialization.cpp index f38f79c5464a..038d5a1323e0 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -129,7 +129,7 @@ class Serializer { Offset serialize_extern_func_argument(FlatBufferBuilder &builder, const ExternFuncArgument &extern_func_argument); - Offset serialize_buffer(FlatBufferBuilder &builder, const Buffer<> &buffer); + Offset serialize_buffer(FlatBufferBuilder &builder, Buffer<> buffer); std::vector> serialize_wrapper_refs(FlatBufferBuilder &builder, const std::map &wrappers); @@ -1380,10 +1380,14 @@ Offset Serializer::serialize_extern_func_argument } } -Offset Serializer::serialize_buffer(FlatBufferBuilder &builder, const Buffer<> &buffer) { +Offset Serializer::serialize_buffer(FlatBufferBuilder &builder, Buffer<> buffer) { if (!buffer.defined()) { return Serialize::CreateBuffer(builder, false); } + if (buffer.device_dirty()) { + user_error << "Cannot serialize on-device buffer: " << buffer.name() << "\n"; + } + buffer.copy_to_host(); const auto name_serialized = serialize_string(builder, buffer.name()); const auto 
type_serialized = serialize_type(builder, buffer.type()); const int32_t dimensions = buffer.dimensions(); @@ -1475,7 +1479,7 @@ void Serializer::serialize(const Pipeline &pipeline, std::vector &resul std::vector> buffers_serialized; buffers_serialized.reserve(buffers_in_pipeline.size()); - for (const auto &buffer : buffers_in_pipeline) { + for (auto &buffer : buffers_in_pipeline) { buffers_serialized.push_back(serialize_buffer(builder, buffer.second)); } diff --git a/test/correctness/gpu_give_input_buffers_device_allocations.cpp b/test/correctness/gpu_give_input_buffers_device_allocations.cpp index a2f4d9618f63..666ce86d9b3f 100644 --- a/test/correctness/gpu_give_input_buffers_device_allocations.cpp +++ b/test/correctness/gpu_give_input_buffers_device_allocations.cpp @@ -4,6 +4,11 @@ using namespace Halide; int main(int argc, char **argv) { +#ifdef WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING + printf("[SKIP] Serialization won't preserve GPU buffers, skipping.\n"); + return 0; +#endif + Target t(get_jit_target_from_environment()); if (!t.has_gpu_feature()) { printf("[SKIP] No GPU target enabled.\n"); diff --git a/test/correctness/leak_device_memory.cpp b/test/correctness/leak_device_memory.cpp index 086bb1cd5810..567aeddb5fd8 100644 --- a/test/correctness/leak_device_memory.cpp +++ b/test/correctness/leak_device_memory.cpp @@ -14,6 +14,10 @@ void halide_print(JITUserContext *user_context, const char *str) { } int main(int argc, char **argv) { +#ifdef WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING + printf("[SKIP] Serialization won't preserve GPU buffers, skipping.\n"); + return 0; +#endif Target target = get_jit_target_from_environment(); diff --git a/test/correctness/specialize_to_gpu.cpp b/test/correctness/specialize_to_gpu.cpp index 0890e2ad6eae..8e9644114c6f 100644 --- a/test/correctness/specialize_to_gpu.cpp +++ b/test/correctness/specialize_to_gpu.cpp @@ -4,6 +4,11 @@ using namespace Halide; int main(int argc, char **argv) { +#ifdef 
WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING + printf("[SKIP] Serialization won't preserve GPU buffers, skipping.\n"); + return 0; +#endif + if (!get_jit_target_from_environment().has_gpu_feature()) { printf("[SKIP] No GPU target enabled.\n"); return 0; From 256c2f237683b939c6a45d2d5f48569d0d6c6135 Mon Sep 17 00:00:00 2001 From: Xuanda Yang Date: Wed, 8 Nov 2023 01:57:21 +0800 Subject: [PATCH 03/15] Add missing serialization of Dim::partition_policy (#7935) add missing serialization of Dim::partition_policy --- src/Deserialization.cpp | 2 ++ src/Serialization.cpp | 3 ++- src/halide_ir.fbs | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index eda8ad93338b..bb19cf82c9aa 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -1110,11 +1110,13 @@ Dim Deserializer::deserialize_dim(const Serialize::Dim *dim) { const auto for_type = deserialize_for_type(dim->for_type()); const auto device_api = deserialize_device_api(dim->device_api()); const auto dim_type = deserialize_dim_type(dim->dim_type()); + const auto partition_policy = deserialize_partition(dim->partition_policy()); auto hl_dim = Dim(); hl_dim.var = var; hl_dim.for_type = for_type; hl_dim.device_api = device_api; hl_dim.dim_type = dim_type; + hl_dim.partition_policy = partition_policy; return hl_dim; } diff --git a/src/Serialization.cpp b/src/Serialization.cpp index 038d5a1323e0..c85eaa15e1aa 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -1227,7 +1227,8 @@ Offset Serializer::serialize_dim(FlatBufferBuilder &builder, con const auto for_type_serialized = serialize_for_type(dim.for_type); const auto device_api_serialized = serialize_device_api(dim.device_api); const auto dim_type_serialized = serialize_dim_type(dim.dim_type); - return Serialize::CreateDim(builder, var_serialized, for_type_serialized, device_api_serialized, dim_type_serialized); + const auto partition_policy_serialized = 
serialize_partition(dim.partition_policy); + return Serialize::CreateDim(builder, var_serialized, for_type_serialized, device_api_serialized, dim_type_serialized, partition_policy_serialized); } Offset Serializer::serialize_fuse_loop_level(FlatBufferBuilder &builder, const FuseLoopLevel &fuse_loop_level) { diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index f0ad94d8bdf6..f3d27e83a62a 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -558,6 +558,7 @@ table Dim { for_type: ForType; device_api: DeviceAPI; dim_type: DimType; + partition_policy: Partition; } enum LoopAlignStrategy: ubyte { From 3b4dc332a5fb5d80c6a5702641fa781a0e030171 Mon Sep 17 00:00:00 2001 From: Zalman Stern Date: Tue, 7 Nov 2023 13:23:31 -0800 Subject: [PATCH 04/15] Make sure all Halide arithmetic scalar types can be named from the Generator interface. (#7934) * Make sure all Halide arithmetic scalar types can be named from the Generator interface. Specifically adding 64-bit signed and unsigned integers and making sure float16 and bfloat16 are fully supported and documented. Add a simple test for all the type names. (Don't use float16 and bfloat16 in the arithmetic as they do not compile with the C++ backend. The name mapping should still be tested but the types passed do not seem to be checked as the values are not used.) 
--- Makefile | 4 +- src/Generator.cpp | 5 +- src/Generator.h | 4 ++ test/generator/CMakeLists.txt | 5 ++ test/generator/all_type_names_aottest.cpp | 58 +++++++++++++++++++++ test/generator/all_type_names_generator.cpp | 53 +++++++++++++++++++ 6 files changed, 126 insertions(+), 3 deletions(-) create mode 100644 test/generator/all_type_names_aottest.cpp create mode 100644 test/generator/all_type_names_generator.cpp diff --git a/Makefile b/Makefile index 67a0818b176a..5c5a3593ee49 100644 --- a/Makefile +++ b/Makefile @@ -2473,7 +2473,7 @@ $(DISTRIB_DIR)/bin/featurization_to_sample $(DISTRIB_DIR)/bin/get_host_target: $ @mkdir -p $(@D) $(MAKE) -f $(SRC_DIR)/autoschedulers/common/Makefile $(BIN_DIR)/featurization_to_sample $(BIN_DIR)/get_host_target HALIDE_DISTRIB_PATH=$(CURDIR)/$(DISTRIB_DIR) for TOOL in featurization_to_sample get_host_target; do \ - cp $(BIN_DIR)/$${TOOL} $(DISTRIB_DIR)/bin/; \ + cp $(BIN_DIR)/$${TOOL} $(DISTRIB_DIR)/bin/; \ done # Adams2019 also includes autotuning tools @@ -2482,7 +2482,7 @@ $(DISTRIB_DIR)/lib/libautoschedule_adams2019.$(PLUGIN_EXT): $(BIN_DIR)/libautosc $(MAKE) -f $(SRC_DIR)/autoschedulers/adams2019/Makefile $(BIN_DIR)/adams2019_retrain_cost_model $(BIN_DIR)/adams2019_weightsdir_to_weightsfile HALIDE_DISTRIB_PATH=$(CURDIR)/$(DISTRIB_DIR) cp $< $(DISTRIB_DIR)/lib/ for TOOL in adams2019_retrain_cost_model adams2019_weightsdir_to_weightsfile; do \ - cp $(BIN_DIR)/$${TOOL} $(DISTRIB_DIR)/bin/; \ + cp $(BIN_DIR)/$${TOOL} $(DISTRIB_DIR)/bin/; \ done cp $(SRC_DIR)/autoschedulers/adams2019/adams2019_autotune_loop.sh $(DISTRIB_DIR)/tools/ ifeq ($(UNAME), Darwin) diff --git a/src/Generator.cpp b/src/Generator.cpp index 5228a9c2d918..8b633b777dd0 100644 --- a/src/Generator.cpp +++ b/src/Generator.cpp @@ -590,12 +590,15 @@ const std::map &get_halide_type_enum_map() { {"int8", Int(8)}, {"int16", Int(16)}, {"int32", Int(32)}, + {"int64", Int(64)}, {"uint8", UInt(8)}, {"uint16", UInt(16)}, {"uint32", UInt(32)}, + {"uint64", UInt(64)}, 
{"float16", Float(16)}, {"float32", Float(32)}, - {"float64", Float(64)}}; + {"float64", Float(64)}, + {"bfloat16", BFloat(16)}}; return halide_type_enum_map; } diff --git a/src/Generator.h b/src/Generator.h index 9bc335b52ed7..78357d59a156 100644 --- a/src/Generator.h +++ b/src/Generator.h @@ -973,11 +973,15 @@ using GeneratorParamImplBase = * "int8" Halide::Int(8) * "int16" Halide::Int(16) * "int32" Halide::Int(32) + * "int64" Halide::Int(64) * "uint8" Halide::UInt(8) * "uint16" Halide::UInt(16) * "uint32" Halide::UInt(32) + * "uint64" Halide::UInt(64) + * "float16" Halide::Float(16) * "float32" Halide::Float(32) * "float64" Halide::Float(64) + * "bfloat16" Halide::BFloat(16) * * No vector Types are currently supported by this mapping. * diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt index 5b549eefd294..fc1cbfc76e78 100644 --- a/test/generator/CMakeLists.txt +++ b/test/generator/CMakeLists.txt @@ -248,6 +248,11 @@ endforeach () _add_halide_aot_tests(alias HALIDE_LIBRARIES alias ${EXTRA_ALIAS_LIBS}) +# all_type_names_aottest.cpp +# all_type_names_generator.cpp +_add_halide_libraries(all_type_names) +_add_halide_aot_tests(all_type_names) + # argvcall_aottest.cpp # argvcall_generator.cpp _add_halide_libraries(argvcall) diff --git a/test/generator/all_type_names_aottest.cpp b/test/generator/all_type_names_aottest.cpp new file mode 100644 index 000000000000..2c54dad1ebbf --- /dev/null +++ b/test/generator/all_type_names_aottest.cpp @@ -0,0 +1,58 @@ +#include "HalideBuffer.h" +#include "HalideRuntime.h" + +#include +#include + +#include "all_type_names.h" + +using namespace Halide::Runtime; + +const int kSize = 32; + +int main(int argc, char **argv) { + int32_t result; + + Buffer input_i8(kSize); + Buffer input_i16(kSize); + Buffer input_i32(kSize); + Buffer input_i64(kSize); + Buffer input_u8(kSize); + Buffer input_u16(kSize); + Buffer input_u32(kSize); + Buffer input_u64(kSize); + Buffer input_f16(kSize); + Buffer input_f32(kSize); + 
Buffer input_f64(kSize); + Buffer input_bf16(kSize); + Buffer output(kSize); + + input_i8.fill(1); + input_i16.fill(1); + input_i32.fill(1); + input_i64.fill(1); + input_u8.fill(1); + input_u16.fill(1); + input_u32.fill(1); + input_u64.fill(1); + // Start with a u16 Buffer so it can be initialized then convert to float16. + input_f16.fill(0x3C00); + input_f16.raw_buffer()->type.code = halide_type_float; + input_f32.fill(1.0f); + input_f64.fill(1.0); + // Start with a u16 Buffer so it can be initialized then convert to bfloat16. + input_bf16.fill(0x3F80); + input_bf16.raw_buffer()->type.code = halide_type_bfloat; + + result = all_type_names(input_i8, input_i16, input_i32, input_i64, + input_u8, input_u16, input_u32, input_u64, + input_f16, input_f32, input_f64, input_bf16, + output); + assert(result == 0); + output.for_each_element([=](int x) { + assert(output(x) == 10.0); + }); + + printf("Success!\n"); + return 0; +} diff --git a/test/generator/all_type_names_generator.cpp b/test/generator/all_type_names_generator.cpp new file mode 100644 index 000000000000..bd15c034f18e --- /dev/null +++ b/test/generator/all_type_names_generator.cpp @@ -0,0 +1,53 @@ +#include "Halide.h" + +namespace { + +class AllTypeNamesGeneric : public Halide::Generator { +public: + Input input_i8{"input_i8", 1}; + Input input_i16{"input_i16", 1}; + Input input_i32{"input_i32", 1}; + Input input_i64{"input_i64", 1}; + Input input_u8{"input_u8", 1}; + Input input_u16{"input_u16", 1}; + Input input_u32{"input_u32", 1}; + Input input_u64{"input_u64", 1}; + Input input_f16{"input_f16", 1}; + Input input_f32{"input_f32", 1}; + Input input_f64{"input_f64", 1}; + Input input_bf16{"input_bf16", 1}; + Output output{"output", 1}; + + void generate() { + Var x; + + // Don't use float16 and bfloat16 arguments as they do not compile with C++ code generation. 
+ output(x) = cast(input_i8(x) + input_i16(x) + input_i32(x) + input_i64(x)) + + cast(input_u8(x) + input_u16(x) + input_u32(x) + input_u64(x)) + + input_f32(x) + input_f64(x); + + // set estimates for the autoschedulers + input_i8.set_estimates({{0, 32}}); + input_i16.set_estimates({{0, 32}}); + input_i32.set_estimates({{0, 32}}); + input_i64.set_estimates({{0, 32}}); + input_u8.set_estimates({{0, 32}}); + input_u16.set_estimates({{0, 32}}); + input_u32.set_estimates({{0, 32}}); + input_u64.set_estimates({{0, 32}}); + input_f16.set_estimates({{0, 32}}); + input_f32.set_estimates({{0, 32}}); + input_f64.set_estimates({{0, 32}}); + input_bf16.set_estimates({{0, 32}}); + output.set_estimates({{0, 32}}); + + if (!using_autoscheduler()) { + output.vectorize(x, natural_vector_size()).compute_root(); + } + } +}; + +} // namespace + +HALIDE_REGISTER_GENERATOR(AllTypeNamesGeneric, all_type_names_generic) +HALIDE_REGISTER_GENERATOR_ALIAS(all_type_names, all_type_names_generic, {{"input_i8.type", "int8"}, {"input_i16.type", "int16"}, {"input_i32.type", "int32"}, {"input_i64.type", "int64"}, {"input_u8.type", "uint8"}, {"input_u16.type", "uint16"}, {"input_u32.type", "uint32"}, {"input_u64.type", "uint64"}, {"input_f16.type", "float16"}, {"input_f32.type", "float32"}, {"input_f64.type", "float64"}, {"input_bf16.type", "bfloat16"}, {"output.type", "float64"}}) From f25af7f4879b9e1d999be9b0229371337fc3a2a8 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Thu, 9 Nov 2023 06:27:20 +0100 Subject: [PATCH 05/15] Remove the deprecated API `llvm::Type::getInt8PtrTy` usage. (#7937) This API is removed in LLVM trunk now https://github.com/llvm/llvm-project/commit/7b9d73c2f90c0ed8497339a16fc39785349d9610. 
--- src/CodeGen_LLVM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index c6d17aff718c..057fb86ab1fe 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -5161,7 +5161,7 @@ llvm::Type *CodeGen_LLVM::llvm_type_of(LLVMContext *c, Halide::Type t, return nullptr; } } else if (t.is_handle()) { - return llvm::Type::getInt8PtrTy(*c); + return llvm::PointerType::getUnqual(*c); } else { return llvm::Type::getIntNTy(*c, t.bits()); } From f0cdd50d2b2a1cff783549d29f65f10ccb4adab9 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 14 Nov 2023 10:23:14 -0800 Subject: [PATCH 06/15] Delete unused function (#7925) --- src/CodeGen_LLVM.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 3f1467c0e203..ed62d7992977 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -402,11 +402,6 @@ class CodeGen_LLVM : public IRVisitor { void visit(const Realize *) override; // @} - /** If we have to bail out of a pipeline midway, this should - * inject the appropriate target-specific cleanup code. */ - virtual void prepare_for_early_exit() { - } - /** Get the llvm type equivalent to the given halide type in the * current context. 
*/ virtual llvm::Type *llvm_type_of(const Type &) const; From 0f6543561316bedffeb2427c70988b38285cd172 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 14 Nov 2023 11:48:34 -0800 Subject: [PATCH 07/15] More targeted fix for gather instructions being slow on intel processors (#7945) See https://github.com/llvm/llvm-project/issues/70259 --- src/CodeGen_LLVM.cpp | 2 +- src/CodeGen_LLVM.h | 7 ------- src/CodeGen_X86.cpp | 49 +++++++++++++++++++++++++++++--------------- 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 057fb86ab1fe..18e70dfb3d87 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1122,7 +1122,7 @@ void CodeGen_LLVM::optimize_module() { PipelineTuningOptions pto; pto.LoopInterleaving = do_loop_opt; pto.LoopVectorization = do_loop_opt; - pto.SLPVectorization = use_slp_vectorization(); + pto.SLPVectorization = true; pto.LoopUnrolling = do_loop_opt; // Clear ScEv info for all loops. Certain Halide applications spend a very // long time compiling in forgetLoop, and prefer to forget everything diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index ed62d7992977..b3e9cdabd498 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -127,13 +127,6 @@ class CodeGen_LLVM : public IRVisitor { virtual bool use_pic() const; // @} - /** Should SLP vectorization be turned on in LLVM? SLP vectorization has no - * analogue in the Halide scheduling model so this is decided heuristically - * depending on the target. */ - virtual bool use_slp_vectorization() const { - return true; - } - /** Should indexing math be promoted to 64-bit on platforms with * 64-bit pointers? 
*/ virtual bool promote_indices() const { diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index c43ad0d639bd..e34dd30870b4 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -68,8 +68,6 @@ class CodeGen_X86 : public CodeGen_Posix { bool use_soft_float_abi() const override; int native_vector_bits() const override; - bool use_slp_vectorization() const override; - int vector_lanes_for_slice(const Type &t) const; using CodeGen_Posix::visit; @@ -918,6 +916,34 @@ string CodeGen_X86::mcpu_target() const { } } +namespace { +bool gather_might_be_slow(Target target) { + // Intel x86 processors between broadwell and tiger lake have a microcode + // mitigation that makes gather instructions very slow. If we know we're on + // an AMD processor, gather is safe to use. If we have the AVX512 extensions + // present in Zen4 (or above), we also know we're not on an affected + // processor. + switch (target.processor_tune) { + case Target::Processor::AMDFam10: + case Target::Processor::BdVer1: + case Target::Processor::BdVer2: + case Target::Processor::BdVer3: + case Target::Processor::BdVer4: + case Target::Processor::BtVer1: + case Target::Processor::BtVer2: + case Target::Processor::K8: + case Target::Processor::K8_SSE3: + case Target::Processor::ZnVer1: + case Target::Processor::ZnVer2: + case Target::Processor::ZnVer3: + case Target::Processor::ZnVer4: + return false; + default: + return !target.has_feature(Target::AVX512_Zen4); + } +} +} // namespace + string CodeGen_X86::mcpu_tune() const { // Check if any explicit request for tuning exists. switch (target.processor_tune) { // Please keep sorted. 
@@ -995,6 +1021,11 @@ string CodeGen_X86::mattrs() const { features += ",+avxvnni,+amx-int8,+amx-bf16"; } } +#if LLVM_VERSION >= 180 + if (gather_might_be_slow(target)) { + features += ",+prefer-no-gather"; + } +#endif return features; } @@ -1030,20 +1061,6 @@ int CodeGen_X86::vector_lanes_for_slice(const Type &t) const { return slice_bits / t.bits(); } -bool CodeGen_X86::use_slp_vectorization() const { - if (target.has_feature(Target::AVX512)) { - // LLVM's SLP vectorizer emits avx512 gather intrinsics for LUTs and - // boundary conditions, even though they're slower than just - // scalarizing. See https://github.com/llvm/llvm-project/issues/70259 - // - // TODO: Once that issue is fixed, we should conditionalize this based on the - // LLVM version. - return false; - } else { - return true; - } -} - } // namespace std::unique_ptr new_CodeGen_X86(const Target &target) { From ad0f24e396c0eeee0c449c7355dd3d2fbcd17586 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 15 Nov 2023 16:49:35 -0800 Subject: [PATCH 08/15] Track likely values through lets in loop partitioning (#7930) * Track likely values through lets in loop partitioning Fixes #7929 Improves runtime of lens_blur app by ~20% * Add uncaptured likely tags to selects in boundary condition helpers Now that we look through lets, we end up in more situations where both sides have a captured likely. 
* Better comments --- src/BoundaryConditions.cpp | 29 ++++++-- src/PartitionLoops.cpp | 128 +++++++++++++++++++++++++----------- src/PartitionLoops.h | 13 ++-- test/correctness/likely.cpp | 20 ++++++ 4 files changed, 140 insertions(+), 50 deletions(-) diff --git a/src/BoundaryConditions.cpp b/src/BoundaryConditions.cpp index 9515ba3160a8..de8cbe705ab3 100644 --- a/src/BoundaryConditions.cpp +++ b/src/BoundaryConditions.cpp @@ -66,11 +66,11 @@ Func constant_exterior(const Func &source, const Tuple &value, if (value.as_vector().size() > 1) { std::vector def; for (size_t i = 0; i < value.as_vector().size(); i++) { - def.push_back(select(out_of_bounds, value[i], repeat_edge(source, bounds)(args)[i])); + def.push_back(select(out_of_bounds, value[i], likely(repeat_edge(source, bounds)(args)[i]))); } bounded(args) = Tuple(def); } else { - bounded(args) = select(out_of_bounds, value[0], repeat_edge(source, bounds)(args)); + bounded(args) = select(out_of_bounds, value[0], likely(repeat_edge(source, bounds)(args))); } return bounded; @@ -99,10 +99,25 @@ Func repeat_image(const Func &source, Expr coord = arg_var - min; // Enforce zero origin. coord = coord % extent; // Range is 0 to w-1 coord = coord + min; // Restore correct min - coord = select(arg_var < min || arg_var >= min + extent, coord, - clamp(likely(arg_var), min, min + extent - 1)); - + likely(clamp(likely(arg_var), min, min + extent - 1))); + + // In the line above, we want loop partitioning to both cause the + // clamp to go away, and also cause the select to go away. For loop + // partitioning to make one of these constructs go away we need one + // of two things to be true: + // + // 1) One arg has a likely intrinsic buried somewhere within it, and + // the other arg doesn't. + // 2) Both args have likely intrinsics, but in one of the args it is + // not within any inner min/max/select node. This is called an + // 'uncaptured' likely. 
+ // + // The issue with this boundary condition is that the true branch of + // the select (coord) may well have a likely within it somewhere + // introduced by a loop tail strategy, so condition 1 doesn't + // hold. To be more robust, we make condition 2 hold, by introducing + // an uncaptured likely to the false branch. actuals.push_back(coord); } else if (!min.defined() && !extent.defined()) { actuals.push_back(arg_var); @@ -143,7 +158,7 @@ Func mirror_image(const Func &source, coord = coord + min; // Restore correct min coord = clamp(coord, min, min + extent - 1); coord = select(arg_var < min || arg_var >= min + extent, coord, - clamp(likely(arg_var), min, min + extent - 1)); + likely(clamp(likely(arg_var), min, min + extent - 1))); actuals.push_back(coord); } else if (!min.defined() && !extent.defined()) { actuals.push_back(arg_var); @@ -188,7 +203,7 @@ Func mirror_interior(const Func &source, // The boundary condition probably doesn't apply coord = select(arg_var < min || arg_var >= min + extent, coord, - clamp(likely(arg_var), min, min + extent - 1)); + likely(clamp(likely(arg_var), min, min + extent - 1))); actuals.push_back(coord); } else if (!min.defined() && !extent.defined()) { diff --git a/src/PartitionLoops.cpp b/src/PartitionLoops.cpp index 7e2060d25c49..b9307a152889 100644 --- a/src/PartitionLoops.cpp +++ b/src/PartitionLoops.cpp @@ -88,13 +88,22 @@ class HasLikelyTag : public IRVisitor { } } + void visit(const Variable *op) override { + result |= scope.contains(op->name); + } + + const Scope<> &scope; + public: + HasLikelyTag(const Scope<> &s) + : scope(s) { + } + bool result = false; }; class HasUncapturedLikelyTag : public HasLikelyTag { using HasLikelyTag::visit; - // Any likelies buried inside the following ops are captured the by respective ops void visit(const Select *op) override { } @@ -102,6 +111,11 @@ class HasUncapturedLikelyTag : public HasLikelyTag { } void visit(const Max *op) override { } + +public: + HasUncapturedLikelyTag(const 
Scope<> &s) + : HasLikelyTag(s) { + } }; // The goal of loop partitioning is to split loops up into a prologue, @@ -243,6 +257,7 @@ class FindSimplifications : public IRVisitor { using IRVisitor::visit; Scope<> depends_on_loop_var, depends_on_invalid_buffers; + Scope<> vars_with_uncaptured_likely, vars_with_likely; Scope<> buffers; void visit(const Allocate *op) override { @@ -263,23 +278,20 @@ class FindSimplifications : public IRVisitor { } condition = remove_likelies(condition); Simplification s = {condition, std::move(old), std::move(likely_val), std::move(unlikely_val), true}; - while (s.condition.type().is_vector()) { - s.condition = simplify(s.condition); - if (const Broadcast *b = s.condition.as()) { - s.condition = b->value; - } else { - // Devectorize the condition - s.condition = and_condition_over_domain(s.condition, Scope::empty_scope()); - s.tight = false; - } - } - internal_assert(s.condition.type().is_scalar()) << s.condition << "\n"; simplifications.push_back(s); } + bool has_uncaptured_likely(const Expr &e) const { + return has_uncaptured_likely_tag(e, vars_with_uncaptured_likely); + } + + bool has_likely(const Expr &e) const { + return has_likely_tag(e, vars_with_likely); + } + void visit(const Min *op) override { - bool likely_a = has_uncaptured_likely_tag(op->a); - bool likely_b = has_uncaptured_likely_tag(op->b); + bool likely_a = has_uncaptured_likely(op->a); + bool likely_b = has_uncaptured_likely(op->b); // If one side has an uncaptured likely, don't hunt for // simplifications in the other side. @@ -294,20 +306,23 @@ class FindSimplifications : public IRVisitor { // call. If neither does, prefer the side that contains any // likely call at all. 
if (!likely_a && !likely_b) { - likely_a = has_likely_tag(op->a); - likely_b = has_likely_tag(op->b); + likely_a = has_likely(op->a); + likely_b = has_likely(op->b); } if (likely_b && !likely_a) { new_simplification(op->b <= op->a, op, op->b, op->a); } else if (likely_a && !likely_b) { new_simplification(op->a <= op->b, op, op->a, op->b); + } else if (likely_a && likely_b) { + // Likelies on both sides, continue inwards. + IRVisitor::visit(op); } } void visit(const Max *op) override { - bool likely_a = has_uncaptured_likely_tag(op->a); - bool likely_b = has_uncaptured_likely_tag(op->b); + bool likely_a = has_uncaptured_likely(op->a); + bool likely_b = has_uncaptured_likely(op->b); if (!likely_a) { op->b.accept(this); @@ -317,8 +332,8 @@ class FindSimplifications : public IRVisitor { } if (!likely_a && !likely_b) { - likely_a = has_likely_tag(op->a); - likely_b = has_likely_tag(op->b); + likely_a = has_likely(op->a); + likely_b = has_likely(op->b); } if (likely_b && !likely_a) { @@ -331,13 +346,8 @@ class FindSimplifications : public IRVisitor { void visit_select(const Expr &condition, const Expr &old, const Expr &true_value, const Expr &false_value) { condition.accept(this); - bool likely_t = has_uncaptured_likely_tag(true_value); - bool likely_f = has_uncaptured_likely_tag(false_value); - - if (!likely_t && !likely_f) { - likely_t = has_likely_tag(true_value); - likely_f = has_likely_tag(false_value); - } + bool likely_t = has_uncaptured_likely(true_value); + bool likely_f = has_uncaptured_likely(false_value); if (!likely_t) { false_value.accept(this); @@ -346,6 +356,11 @@ class FindSimplifications : public IRVisitor { true_value.accept(this); } + if (!likely_t && !likely_f) { + likely_t = has_likely(true_value); + likely_f = has_likely(false_value); + } + if (likely_t && !likely_f) { new_simplification(condition, old, true_value, false_value); } else if (likely_f && !likely_t) { @@ -376,7 +391,7 @@ class FindSimplifications : public IRVisitor { // statement is 
marked as likely, treat it as likely true and // partition accordingly. IRVisitor::visit(op); - if (has_uncaptured_likely_tag(op->condition)) { + if (has_uncaptured_likely(op->condition)) { new_simplification(op->condition, op->condition, const_true(), const_false()); } } @@ -408,7 +423,7 @@ class FindSimplifications : public IRVisitor { void visit(const Store *op) override { IRVisitor::visit(op); - if (has_uncaptured_likely_tag(op->predicate)) { + if (has_uncaptured_likely(op->predicate)) { const int lanes = op->predicate.type().lanes(); new_simplification(op->predicate, op->predicate, const_true(lanes), remove_likelies(op->predicate)); } @@ -416,7 +431,7 @@ class FindSimplifications : public IRVisitor { void visit(const Load *op) override { IRVisitor::visit(op); - if (has_uncaptured_likely_tag(op->predicate)) { + if (has_uncaptured_likely(op->predicate)) { const int lanes = op->predicate.type().lanes(); new_simplification(op->predicate, op->predicate, const_true(lanes), remove_likelies(op->predicate)); } @@ -429,6 +444,11 @@ class FindSimplifications : public IRVisitor { ScopedBinding<> bind_invalid(expr_uses_invalid_buffers(op->value, buffers) || expr_uses_vars(op->value, depends_on_invalid_buffers), depends_on_invalid_buffers, op->name); + ScopedBinding<> bind_uncaptured_likely(has_uncaptured_likely(op->value), + vars_with_uncaptured_likely, op->name); + ScopedBinding<> bind_likely(has_likely(op->value), + vars_with_likely, op->name); + vector old; old.swap(simplifications); IRVisitor::visit(op); @@ -566,6 +586,18 @@ class PartitionLoops : public IRMutator { vector middle_simps, prologue_simps, epilogue_simps; bool lower_bound_is_tight = true, upper_bound_is_tight = true; for (auto &s : finder.simplifications) { + + // Devectorize the condition + while (s.condition.type().is_vector()) { + s.condition = simplify(s.condition); + if (const Broadcast *b = s.condition.as()) { + s.condition = b->value; + } else { + s.condition = and_condition_over_domain(s.condition, 
Scope::empty_scope()); + s.tight = false; + } + } + // Solve for the interval over which this simplification is true. s.interval = solve_for_inner_interval(s.condition, op->name); if (s.tight) { @@ -991,24 +1023,44 @@ class ExpandSelects : public IRMutator { Expr visit(const Select *op) override { Expr condition = mutate(op->condition); + + const Call *true_likely = Call::as_intrinsic(op->true_value, {Call::likely}); + const Call *false_likely = Call::as_intrinsic(op->false_value, {Call::likely}); + Expr true_value = mutate(op->true_value); Expr false_value = mutate(op->false_value); if (const Or *o = condition.as()) { if (is_trivial(true_value)) { - return mutate(Select::make(o->a, true_value, Select::make(o->b, true_value, false_value))); + Expr expr = Select::make(o->b, true_value, false_value); + if (false_likely) { + expr = likely(expr); + } + return mutate(Select::make(o->a, true_value, expr)); } else { string var_name = unique_name('t'); Expr var = Variable::make(true_value.type(), var_name); - Expr expr = mutate(Select::make(o->a, var, Select::make(o->b, var, false_value))); + Expr expr = Select::make(o->b, var, false_value); + if (false_likely) { + expr = likely(expr); + } + expr = mutate(Select::make(o->a, var, expr)); return Let::make(var_name, true_value, expr); } } else if (const And *a = condition.as()) { if (is_trivial(false_value)) { - return mutate(Select::make(a->a, Select::make(a->b, true_value, false_value), false_value)); + Expr expr = Select::make(a->b, true_value, false_value); + if (true_likely) { + expr = likely(expr); + } + return mutate(Select::make(a->a, expr, false_value)); } else { string var_name = unique_name('t'); Expr var = Variable::make(false_value.type(), var_name); - Expr expr = mutate(Select::make(a->a, Select::make(a->b, true_value, var), var)); + Expr expr = Select::make(a->b, true_value, var); + if (true_likely) { + expr = likely(expr); + } + expr = mutate(Select::make(a->a, expr, var)); return Let::make(var_name, 
false_value, expr); } } else if (const Not *n = condition.as()) { @@ -1098,14 +1150,14 @@ class LowerLikelyIfInnermost : public IRMutator { } // namespace -bool has_uncaptured_likely_tag(const Expr &e) { - HasUncapturedLikelyTag h; +bool has_uncaptured_likely_tag(const Expr &e, const Scope<> &scope) { + HasUncapturedLikelyTag h(scope); e.accept(&h); return h.result; } -bool has_likely_tag(const Expr &e) { - HasLikelyTag h; +bool has_likely_tag(const Expr &e, const Scope<> &scope) { + HasLikelyTag h(scope); e.accept(&h); return h.result; } diff --git a/src/PartitionLoops.h b/src/PartitionLoops.h index 03f92e4d4c0b..40044914712d 100644 --- a/src/PartitionLoops.h +++ b/src/PartitionLoops.h @@ -8,16 +8,19 @@ */ #include "Expr.h" +#include "Scope.h" namespace Halide { namespace Internal { -/** Return true if an expression uses a likely tag that isn't captured - * by an enclosing Select, Min, or Max. */ -bool has_uncaptured_likely_tag(const Expr &e); +/** Return true if an expression uses a likely tag that isn't captured by an + * enclosing Select, Min, or Max. The scope contains all vars that should be + * considered to have uncaptured likelies. */ +bool has_uncaptured_likely_tag(const Expr &e, const Scope<> &scope); -/** Return true if an expression uses a likely tag. */ -bool has_likely_tag(const Expr &e); +/** Return true if an expression uses a likely tag. The scope contains all vars + * in scope that should be considered to have likely tags. */ +bool has_likely_tag(const Expr &e, const Scope<> &scope); /** Partitions loop bodies into a prologue, a steady state, and an * epilogue. 
Finds the steady state by hunting for use of clamped diff --git a/test/correctness/likely.cpp b/test/correctness/likely.cpp index fe834199f015..110df348da7e 100644 --- a/test/correctness/likely.cpp +++ b/test/correctness/likely.cpp @@ -287,6 +287,26 @@ int main(int argc, char **argv) { result = g.realize({10}); } + // Test for the bug described in https://github.com/halide/Halide/issues/7929 + { + Func f, g, h; + Var x, y; + + f(x, y) = x; + f.compute_root(); + + Param p; + g = BoundaryConditions::repeat_edge(f, {{0, p}, {Expr(), Expr()}}); + + h(x, y) = g(x, y) + g(x, y + 1) + g(x, y + 2); + + count_partitions(h, 3); + + // Same thing with vectorization too. + h.vectorize(x, 8); + count_partitions(h, 3); + } + // The performance of this behavior is tested in // test/performance/boundary_conditions.cpp From f5a4e49904c32717ba5c8231c18d55244d405c09 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 21 Nov 2023 11:23:44 -0800 Subject: [PATCH 09/15] Add missing condition to if renesting rule (#7952) * Add missing condition to if renesting rule * Add test * clang-format --- src/Simplify_And.cpp | 1 + src/Simplify_Stmts.cpp | 1 + test/correctness/simplify.cpp | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+) diff --git a/src/Simplify_And.cpp b/src/Simplify_And.cpp index 31975da9b9cb..35bbd5f7f747 100644 --- a/src/Simplify_And.cpp +++ b/src/Simplify_And.cpp @@ -56,6 +56,7 @@ Expr Simplify::visit(const And *op, ExprInfo *bounds) { rewrite(x && !x, false) || rewrite(!x && x, false) || rewrite(y <= x && x < y, false) || + rewrite(y < x && x < y, false) || rewrite(x != c0 && x == c1, b, c0 != c1) || rewrite(x == c0 && x == c1, false, c0 != c1) || // Note: In the predicate below, if undefined overflow diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index e3c0f6ee178e..11b146ecdc6a 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -596,6 +596,7 @@ Stmt Simplify::visit(const Block *op) { return mutate(result); } else if (if_first && 
if_next && + !if_next->else_case.defined() && is_pure(if_first->condition) && is_pure(if_next->condition) && is_const_one(mutate(!(if_first->condition && if_next->condition), nullptr))) { diff --git a/test/correctness/simplify.cpp b/test/correctness/simplify.cpp index 900c338e86da..6f497531da94 100644 --- a/test/correctness/simplify.cpp +++ b/test/correctness/simplify.cpp @@ -1644,6 +1644,24 @@ void check_boolean() { Block::make(not_no_op(x + 1), not_no_op(x + 2)), not_no_op(x + 3))); + check(x < y && y < x, const_false()); + check(Block::make(IfThenElse::make(x < y, not_no_op(x + 1), not_no_op(x + 2)), + IfThenElse::make(y < x, not_no_op(x + 3))), + IfThenElse::make(x < y, not_no_op(x + 1), + Block::make(not_no_op(x + 2), + IfThenElse::make(y < x, not_no_op(x + 3))))); + + check(Block::make(IfThenElse::make(x < y, not_no_op(x + 1), not_no_op(x + 2)), + IfThenElse::make(y <= x, not_no_op(x + 3))), + IfThenElse::make(x < y, not_no_op(x + 1), + Block::make(not_no_op(x + 2), + not_no_op(x + 3)))); + + check(Block::make(IfThenElse::make(x < y, not_no_op(x + 1), not_no_op(x + 2)), + IfThenElse::make(y <= x, not_no_op(x + 3), not_no_op(x + 4))), + Block::make(IfThenElse::make(x < y, not_no_op(x + 1), not_no_op(x + 2)), + IfThenElse::make(y <= x, not_no_op(x + 3), not_no_op(x + 4)))); + // The construct // if (var == expr) then a else b; // was being simplified incorrectly, but *only* if var was of type Bool. 
From 04c21bf6e5a9d75724a269fff725b82f973813c3 Mon Sep 17 00:00:00 2001 From: Volodymyr Kysenko Date: Tue, 21 Nov 2023 13:56:45 -0800 Subject: [PATCH 10/15] Always call lower_round_to_nearest_ties_to_even on arm32 (#7957) --- src/CodeGen_ARM.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 87e170da98d1..826f3723e4bf 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1354,8 +1354,7 @@ void CodeGen_ARM::visit(const Call *op) { if (value) { return; } - } else if (target.os != Target::Linux) { - // Furthermore, roundevenf isn't always in the standard library on arm-32 + } else { value = codegen(lower_round_to_nearest_ties_to_even(op->args[0])); return; } From 8c28a73d3583de7765387e70424061a652d246b0 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 21 Nov 2023 15:27:21 -0800 Subject: [PATCH 11/15] Improve code size and compile time for local laplacian app (#7927) Improve code size and compile time for local laplacian and interpolate apps This reduces compile time for the manual local laplacian schedule from 4.9s to 2.2s, and reduces code size from 126k to 82k Most of the reduction comes from avoiding a pointless boundary condition in the output Func. A smaller amount comes from avoiding loop partitioning using RoundUp and Partition::Never. The Partition::Never calls are responsible for a 3% reduction in code size and compile times by themselves. This has basically no effect on runtime. It seems to reduce it very slightly, but it's in the noise. 
--- apps/interpolate/Makefile | 1 + apps/interpolate/interpolate_generator.cpp | 12 +++- apps/local_laplacian/Makefile | 1 + .../local_laplacian_generator.cpp | 52 +++++++++++++---- src/Func.cpp | 56 +++++++++++++++++++ src/Func.h | 53 ++++++++++++++++++ src/Generator.h | 1 + src/LoopPartitioningDirective.h | 5 +- src/PartitionLoops.cpp | 14 ++++- test/correctness/likely.cpp | 28 ++++++---- 10 files changed, 196 insertions(+), 27 deletions(-) diff --git a/apps/interpolate/Makefile b/apps/interpolate/Makefile index 95c165b533ee..e5760d9f0039 100644 --- a/apps/interpolate/Makefile +++ b/apps/interpolate/Makefile @@ -1,6 +1,7 @@ include ../support/Makefile.inc .PHONY: build clean test +.SECONDARY: build: $(BIN)/$(HL_TARGET)/filter diff --git a/apps/interpolate/interpolate_generator.cpp b/apps/interpolate/interpolate_generator.cpp index 1e4026b9ef87..ca751bab253f 100644 --- a/apps/interpolate/interpolate_generator.cpp +++ b/apps/interpolate/interpolate_generator.cpp @@ -79,6 +79,7 @@ class Interpolate : public Halide::Generator { Var yo, yi, xo, xi, ci, xii, yii; if (get_target().has_gpu_feature()) { normalize + .never_partition_all() .bound(x, 0, input.width()) .bound(y, 0, input.height()) .bound(c, 0, 3) @@ -94,6 +95,7 @@ class Interpolate : public Halide::Generator { for (int l = 1; l < levels; l++) { downsampled[l] .compute_root() + .never_partition_all() .reorder(c, x, y) .unroll(c) .gpu_tile(x, y, xi, yi, 16, 16); @@ -102,6 +104,7 @@ class Interpolate : public Halide::Generator { for (int l = 3; l < levels; l += 2) { interpolated[l] .compute_root() + .never_partition_all() .reorder(c, x, y) .tile(x, y, xi, yi, 32, 32, TailStrategy::RoundUp) .tile(xi, yi, xii, yii, 2, 2) @@ -114,6 +117,7 @@ class Interpolate : public Halide::Generator { upsampledx[1] .compute_at(normalize, x) + .never_partition_all() .reorder(c, x, y) .tile(x, y, xi, yi, 2, 1) .unroll(xi) @@ -123,6 +127,7 @@ class Interpolate : public Halide::Generator { interpolated[1] .compute_at(normalize, x) + 
.never_partition_all() .reorder(c, x, y) .tile(x, y, xi, yi, 2, 2) .unroll(xi) @@ -132,6 +137,7 @@ class Interpolate : public Halide::Generator { interpolated[2] .compute_at(normalize, x) + .never_partition_all() .reorder(c, x, y) .unroll(c) .gpu_threads(x, y); @@ -148,6 +154,7 @@ class Interpolate : public Halide::Generator { // the local_laplacian app. downsampled[l] .compute_root() + .never_partition(x) .reorder(x, c, y) .split(y, yo, yi, 8) .parallel(yo) @@ -165,12 +172,14 @@ class Interpolate : public Halide::Generator { .compute_at(downsampled[1], yi) .reorder(c, x, y) .unroll(c) - .vectorize(x, vec); + .vectorize(x, vec) + .never_partition(y); normalize .bound(x, 0, input.width()) .bound(y, 0, input.height()) .bound(c, 0, 3) + .never_partition(y) .split(x, xo, xi, vec) .split(y, yo, yi, 32) .reorder(xi, c, xo, yi, yo) @@ -182,6 +191,7 @@ class Interpolate : public Halide::Generator { interpolated[l] .store_at(normalize, yo) .compute_at(normalize, yi) + .never_partition_all() .vectorize(x, vec); } diff --git a/apps/local_laplacian/Makefile b/apps/local_laplacian/Makefile index a9f57b4de81a..a2c9991151f8 100644 --- a/apps/local_laplacian/Makefile +++ b/apps/local_laplacian/Makefile @@ -1,6 +1,7 @@ include ../support/Makefile.inc .PHONY: build clean test +.SECONDARY: build: $(BIN)/$(HL_TARGET)/process diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp index ef305837c6cc..860540e74517 100644 --- a/apps/local_laplacian/local_laplacian_generator.cpp +++ b/apps/local_laplacian/local_laplacian_generator.cpp @@ -81,10 +81,10 @@ class LocalLaplacian : public Halide::Generator { // Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input) Func color; float eps = 0.01f; - color(x, y, c) = outGPyramid[0](x, y) * (floating(x, y, c) + eps) / (gray(x, y) + eps); + color(x, y, c) = input(x, y, c) * (outGPyramid[0](x, y) + eps) / (gray(x, y) + eps); // Convert back to 16-bit - 
output(x, y, c) = cast(clamp(color(x, y, c), 0.0f, 1.0f) * 65535.0f); + output(x, y, c) = cast(clamp(color(x, y, c), 0.0f, 65535.0f)); /* ESTIMATES */ // (This can be useful in conjunction with RunGen and benchmarks as well @@ -102,10 +102,15 @@ class LocalLaplacian : public Halide::Generator { // Nothing. } else if (get_target().has_gpu_feature()) { // GPU schedule. - // 3.19ms on an RTX 2060. + // 2.9ms on an RTX 2060. + + // All loop partitioning disabled, which has no effect on runtime, + // but saves 15% compile time and 45% ptx shader code size. remap.compute_root(); Var xi, yi; - output.compute_root().gpu_tile(x, y, xi, yi, 16, 8); + output.compute_root() + .never_partition_all() + .gpu_tile(x, y, xi, yi, 16, 8); for (int j = 0; j < J; j++) { int blockw = 16, blockh = 8; if (j > 3) { @@ -113,10 +118,20 @@ class LocalLaplacian : public Halide::Generator { blockh = 2; } if (j > 0) { - inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh); - gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(x, y, xi, yi, blockw, blockh); + inGPyramid[j] + .compute_root() + .never_partition_all() + .gpu_tile(x, y, xi, yi, blockw, blockh); + gPyramid[j] + .compute_root() + .reorder(k, x, y) + .never_partition_all() + .gpu_tile(x, y, xi, yi, blockw, blockh); } - outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh); + outGPyramid[j] + .compute_root() + .never_partition_all() + .gpu_tile(x, y, xi, yi, blockw, blockh); } } else { // CPU schedule. 
@@ -131,8 +146,16 @@ class LocalLaplacian : public Halide::Generator { remap.compute_root(); Var yo; - output.reorder(c, x, y).split(y, yo, y, 64).parallel(yo).vectorize(x, 8); - gray.compute_root().parallel(y, 32).vectorize(x, 8); + output + .reorder(c, x, y) + .split(y, yo, y, 64) + .parallel(yo) + .vectorize(x, 8); + gray + .compute_root() + .never_partition(y) + .parallel(y, 32) + .vectorize(x, 8); for (int j = 1; j < 5; j++) { inGPyramid[j] .compute_root() @@ -148,12 +171,19 @@ class LocalLaplacian : public Halide::Generator { .store_at(output, yo) .compute_at(output, y) .fold_storage(y, 4) - .vectorize(x, 8); + .vectorize(x, 8, TailStrategy::RoundUp); + if (j > 1) { + // Turn off loop partitioning at higher pyramid levels. This + // shaves about 3% off code size and compile time without + // affecting performance. + inGPyramid[j].never_partition_all(); + gPyramid[j].never_partition_all(); + } } outGPyramid[0] .compute_at(output, y) .hoist_storage(output, yo) - .vectorize(x, 8); + .vectorize(x, 8, TailStrategy::RoundUp); for (int j = 5; j < J; j++) { inGPyramid[j].compute_root(); gPyramid[j].compute_root().parallel(k); diff --git a/src/Func.cpp b/src/Func.cpp index a8190876c6b2..37b64df5af5b 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -1649,6 +1649,38 @@ Stage &Stage::partition(const VarOrRVar &var, Partition policy) { return *this; } +Stage &Stage::never_partition(const std::vector &vars) { + for (const auto &v : vars) { + partition(v, Partition::Never); + } + return *this; +} + +Stage &Stage::never_partition_all() { + definition.schedule().touched() = true; + vector &dims = definition.schedule().dims(); + for (auto &dim : dims) { + dim.partition_policy = Partition::Never; + } + return *this; +} + +Stage &Stage::always_partition(const std::vector &vars) { + for (const auto &v : vars) { + partition(v, Partition::Always); + } + return *this; +} + +Stage &Stage::always_partition_all() { + definition.schedule().touched() = true; + vector &dims = 
definition.schedule().dims(); + for (auto &dim : dims) { + dim.partition_policy = Partition::Always; + } + return *this; +} + Stage &Stage::tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, @@ -2342,6 +2374,30 @@ Func &Func::partition(const VarOrRVar &var, Partition policy) { return *this; } +Func &Func::never_partition(const std::vector &vars) { + invalidate_cache(); + Stage(func, func.definition(), 0).never_partition(vars); + return *this; +} + +Func &Func::never_partition_all() { + invalidate_cache(); + Stage(func, func.definition(), 0).never_partition_all(); + return *this; +} + +Func &Func::always_partition(const std::vector &vars) { + invalidate_cache(); + Stage(func, func.definition(), 0).always_partition(vars); + return *this; +} + +Func &Func::always_partition_all() { + invalidate_cache(); + Stage(func, func.definition(), 0).always_partition_all(); + return *this; +} + Func &Func::bound(const Var &var, Expr min, Expr extent) { user_assert(!min.defined() || Int(32).can_represent(min.type())) << "Can't represent min bound in int32\n"; user_assert(extent.defined()) << "Extent bound of a Func can't be undefined\n"; diff --git a/src/Func.h b/src/Func.h index 2cad7160b823..ccadef338c29 100644 --- a/src/Func.h +++ b/src/Func.h @@ -349,6 +349,11 @@ class Stage { Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto); Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto); Stage &partition(const VarOrRVar &var, Partition partition_policy); + Stage &never_partition_all(); + Stage &never_partition(const std::vector &vars); + Stage &always_partition_all(); + Stage &always_partition(const std::vector &vars); + Stage &tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, @@ -380,6 +385,20 @@ 
class Stage { return reorder(collected_args); } + template + HALIDE_NO_USER_CODE_INLINE typename std::enable_if::value, Stage &>::type + never_partition(const VarOrRVar &x, Args &&...args) { + std::vector collected_args{x, std::forward(args)...}; + return never_partition(collected_args); + } + + template + HALIDE_NO_USER_CODE_INLINE typename std::enable_if::value, Stage &>::type + always_partition(const VarOrRVar &x, Args &&...args) { + std::vector collected_args{x, std::forward(args)...}; + return always_partition(collected_args); + } + Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name); Stage specialize(const Expr &condition); void specialize_fail(const std::string &message); @@ -1450,6 +1469,40 @@ class Func { * The default policy is Auto. */ Func &partition(const VarOrRVar &var, Partition partition_policy); + /** Set the loop partition policy to Never for a vector of Vars and + * RVars. */ + Func &never_partition(const std::vector &vars); + + /** Set the loop partition policy to Never for some number of Vars and RVars. */ + template + HALIDE_NO_USER_CODE_INLINE typename std::enable_if::value, Func &>::type + never_partition(const VarOrRVar &x, Args &&...args) { + std::vector collected_args{x, std::forward(args)...}; + return never_partition(collected_args); + } + + /** Set the loop partition policy to Never for all Vars and RVar of the + * initial definition of the Func. It must be called separately on any + * update definitions. */ + Func &never_partition_all(); + + /** Set the loop partition policy to Always for a vector of Vars and + * RVars. */ + Func &always_partition(const std::vector &vars); + + /** Set the loop partition policy to Always for some number of Vars and RVars. 
*/ + template + HALIDE_NO_USER_CODE_INLINE typename std::enable_if::value, Func &>::type + always_partition(const VarOrRVar &x, Args &&...args) { + std::vector collected_args{x, std::forward(args)...}; + return always_partition(collected_args); + } + + /** Set the loop partition policy to Always for all Vars and RVar of the + * initial definition of the Func. It must be called separately on any + * update definitions. */ + Func &always_partition_all(); + /** Statically declare that the range over which a function should * be evaluated is given by the second and third arguments. This * can let Halide perform some optimizations. E.g. if you know diff --git a/src/Generator.h b/src/Generator.h index 78357d59a156..4d00a0fec574 100644 --- a/src/Generator.h +++ b/src/Generator.h @@ -3056,6 +3056,7 @@ class NamesInterface { using LoopLevel = Halide::LoopLevel; using MemoryType = Halide::MemoryType; using NameMangling = Halide::NameMangling; + using Partition = Halide::Partition; using Pipeline = Halide::Pipeline; using PrefetchBoundStrategy = Halide::PrefetchBoundStrategy; using RDom = Halide::RDom; diff --git a/src/LoopPartitioningDirective.h b/src/LoopPartitioningDirective.h index 3189add52d1a..c4c14de48f2a 100644 --- a/src/LoopPartitioningDirective.h +++ b/src/LoopPartitioningDirective.h @@ -20,8 +20,9 @@ enum class Partition { /** Disallow loop partitioning. */ Never, - /** Force partitioning of the loop. If Halide can't find a way to partition this loop, - * it will raise an error. */ + /** Force partitioning of the loop, even in the tail cases of outer + * partitioned loops. If Halide can't find a way to partition this loop, it + * will raise an error. 
*/ Always }; diff --git a/src/PartitionLoops.cpp b/src/PartitionLoops.cpp index b9307a152889..99b7a7cc25e1 100644 --- a/src/PartitionLoops.cpp +++ b/src/PartitionLoops.cpp @@ -537,10 +537,13 @@ class PartitionLoops : public IRMutator { using IRMutator::visit; bool in_gpu_loop = false; + bool in_tail = false; Stmt visit(const For *op) override { - // Do not partition if the schedule explicitly forbids. - if (op->partition_policy == Partition::Never) { + // Do not partition if the schedule explicitly forbids, or if it's set + // to automatic and we're in a loop tail. + if (op->partition_policy == Partition::Never || + (op->partition_policy == Partition::Auto && in_tail)) { return IRMutator::visit(op); } @@ -719,6 +722,13 @@ class PartitionLoops : public IRMutator { // Recurse on the middle section. simpler_body = mutate(simpler_body); + // Recurse on the prologue and epilogue, just for loops set to Partition::Always + { + ScopedValue s(in_tail, true); + epilogue = mutate(epilogue); + prologue = mutate(prologue); + } + // Construct variables for the bounds of the simplified middle section Expr min_steady = op->min, max_steady = op->extent + op->min; Expr prologue_val, epilogue_val; diff --git a/test/correctness/likely.cpp b/test/correctness/likely.cpp index 110df348da7e..d2888719ffd0 100644 --- a/test/correctness/likely.cpp +++ b/test/correctness/likely.cpp @@ -127,12 +127,12 @@ int main(int argc, char **argv) { count_partitions(g, 1); } - // The slicing applies to every loop level starting from the - // outermost one, but only recursively simplifies the clean steady - // state. It either splits things three (start, middle, end). So - // adding a boundary condition to a 2D computation will produce 5 - // code paths for the top, bottom, left, right, and center of the - // image. + // The slicing applies to every loop level starting from the outermost one, + // but only recursively simplifies the clean steady state. 
It either splits + // things three (start, middle, end). So adding a boundary condition to a 2D + // computation will produce 5 code paths for the top, bottom, left, right, + // and center of the image. With explicit control over loop partitioning, we + // might produce more or fewer. { Var y; Func g; @@ -144,7 +144,6 @@ int main(int argc, char **argv) { { debug(1) << "Never partition y, always partition x:\n"; Func h2 = h; - // check that disabling works. h2.partition(x, Partition::Always); h2.partition(y, Partition::Never); count_partitions(h2, 3); // We expect left-center-right @@ -153,7 +152,6 @@ int main(int argc, char **argv) { { debug(1) << "Never partition x, always partition y:\n"; Func h2 = h; - // check that disabling works. h2.partition(x, Partition::Never); h2.partition(y, Partition::Always); count_partitions(h2, 3); // We expect top-middle-bottom @@ -162,7 +160,6 @@ int main(int argc, char **argv) { { debug(1) << "Never partition x and y.\n"; Func h2 = h; - // check that disabling works. h2.partition(x, Partition::Never); h2.partition(y, Partition::Never); count_partitions(h2, 1); @@ -171,10 +168,19 @@ int main(int argc, char **argv) { { debug(1) << "Always partition x and y.\n"; Func h2 = h; - // check that disabling works. 
h2.partition(x, Partition::Always); h2.partition(y, Partition::Always); - count_partitions(h2, 5); + // All loops get partitioned, including the tails of outer loops, so we expect 9 zones: + /* + ---------------------------------------------- + | top left | top middle | top right | + | ------------------------------------------ | + | left | middle | right | + | ------------------------------------------ | + | bottom left | bottom middle | bottom right | + ---------------------------------------------- + */ + count_partitions(h2, 9); } } From 976ea0b49515a5a92ebd40c191852100c235ed51 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Mon, 27 Nov 2023 16:55:41 -0800 Subject: [PATCH 12/15] [serialization] Serialize stub definitions of external parameters. (#7926) * Serialize stub definitions of external parameters. Add deserialize_parameter methods to allow the user to only deserialize the mapping of external parameters (and remap them to their own user parameters) prior to deserializing the full pipeline definition. 
* Clang tidy/format pass --------- Co-authored-by: Derek Gerstmann --- src/Deserialization.cpp | 153 ++++++++++++++++++++++++--- src/Deserialization.h | 34 ++++-- src/Serialization.cpp | 20 +++- src/halide_ir.fbs | 8 ++ tutorial/lesson_23_serialization.cpp | 8 +- 5 files changed, 194 insertions(+), 29 deletions(-) diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index bb19cf82c9aa..9923e9d1c89c 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -23,8 +23,8 @@ class Deserializer { public: Deserializer() = default; - explicit Deserializer(const std::map &external_params) - : external_params(external_params) { + explicit Deserializer(const std::map &user_params) + : user_params(user_params) { } // Deserialize a pipeline from the given filename @@ -36,6 +36,16 @@ class Deserializer { // Deserialize a pipeline from the given buffer of bytes Pipeline deserialize(const std::vector &data); + // Deserialize just the unbound external parameters that need to be defined for the pipeline from the given filename + // (so they can be remapped and overridden with user parameters prior to deserializing the pipeline) + std::map deserialize_parameters(const std::string &filename); + + // Deserialize just the unbound external parameters that need to be defined for the pipeline from the given input stream + std::map deserialize_parameters(std::istream &in); + + // Deserialize just the unbound external parameters that need to be defined for the pipeline from the given buffer of bytes + std::map deserialize_parameters(const std::vector &data); + private: // Helper function to deserialize a homogenous vector from a flatbuffer vector, // does not apply to union types like Stmt and Expr or enum types like MemoryType @@ -63,6 +73,9 @@ class Deserializer { std::map> buffers_in_pipeline; // External parameters that are not deserialized but will be used in the pipeline + std::map user_params; + + // Default external parameters that were created during 
deserialization std::map external_params; MemoryType deserialize_memory_type(Serialize::MemoryType memory_type); @@ -139,6 +152,8 @@ class Deserializer { Parameter deserialize_parameter(const Serialize::Parameter *parameter); + Parameter deserialize_external_parameter(const Serialize::ExternalParameter *external_parameter); + ExternFuncArgument deserialize_extern_func_argument(const Serialize::ExternFuncArgument *extern_func_argument); std::map deserialize_wrapper_refs(const flatbuffers::Vector> *wrappers); @@ -457,12 +472,15 @@ void Deserializer::deserialize_function(const Serialize::Func *function, Functio deserialize_vector(function->updates(), &Deserializer::deserialize_definition); const std::string debug_file = deserialize_string(function->debug_file()); + std::vector output_buffers; output_buffers.reserve(function->output_buffers_names()->size()); for (const auto &output_buffer_name_serialized : *function->output_buffers_names()) { auto output_buffer_name = deserialize_string(output_buffer_name_serialized); Parameter output_buffer; - if (auto it = external_params.find(output_buffer_name); it != external_params.end()) { + if (auto it = user_params.find(output_buffer_name); it != user_params.end()) { + output_buffer = it->second; + } else if (auto it = external_params.find(output_buffer_name); it != external_params.end()) { output_buffer = it->second; } else if (auto it = parameters_in_pipeline.find(output_buffer_name); it != parameters_in_pipeline.end()) { output_buffer = it->second; @@ -534,7 +552,9 @@ Stmt Deserializer::deserialize_stmt(Serialize::Stmt type_code, const void *stmt) const auto index = deserialize_expr(store_stmt->index_type(), store_stmt->index()); const auto param_name = deserialize_string(store_stmt->param_name()); Parameter param; - if (auto it = external_params.find(param_name); it != external_params.end()) { + if (auto it = user_params.find(param_name); it != user_params.end()) { + param = it->second; + } else if (auto it = 
external_params.find(param_name); it != external_params.end()) { param = it->second; } else if (auto it = parameters_in_pipeline.find(param_name); it != parameters_in_pipeline.end()) { param = it->second; @@ -799,7 +819,9 @@ Expr Deserializer::deserialize_expr(Serialize::Expr type_code, const void *expr) } const auto param_name = deserialize_string(load_expr->param_name()); Parameter param; - if (auto it = external_params.find(param_name); it != external_params.end()) { + if (auto it = user_params.find(param_name); it != user_params.end()) { + param = it->second; + } else if (auto it = external_params.find(param_name); it != external_params.end()) { param = it->second; } else if (auto it = parameters_in_pipeline.find(param_name); it != parameters_in_pipeline.end()) { param = it->second; @@ -850,7 +872,9 @@ Expr Deserializer::deserialize_expr(Serialize::Expr type_code, const void *expr) } const auto param_name = deserialize_string(call_expr->param_name()); Parameter param; - if (auto it = external_params.find(param_name); it != external_params.end()) { + if (auto it = user_params.find(param_name); it != user_params.end()) { + param = it->second; + } else if (auto it = external_params.find(param_name); it != external_params.end()) { param = it->second; } else if (auto it = parameters_in_pipeline.find(param_name); it != parameters_in_pipeline.end()) { param = it->second; @@ -866,7 +890,9 @@ Expr Deserializer::deserialize_expr(Serialize::Expr type_code, const void *expr) const auto type = deserialize_type(variable_expr->type()); const auto param_name = deserialize_string(variable_expr->param_name()); Parameter param; - if (auto it = external_params.find(param_name); it != external_params.end()) { + if (auto it = user_params.find(param_name); it != user_params.end()) { + param = it->second; + } else if (auto it = external_params.find(param_name); it != external_params.end()) { param = it->second; } else if (auto it = parameters_in_pipeline.find(param_name); it != 
parameters_in_pipeline.end()) { param = it->second; @@ -1224,6 +1250,15 @@ Parameter Deserializer::deserialize_parameter(const Serialize::Parameter *parame } } +Parameter Deserializer::deserialize_external_parameter(const Serialize::ExternalParameter *external_parameter) { + user_assert(external_parameter != nullptr); + const bool is_buffer = external_parameter->is_buffer(); + const auto type = deserialize_type(external_parameter->type()); + const int dimensions = external_parameter->dimensions(); + const std::string name = deserialize_string(external_parameter->name()); + return Parameter(type, is_buffer, dimensions, name); +} + ExternFuncArgument Deserializer::deserialize_extern_func_argument(const Serialize::ExternFuncArgument *extern_func_argument) { user_assert(extern_func_argument != nullptr); const auto arg_type = deserialize_extern_func_argument_type(extern_func_argument->arg_type()); @@ -1249,7 +1284,9 @@ ExternFuncArgument Deserializer::deserialize_extern_func_argument(const Serializ } else { const auto image_param_name = deserialize_string(extern_func_argument->image_param_name()); Parameter image_param; - if (auto it = external_params.find(image_param_name); it != external_params.end()) { + if (auto it = user_params.find(image_param_name); it != user_params.end()) { + image_param = it->second; + } else if (auto it = external_params.find(image_param_name); it != external_params.end()) { image_param = it->second; } else if (auto it = parameters_in_pipeline.find(image_param_name); it != parameters_in_pipeline.end()) { image_param = it->second; @@ -1397,6 +1434,13 @@ Pipeline Deserializer::deserialize(const std::vector &data) { parameters_in_pipeline[param.name()] = param; } + const std::vector parameters_external = + deserialize_vector(pipeline_obj->external_parameters(), + &Deserializer::deserialize_external_parameter); + for (const auto ¶m : parameters_external) { + external_params[param.name()] = param; + } + std::vector funcs; for (size_t i = 0; i < 
pipeline_obj->funcs()->size(); ++i) { deserialize_function(pipeline_obj->funcs()->Get(i), functions[i]); @@ -1427,44 +1471,119 @@ Pipeline Deserializer::deserialize(const std::vector &data) { return Pipeline(output_funcs, requirements); } +std::map Deserializer::deserialize_parameters(const std::string &filename) { + std::map empty; + std::ifstream in(filename, std::ios::binary | std::ios::in); + if (!in) { + user_error << "failed to open file " << filename << "\n"; + return empty; + } + std::map params = deserialize_parameters(in); + if (!in.good()) { + user_error << "failed to deserialize from file " << filename << " properly\n"; + return empty; + } + in.close(); + return params; +} + +std::map Deserializer::deserialize_parameters(std::istream &in) { + std::map empty; + if (!in) { + user_error << "failed to open input stream\n"; + return empty; + } + in.seekg(0, std::ios::end); + int size = in.tellg(); + in.seekg(0, std::ios::beg); + std::vector data(size); + in.read((char *)data.data(), size); + return deserialize_parameters(data); +} + +std::map Deserializer::deserialize_parameters(const std::vector &data) { + std::map external_parameters_by_name; + const auto *pipeline_obj = Serialize::GetPipeline(data.data()); + if (pipeline_obj == nullptr) { + user_warning << "deserialized pipeline is empty\n"; + return external_parameters_by_name; + } + + const std::vector external_parameters = + deserialize_vector(pipeline_obj->external_parameters(), + &Deserializer::deserialize_external_parameter); + + for (const auto ¶m : external_parameters) { + external_parameters_by_name[param.name()] = param; + } + return external_parameters_by_name; +} + } // namespace Internal -Pipeline deserialize_pipeline(const std::string &filename, const std::map &external_params) { - Internal::Deserializer deserializer(external_params); +Pipeline deserialize_pipeline(const std::string &filename, const std::map &user_params) { + Internal::Deserializer deserializer(user_params); return 
deserializer.deserialize(filename); } -Pipeline deserialize_pipeline(std::istream &in, const std::map &external_params) { - Internal::Deserializer deserializer(external_params); +Pipeline deserialize_pipeline(std::istream &in, const std::map &user_params) { + Internal::Deserializer deserializer(user_params); return deserializer.deserialize(in); } -Pipeline deserialize_pipeline(const std::vector &buffer, const std::map &external_params) { - Internal::Deserializer deserializer(external_params); +Pipeline deserialize_pipeline(const std::vector &buffer, const std::map &user_params) { + Internal::Deserializer deserializer(user_params); return deserializer.deserialize(buffer); } +std::map deserialize_parameters(const std::string &filename) { + Internal::Deserializer deserializer; + return deserializer.deserialize_parameters(filename); +} + +std::map deserialize_parameters(std::istream &in) { + Internal::Deserializer deserializer; + return deserializer.deserialize_parameters(in); +} + +std::map deserialize_parameters(const std::vector &buffer) { + Internal::Deserializer deserializer; + return deserializer.deserialize_parameters(buffer); +} + } // namespace Halide #else // WITH_SERIALIZATION namespace Halide { -Pipeline deserialize_pipeline(const std::string &filename, const std::map &external_params) { +Pipeline deserialize_pipeline(const std::string &filename, const std::map &user_params) { user_error << "Deserialization is not supported in this build of Halide; try rebuilding with WITH_SERIALIZATION=ON."; return Pipeline(); } -Pipeline deserialize_pipeline(std::istream &in, const std::map &external_params) { +Pipeline deserialize_pipeline(std::istream &in, const std::map &user_params) { user_error << "Deserialization is not supported in this build of Halide; try rebuilding with WITH_SERIALIZATION=ON."; return Pipeline(); } -Pipeline deserialize_pipeline(const std::vector &buffer, const std::map &external_params) { +Pipeline deserialize_pipeline(const std::vector 
&buffer, const std::map &user_params) { user_error << "Deserialization is not supported in this build of Halide; try rebuilding with WITH_SERIALIZATION=ON."; return Pipeline(); } +std::map deserialize_parameters(const std::string &filename) { + user_error << "Deserialization is not supported in this build of Halide; try rebuilding with WITH_SERIALIZATION=ON."; +} + +std::map deserialize_parameters(std::istream &in) { + user_error << "Deserialization is not supported in this build of Halide; try rebuilding with WITH_SERIALIZATION=ON."; +} + +std::map deserialize_parameters(const std::vector &buffer) { + user_error << "Deserialization is not supported in this build of Halide; try rebuilding with WITH_SERIALIZATION=ON."; +} + } // namespace Halide #endif // WITH_SERIALIZATION diff --git a/src/Deserialization.h b/src/Deserialization.h index 82f7c8e7217b..b4b3844303c0 100644 --- a/src/Deserialization.h +++ b/src/Deserialization.h @@ -9,21 +9,43 @@ namespace Halide { /// @brief Deserialize a Halide pipeline from a file. /// @param filename The location of the file to deserialize. Must use .hlpipe extension. -/// @param external_params Map of named input/output parameters to bind with the resulting pipeline (used to avoid deserializing specific objects and enable the use of externally defined ones instead). +/// @param user_params Map of named input/output parameters to bind with the resulting pipeline (used to avoid deserializing specific objects and enable the use of externally defined ones instead). /// @return Returns a newly constructed deserialized Pipeline object/ -Pipeline deserialize_pipeline(const std::string &filename, const std::map &external_params); +Pipeline deserialize_pipeline(const std::string &filename, const std::map &user_params); /// @brief Deserialize a Halide pipeline from an input stream. 
/// @param in The input stream to read from containing a serialized Halide pipeline -/// @param external_params Map of named input/output parameters to bind with the resulting pipeline (used to avoid deserializing specific objects and enable the use of externally defined ones instead). +/// @param user_params Map of named input/output parameters to bind with the resulting pipeline (used to avoid deserializing specific objects and enable the use of externally defined ones instead). /// @return Returns a newly constructed deserialized Pipeline object/ -Pipeline deserialize_pipeline(std::istream &in, const std::map &external_params); +Pipeline deserialize_pipeline(std::istream &in, const std::map &user_params); /// @brief Deserialize a Halide pipeline from a byte buffer containing a serizalized pipeline in binary format /// @param data The data buffer containing a serialized Halide pipeline -/// @param external_params Map of named input/output parameters to bind with the resulting pipeline (used to avoid deserializing specific objects and enable the use of externally defined ones instead). +/// @param user_params Map of named input/output parameters to bind with the resulting pipeline (used to avoid deserializing specific objects and enable the use of externally defined ones instead). /// @return Returns a newly constructed deserialized Pipeline object/ -Pipeline deserialize_pipeline(const std::vector &data, const std::map &external_params); +Pipeline deserialize_pipeline(const std::vector &data, const std::map &user_params); + +/// @brief Deserialize the external parameters for the Halide pipeline from a file. +/// This method allows a minimal deserialization of just the external pipeline parameters, so they can be +/// remapped and overridden with user parameters prior to deserializing the pipeline definition. +/// @param filename The location of the file to deserialize. Must use .hlpipe extension. 
+/// @return Returns a map containing the names and description of external parameters referenced in the pipeline +std::map deserialize_parameters(const std::string &filename); + +/// @brief Deserialize the external parameters for the Halide pipeline from input stream. +/// This method allows a minimal deserialization of just the external pipeline parameters, so they can be +/// remapped and overridden with user parameters prior to deserializing the pipeline definition. +/// @param in The input stream to read from containing a serialized Halide pipeline +/// @return Returns a map containing the names and description of external parameters referenced in the pipeline +std::map deserialize_parameters(std::istream &in); + +/// @brief Deserialize the external parameters for the Halide pipeline from a byte buffer containing a serialized +/// pipeline in binary format. This method allows a minimal deserialization of just the external pipeline +/// parameters, so they can be remapped and overridden with user parameters prior to deserializing the +/// pipeline definition. 
+/// @param data The data buffer containing a serialized Halide pipeline +/// @return Returns a map containing the names and description of external parameters referenced in the pipeline +std::map deserialize_parameters(const std::vector &data); } // namespace Halide diff --git a/src/Serialization.cpp b/src/Serialization.cpp index c85eaa15e1aa..e29c7e053179 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -127,6 +127,8 @@ class Serializer { Offset serialize_parameter(FlatBufferBuilder &builder, const Parameter ¶meter); + Offset serialize_external_parameter(FlatBufferBuilder &builder, const Parameter ¶meter); + Offset serialize_extern_func_argument(FlatBufferBuilder &builder, const ExternFuncArgument &extern_func_argument); Offset serialize_buffer(FlatBufferBuilder &builder, Buffer<> buffer); @@ -1351,6 +1353,14 @@ Offset Serializer::serialize_parameter(FlatBufferBuilder & } } +Offset Serializer::serialize_external_parameter(FlatBufferBuilder &builder, const Parameter ¶meter) { + const auto type_serialized = serialize_type(builder, parameter.type()); + const int dimensions = parameter.dimensions(); + const auto name_serialized = serialize_string(builder, parameter.name()); + const bool is_buffer = parameter.is_buffer(); + return Serialize::CreateExternalParameter(builder, is_buffer, type_serialized, dimensions, name_serialized); +} + Offset Serializer::serialize_extern_func_argument(FlatBufferBuilder &builder, const ExternFuncArgument &extern_func_argument) { const auto arg_type_serialized = serialize_extern_func_argument_type(extern_func_argument.arg_type); if (extern_func_argument.arg_type == ExternFuncArgument::ArgType::UndefinedArg) { @@ -1472,12 +1482,19 @@ void Serializer::serialize(const Pipeline &pipeline, std::vector &resul std::vector> parameters_serialized; parameters_serialized.reserve(parameters_in_pipeline.size()); for (const auto ¶m : parameters_in_pipeline) { - // we only serialize internal parameters with the pipeline + // we only 
serialize the definitions of internal parameters with the pipeline if (external_parameters.find(param.first) == external_parameters.end()) { parameters_serialized.push_back(serialize_parameter(builder, param.second)); } } + // Serialize only the metadata describing external parameters (to allow them to be created with defaults upon deserialization) + std::vector> external_parameters_serialized; + external_parameters_serialized.reserve(external_parameters.size()); + for (const auto ¶m : external_parameters) { + external_parameters_serialized.push_back(serialize_external_parameter(builder, param.second)); + } + std::vector> buffers_serialized; buffers_serialized.reserve(buffers_in_pipeline.size()); for (auto &buffer : buffers_in_pipeline) { @@ -1491,6 +1508,7 @@ void Serializer::serialize(const Pipeline &pipeline, std::vector &resul builder.CreateVector(requirements_serialized), builder.CreateVector(func_names_in_order_serialized), builder.CreateVector(parameters_serialized), + builder.CreateVector(external_parameters_serialized), builder.CreateVector(buffers_serialized)); builder.Finish(pipeline_obj); diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index f3d27e83a62a..479e488b6739 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -640,6 +640,13 @@ table Parameter { scalar_estimate: Expr; } +table ExternalParameter { + is_buffer: bool; + type: Type; + dimensions: int32; + name: string; +} + enum ExternFuncArgumentType: ubyte { UndefinedArg, FuncArg, @@ -701,6 +708,7 @@ table Pipeline { requirements: [Stmt]; func_names_in_order: [string]; parameters: [Parameter]; + external_parameters: [ExternalParameter]; buffers: [Buffer]; } diff --git a/tutorial/lesson_23_serialization.cpp b/tutorial/lesson_23_serialization.cpp index a01de5f916fd..f383debbcb7f 100644 --- a/tutorial/lesson_23_serialization.cpp +++ b/tutorial/lesson_23_serialization.cpp @@ -108,11 +108,9 @@ int main(int argc, char **argv) { { // Lets do the same thing again ... 
construct a new pipeline from scratch by deserializing the file we wrote to disk - // FIXME: We shouldn't have to populate the params ... but passing an empty map triggers an error in deserialize - // for a missing input param - std::map params; - ImageParam input(UInt(8), 3, "input"); - params.insert({"input", input.parameter()}); + // First we can deserialize the external parameters (useful in the event we want to remap them + // and replace the definitions with our own user parameter definitions) + std::map params = deserialize_parameters("blur.hlpipe"); // Now deserialize the pipeline from file Pipeline blur_pipeline = deserialize_pipeline("blur.hlpipe", params); From 9ce5fd6cca6f7d99035817e7ae4afb5e8989374f Mon Sep 17 00:00:00 2001 From: James Price Date: Tue, 28 Nov 2023 09:54:03 -0500 Subject: [PATCH 13/15] [WebGPU] Update to latest native headers (#7932) * [WebGPU] Update to latest native headers * Update mini_webgpu.h with latest version from Dawn * Document this process * Remove an argument from wgpuQueueOnSubmittedWorkDone Fixes #7581 * [WebGPU] Note that wgpu is not yet supported * [WebGPU] Add https:// to external links in README * update to commit b5d38fc7dc2a20081312c95e379c4a918df8b7d4 * Update mini_webgpu.h --------- Co-authored-by: Steven Johnson --- README_webgpu.md | 26 ++- src/runtime/mini_webgpu.h | 402 +++++++++++++++++++++++--------------- src/runtime/webgpu.cpp | 2 +- 3 files changed, 269 insertions(+), 161 deletions(-) diff --git a/README_webgpu.md b/README_webgpu.md index 6fba268000c7..684d4b966488 100644 --- a/README_webgpu.md +++ b/README_webgpu.md @@ -36,7 +36,7 @@ When invoking `emcc` to link Halide-generated objects, include these flags: `-s USE_WEBGPU=1 -s ASYNCIFY`. Tests that use AOT compilation can be run using a native WebGPU implementation -that has Node.js bindings, such as [Dawn](dawn.googlesource.com/dawn/). +that has Node.js bindings, such as [Dawn](https://dawn.googlesource.com/dawn/). 
You must set an environment variable named `HL_WEBGPU_NODE_BINDINGS` that has an absolute path to the bindings to run these tests, e.g. `HL_WEBGPU_NODE_BINDINGS=/path/to/dawn.node`. @@ -47,13 +47,18 @@ JIT compilation is not supported when using WebGPU with WASM. ## Running natively: `HL_TARGET=host-webgpu` -> _Tested with top-of-tree Dawn as of 2023-03-14._ +> _Tested with top-of-tree Dawn as of 2023-11-27 [commit b5d38fc7dc2a20081312c95e379c4a918df8b7d4]._ For testing purposes, Halide can also target native WebGPU libraries, such as -[Dawn](dawn.googlesource.com/dawn/) or [wgpu](github.com/gfx-rs/wgpu). +[Dawn](https://dawn.googlesource.com/dawn/) or +[wgpu](https://github.com/gfx-rs/wgpu). This is currently the only path that can run the JIT correctness tests. See [below](#setting-up-dawn) for instructions on building Dawn. +> Note that as of 2023-11-27, wgpu is not supported due to +> [lacking `override` support for WGSL](https://github.com/gfx-rs/wgpu/issues/1762) +> which we require > in order to set GPU block sizes. + When targeting WebGPU with a native target, Halide defaults to looking for a build of Dawn (with several common names and suffixes); you can override this by setting the `HL_WEBGPU_NATIVE_LIB` environment variable to the absolute path @@ -71,7 +76,7 @@ will be selected based on the Halide target specified. Building Dawn's Node.js bindings currently requires using CMake. -First, [install `depot_tools`](commondatastorage.googleapis.com/chrome-infra-docs/flat/depot_tools/docs/html/depot_tools_tutorial.html#_setting_up) and add it to the +First, [install `depot_tools`](https://commondatastorage.googleapis.com/chrome-infra-docs/flat/depot_tools/docs/html/depot_tools_tutorial.html#_setting_up) and add it to the `PATH` environment variable. 
Next, get Dawn and its dependencies: @@ -108,3 +113,16 @@ This will produce the following artifacts: These paths can then be used for the `HL_WEBGPU_NODE_BINDINGS` and `HL_WEBGPU_NATIVE_LIB` environment variables when using Halide. + +## Updating mini_webgpu.h + +The recommended method for updating `mini_webgpu.h` is to copy the +`gen/include/dawn/webgpu.h` file from the Dawn build directory, then: +- Restore the `// clang-format {off,on}` lines. +- Comment out the `#include ` lines. +- Remove the `void` parameter from the `WGPUProc` declaration. + +This guarantees a version of the WebGPU header that is compatible with Dawn. +When the native API eventually stabilizes, it should be possible to obtain a +header from the `webgpu-native` GitHub organization that will be compatible +with Dawn, wgpu, and Emscripten. diff --git a/src/runtime/mini_webgpu.h b/src/runtime/mini_webgpu.h index a3ebc9317572..5a766d1a80c3 100644 --- a/src/runtime/mini_webgpu.h +++ b/src/runtime/mini_webgpu.h @@ -69,19 +69,19 @@ #define WGPU_NULLABLE #endif -#define WGPU_BREAKING_CHANGE_COUNT_RENAME - // Don't include these in Halide runtime // #include // #include -#define WGPU_ARRAY_LAYER_COUNT_UNDEFINED (0xffffffffUL) -#define WGPU_COPY_STRIDE_UNDEFINED (0xffffffffUL) -#define WGPU_LIMIT_U32_UNDEFINED (0xffffffffUL) -#define WGPU_LIMIT_U64_UNDEFINED (0xffffffffffffffffULL) -#define WGPU_MIP_LEVEL_COUNT_UNDEFINED (0xffffffffUL) +#define WGPU_ARRAY_LAYER_COUNT_UNDEFINED UINT32_MAX +#define WGPU_COPY_STRIDE_UNDEFINED UINT32_MAX +#define WGPU_DEPTH_SLICE_UNDEFINED (0xffffffffUL) +#define WGPU_LIMIT_U32_UNDEFINED UINT32_MAX +#define WGPU_LIMIT_U64_UNDEFINED UINT64_MAX +#define WGPU_MIP_LEVEL_COUNT_UNDEFINED UINT32_MAX +#define WGPU_QUERY_SET_INDEX_UNDEFINED UINT32_MAX #define WGPU_WHOLE_MAP_SIZE SIZE_MAX -#define WGPU_WHOLE_SIZE (0xffffffffffffffffULL) +#define WGPU_WHOLE_SIZE UINT64_MAX typedef uint32_t WGPUFlags; typedef uint32_t WGPUBool; @@ -119,27 +119,32 @@ struct WGPUBindGroupEntry; struct 
WGPUBlendComponent; struct WGPUBufferBindingLayout; struct WGPUBufferDescriptor; +struct WGPUBufferHostMappedPointer; +struct WGPUBufferMapCallbackInfo; struct WGPUColor; struct WGPUCommandBufferDescriptor; struct WGPUCommandEncoderDescriptor; struct WGPUCompilationMessage; -struct WGPUComputePassTimestampWrite; +struct WGPUComputePassTimestampWrites; struct WGPUConstantEntry; struct WGPUCopyTextureForBrowserOptions; struct WGPUDawnAdapterPropertiesPowerPreference; struct WGPUDawnBufferDescriptorErrorInfoFromWireClient; struct WGPUDawnCacheDeviceDescriptor; struct WGPUDawnEncoderInternalUsageDescriptor; +struct WGPUDawnExperimentalSubgroupLimits; struct WGPUDawnMultisampleStateRenderToSingleSampled; struct WGPUDawnRenderPassColorAttachmentRenderToSingleSampled; struct WGPUDawnShaderModuleSPIRVOptionsDescriptor; struct WGPUDawnTextureInternalUsageDescriptor; struct WGPUDawnTogglesDescriptor; +struct WGPUDepthStencilStateDepthWriteDefinedDawn; struct WGPUExtent2D; struct WGPUExtent3D; struct WGPUExternalTextureBindingEntry; struct WGPUExternalTextureBindingLayout; -struct WGPUInstanceDescriptor; +struct WGPUFuture; +struct WGPUInstanceFeatures; struct WGPULimits; struct WGPUMultisampleState; struct WGPUOrigin2D; @@ -150,11 +155,13 @@ struct WGPUPrimitiveDepthClipControl; struct WGPUPrimitiveState; struct WGPUQuerySetDescriptor; struct WGPUQueueDescriptor; +struct WGPUQueueWorkDoneCallbackInfo; struct WGPURenderBundleDescriptor; struct WGPURenderBundleEncoderDescriptor; struct WGPURenderPassDepthStencilAttachment; struct WGPURenderPassDescriptorMaxDrawCount; -struct WGPURenderPassTimestampWrite; +struct WGPURenderPassTimestampWrites; +struct WGPURequestAdapterCallbackInfo; struct WGPURequestAdapterOptions; struct WGPUSamplerBindingLayout; struct WGPUSamplerDescriptor; @@ -199,6 +206,7 @@ struct WGPUSurfaceDescriptorFromWindowsSwapChainPanel; struct WGPUSurfaceDescriptorFromXlibWindow; struct WGPUSwapChainDescriptor; struct WGPUTextureBindingLayout; +struct 
WGPUTextureBindingViewDimensionDescriptor; struct WGPUTextureDataLayout; struct WGPUTextureViewDescriptor; struct WGPUVertexAttribute; @@ -209,9 +217,11 @@ struct WGPUCompilationInfo; struct WGPUComputePassDescriptor; struct WGPUDepthStencilState; struct WGPUExternalTextureDescriptor; +struct WGPUFutureWaitInfo; struct WGPUImageCopyBuffer; struct WGPUImageCopyExternalTexture; struct WGPUImageCopyTexture; +struct WGPUInstanceDescriptor; struct WGPUPipelineLayoutPixelLocalStorage; struct WGPUProgrammableStageDescriptor; struct WGPURenderPassColorAttachment; @@ -325,6 +335,13 @@ typedef enum WGPUBufferMapState { WGPUBufferMapState_Force32 = 0x7FFFFFFF } WGPUBufferMapState WGPU_ENUM_ATTRIBUTE; +typedef enum WGPUCallbackMode { + WGPUCallbackMode_WaitAnyOnly = 0x00000000, + WGPUCallbackMode_AllowProcessEvents = 0x00000001, + WGPUCallbackMode_AllowSpontaneous = 0x00000002, + WGPUCallbackMode_Force32 = 0x7FFFFFFF +} WGPUCallbackMode WGPU_ENUM_ATTRIBUTE; + typedef enum WGPUCompareFunction { WGPUCompareFunction_Undefined = 0x00000000, WGPUCompareFunction_Never = 0x00000001, @@ -353,12 +370,6 @@ typedef enum WGPUCompilationMessageType { WGPUCompilationMessageType_Force32 = 0x7FFFFFFF } WGPUCompilationMessageType WGPU_ENUM_ATTRIBUTE; -typedef enum WGPUComputePassTimestampLocation { - WGPUComputePassTimestampLocation_Beginning = 0x00000000, - WGPUComputePassTimestampLocation_End = 0x00000001, - WGPUComputePassTimestampLocation_Force32 = 0x7FFFFFFF -} WGPUComputePassTimestampLocation WGPU_ENUM_ATTRIBUTE; - typedef enum WGPUCreatePipelineAsyncStatus { WGPUCreatePipelineAsyncStatus_Success = 0x00000000, WGPUCreatePipelineAsyncStatus_ValidationError = 0x00000001, @@ -412,20 +423,19 @@ typedef enum WGPUFeatureName { WGPUFeatureName_DepthClipControl = 0x00000001, WGPUFeatureName_Depth32FloatStencil8 = 0x00000002, WGPUFeatureName_TimestampQuery = 0x00000003, - WGPUFeatureName_PipelineStatisticsQuery = 0x00000004, - WGPUFeatureName_TextureCompressionBC = 0x00000005, - 
WGPUFeatureName_TextureCompressionETC2 = 0x00000006, - WGPUFeatureName_TextureCompressionASTC = 0x00000007, - WGPUFeatureName_IndirectFirstInstance = 0x00000008, - WGPUFeatureName_ShaderF16 = 0x00000009, - WGPUFeatureName_RG11B10UfloatRenderable = 0x0000000A, - WGPUFeatureName_BGRA8UnormStorage = 0x0000000B, - WGPUFeatureName_Float32Filterable = 0x0000000C, + WGPUFeatureName_TextureCompressionBC = 0x00000004, + WGPUFeatureName_TextureCompressionETC2 = 0x00000005, + WGPUFeatureName_TextureCompressionASTC = 0x00000006, + WGPUFeatureName_IndirectFirstInstance = 0x00000007, + WGPUFeatureName_ShaderF16 = 0x00000008, + WGPUFeatureName_RG11B10UfloatRenderable = 0x00000009, + WGPUFeatureName_BGRA8UnormStorage = 0x0000000A, + WGPUFeatureName_Float32Filterable = 0x0000000B, WGPUFeatureName_DawnInternalUsages = 0x000003EA, WGPUFeatureName_DawnMultiPlanarFormats = 0x000003EB, WGPUFeatureName_DawnNative = 0x000003EC, WGPUFeatureName_ChromiumExperimentalDp4a = 0x000003ED, - WGPUFeatureName_TimestampQueryInsidePasses = 0x000003EE, + WGPUFeatureName_ChromiumExperimentalTimestampQueryInsidePasses = 0x000003EE, WGPUFeatureName_ImplicitDeviceSynchronization = 0x000003EF, WGPUFeatureName_SurfaceCapabilities = 0x000003F0, WGPUFeatureName_TransientAttachments = 0x000003F1, @@ -435,11 +445,16 @@ typedef enum WGPUFeatureName { WGPUFeatureName_ANGLETextureSharing = 0x000003F5, WGPUFeatureName_ChromiumExperimentalSubgroups = 0x000003F6, WGPUFeatureName_ChromiumExperimentalSubgroupUniformControlFlow = 0x000003F7, - WGPUFeatureName_ChromiumExperimentalReadWriteStorageTexture = 0x000003F8, WGPUFeatureName_PixelLocalStorageCoherent = 0x000003F9, WGPUFeatureName_PixelLocalStorageNonCoherent = 0x000003FA, WGPUFeatureName_Norm16TextureFormats = 0x000003FB, WGPUFeatureName_MultiPlanarFormatExtendedUsages = 0x000003FC, + WGPUFeatureName_MultiPlanarFormatP010 = 0x000003FD, + WGPUFeatureName_HostMappedPointer = 0x000003FE, + WGPUFeatureName_MultiPlanarRenderTargets = 0x000003FF, + 
WGPUFeatureName_MultiPlanarFormatNv12a = 0x00000400, + WGPUFeatureName_FramebufferFetch = 0x00000401, + WGPUFeatureName_BufferMapExtendedUsages = 0x00000402, WGPUFeatureName_SharedTextureMemoryVkDedicatedAllocation = 0x0000044C, WGPUFeatureName_SharedTextureMemoryAHardwareBuffer = 0x0000044D, WGPUFeatureName_SharedTextureMemoryDmaBuf = 0x0000044E, @@ -497,15 +512,6 @@ typedef enum WGPUMipmapFilterMode { WGPUMipmapFilterMode_Force32 = 0x7FFFFFFF } WGPUMipmapFilterMode WGPU_ENUM_ATTRIBUTE; -typedef enum WGPUPipelineStatisticName { - WGPUPipelineStatisticName_VertexShaderInvocations = 0x00000000, - WGPUPipelineStatisticName_ClipperInvocations = 0x00000001, - WGPUPipelineStatisticName_ClipperPrimitivesOut = 0x00000002, - WGPUPipelineStatisticName_FragmentShaderInvocations = 0x00000003, - WGPUPipelineStatisticName_ComputeShaderInvocations = 0x00000004, - WGPUPipelineStatisticName_Force32 = 0x7FFFFFFF -} WGPUPipelineStatisticName WGPU_ENUM_ATTRIBUTE; - typedef enum WGPUPowerPreference { WGPUPowerPreference_Undefined = 0x00000000, WGPUPowerPreference_LowPower = 0x00000001, @@ -531,8 +537,7 @@ typedef enum WGPUPrimitiveTopology { typedef enum WGPUQueryType { WGPUQueryType_Occlusion = 0x00000000, - WGPUQueryType_PipelineStatistics = 0x00000001, - WGPUQueryType_Timestamp = 0x00000002, + WGPUQueryType_Timestamp = 0x00000001, WGPUQueryType_Force32 = 0x7FFFFFFF } WGPUQueryType WGPU_ENUM_ATTRIBUTE; @@ -544,12 +549,6 @@ typedef enum WGPUQueueWorkDoneStatus { WGPUQueueWorkDoneStatus_Force32 = 0x7FFFFFFF } WGPUQueueWorkDoneStatus WGPU_ENUM_ATTRIBUTE; -typedef enum WGPURenderPassTimestampLocation { - WGPURenderPassTimestampLocation_Beginning = 0x00000000, - WGPURenderPassTimestampLocation_End = 0x00000001, - WGPURenderPassTimestampLocation_Force32 = 0x7FFFFFFF -} WGPURenderPassTimestampLocation WGPU_ENUM_ATTRIBUTE; - typedef enum WGPURequestAdapterStatus { WGPURequestAdapterStatus_Success = 0x00000000, WGPURequestAdapterStatus_Unavailable = 0x00000001, @@ -581,6 +580,8 @@ typedef 
enum WGPUSType { WGPUSType_ExternalTextureBindingLayout = 0x0000000D, WGPUSType_SurfaceDescriptorFromWindowsSwapChainPanel = 0x0000000E, WGPUSType_RenderPassDescriptorMaxDrawCount = 0x0000000F, + WGPUSType_DepthStencilStateDepthWriteDefinedDawn = 0x00000010, + WGPUSType_TextureBindingViewDimensionDescriptor = 0x00000011, WGPUSType_DawnTextureInternalUsageDescriptor = 0x000003E8, WGPUSType_DawnEncoderInternalUsageDescriptor = 0x000003EB, WGPUSType_DawnInstanceDescriptor = 0x000003EC, @@ -591,10 +592,13 @@ typedef enum WGPUSType { WGPUSType_DawnShaderModuleSPIRVOptionsDescriptor = 0x000003F1, WGPUSType_RequestAdapterOptionsLUID = 0x000003F2, WGPUSType_RequestAdapterOptionsGetGLProc = 0x000003F3, - WGPUSType_DawnMultisampleStateRenderToSingleSampled = 0x000003F4, - WGPUSType_DawnRenderPassColorAttachmentRenderToSingleSampled = 0x000003F5, - WGPUSType_RenderPassPixelLocalStorage = 0x000003F6, - WGPUSType_PipelineLayoutPixelLocalStorage = 0x000003F7, + WGPUSType_RequestAdapterOptionsD3D11Device = 0x000003F4, + WGPUSType_DawnMultisampleStateRenderToSingleSampled = 0x000003F5, + WGPUSType_DawnRenderPassColorAttachmentRenderToSingleSampled = 0x000003F6, + WGPUSType_RenderPassPixelLocalStorage = 0x000003F7, + WGPUSType_PipelineLayoutPixelLocalStorage = 0x000003F8, + WGPUSType_BufferHostMappedPointer = 0x000003F9, + WGPUSType_DawnExperimentalSubgroupLimits = 0x000003FA, WGPUSType_SharedTextureMemoryVkImageDescriptor = 0x0000044C, WGPUSType_SharedTextureMemoryVkDedicatedAllocationDescriptor = 0x0000044D, WGPUSType_SharedTextureMemoryAHardwareBufferDescriptor = 0x0000044E, @@ -673,6 +677,7 @@ typedef enum WGPUTextureAspect { WGPUTextureAspect_DepthOnly = 0x00000002, WGPUTextureAspect_Plane0Only = 0x00000003, WGPUTextureAspect_Plane1Only = 0x00000004, + WGPUTextureAspect_Plane2Only = 0x00000005, WGPUTextureAspect_Force32 = 0x7FFFFFFF } WGPUTextureAspect WGPU_ENUM_ATTRIBUTE; @@ -709,83 +714,86 @@ typedef enum WGPUTextureFormat { WGPUTextureFormat_RGBA8Sint = 0x00000016, 
WGPUTextureFormat_BGRA8Unorm = 0x00000017, WGPUTextureFormat_BGRA8UnormSrgb = 0x00000018, - WGPUTextureFormat_RGB10A2Unorm = 0x00000019, - WGPUTextureFormat_RG11B10Ufloat = 0x0000001A, - WGPUTextureFormat_RGB9E5Ufloat = 0x0000001B, - WGPUTextureFormat_RG32Float = 0x0000001C, - WGPUTextureFormat_RG32Uint = 0x0000001D, - WGPUTextureFormat_RG32Sint = 0x0000001E, - WGPUTextureFormat_RGBA16Uint = 0x0000001F, - WGPUTextureFormat_RGBA16Sint = 0x00000020, - WGPUTextureFormat_RGBA16Float = 0x00000021, - WGPUTextureFormat_RGBA32Float = 0x00000022, - WGPUTextureFormat_RGBA32Uint = 0x00000023, - WGPUTextureFormat_RGBA32Sint = 0x00000024, - WGPUTextureFormat_Stencil8 = 0x00000025, - WGPUTextureFormat_Depth16Unorm = 0x00000026, - WGPUTextureFormat_Depth24Plus = 0x00000027, - WGPUTextureFormat_Depth24PlusStencil8 = 0x00000028, - WGPUTextureFormat_Depth32Float = 0x00000029, - WGPUTextureFormat_Depth32FloatStencil8 = 0x0000002A, - WGPUTextureFormat_BC1RGBAUnorm = 0x0000002B, - WGPUTextureFormat_BC1RGBAUnormSrgb = 0x0000002C, - WGPUTextureFormat_BC2RGBAUnorm = 0x0000002D, - WGPUTextureFormat_BC2RGBAUnormSrgb = 0x0000002E, - WGPUTextureFormat_BC3RGBAUnorm = 0x0000002F, - WGPUTextureFormat_BC3RGBAUnormSrgb = 0x00000030, - WGPUTextureFormat_BC4RUnorm = 0x00000031, - WGPUTextureFormat_BC4RSnorm = 0x00000032, - WGPUTextureFormat_BC5RGUnorm = 0x00000033, - WGPUTextureFormat_BC5RGSnorm = 0x00000034, - WGPUTextureFormat_BC6HRGBUfloat = 0x00000035, - WGPUTextureFormat_BC6HRGBFloat = 0x00000036, - WGPUTextureFormat_BC7RGBAUnorm = 0x00000037, - WGPUTextureFormat_BC7RGBAUnormSrgb = 0x00000038, - WGPUTextureFormat_ETC2RGB8Unorm = 0x00000039, - WGPUTextureFormat_ETC2RGB8UnormSrgb = 0x0000003A, - WGPUTextureFormat_ETC2RGB8A1Unorm = 0x0000003B, - WGPUTextureFormat_ETC2RGB8A1UnormSrgb = 0x0000003C, - WGPUTextureFormat_ETC2RGBA8Unorm = 0x0000003D, - WGPUTextureFormat_ETC2RGBA8UnormSrgb = 0x0000003E, - WGPUTextureFormat_EACR11Unorm = 0x0000003F, - WGPUTextureFormat_EACR11Snorm = 0x00000040, - 
WGPUTextureFormat_EACRG11Unorm = 0x00000041, - WGPUTextureFormat_EACRG11Snorm = 0x00000042, - WGPUTextureFormat_ASTC4x4Unorm = 0x00000043, - WGPUTextureFormat_ASTC4x4UnormSrgb = 0x00000044, - WGPUTextureFormat_ASTC5x4Unorm = 0x00000045, - WGPUTextureFormat_ASTC5x4UnormSrgb = 0x00000046, - WGPUTextureFormat_ASTC5x5Unorm = 0x00000047, - WGPUTextureFormat_ASTC5x5UnormSrgb = 0x00000048, - WGPUTextureFormat_ASTC6x5Unorm = 0x00000049, - WGPUTextureFormat_ASTC6x5UnormSrgb = 0x0000004A, - WGPUTextureFormat_ASTC6x6Unorm = 0x0000004B, - WGPUTextureFormat_ASTC6x6UnormSrgb = 0x0000004C, - WGPUTextureFormat_ASTC8x5Unorm = 0x0000004D, - WGPUTextureFormat_ASTC8x5UnormSrgb = 0x0000004E, - WGPUTextureFormat_ASTC8x6Unorm = 0x0000004F, - WGPUTextureFormat_ASTC8x6UnormSrgb = 0x00000050, - WGPUTextureFormat_ASTC8x8Unorm = 0x00000051, - WGPUTextureFormat_ASTC8x8UnormSrgb = 0x00000052, - WGPUTextureFormat_ASTC10x5Unorm = 0x00000053, - WGPUTextureFormat_ASTC10x5UnormSrgb = 0x00000054, - WGPUTextureFormat_ASTC10x6Unorm = 0x00000055, - WGPUTextureFormat_ASTC10x6UnormSrgb = 0x00000056, - WGPUTextureFormat_ASTC10x8Unorm = 0x00000057, - WGPUTextureFormat_ASTC10x8UnormSrgb = 0x00000058, - WGPUTextureFormat_ASTC10x10Unorm = 0x00000059, - WGPUTextureFormat_ASTC10x10UnormSrgb = 0x0000005A, - WGPUTextureFormat_ASTC12x10Unorm = 0x0000005B, - WGPUTextureFormat_ASTC12x10UnormSrgb = 0x0000005C, - WGPUTextureFormat_ASTC12x12Unorm = 0x0000005D, - WGPUTextureFormat_ASTC12x12UnormSrgb = 0x0000005E, - WGPUTextureFormat_R16Unorm = 0x0000005F, - WGPUTextureFormat_RG16Unorm = 0x00000060, - WGPUTextureFormat_RGBA16Unorm = 0x00000061, - WGPUTextureFormat_R16Snorm = 0x00000062, - WGPUTextureFormat_RG16Snorm = 0x00000063, - WGPUTextureFormat_RGBA16Snorm = 0x00000064, - WGPUTextureFormat_R8BG8Biplanar420Unorm = 0x00000065, + WGPUTextureFormat_RGB10A2Uint = 0x00000019, + WGPUTextureFormat_RGB10A2Unorm = 0x0000001A, + WGPUTextureFormat_RG11B10Ufloat = 0x0000001B, + WGPUTextureFormat_RGB9E5Ufloat = 0x0000001C, + 
WGPUTextureFormat_RG32Float = 0x0000001D, + WGPUTextureFormat_RG32Uint = 0x0000001E, + WGPUTextureFormat_RG32Sint = 0x0000001F, + WGPUTextureFormat_RGBA16Uint = 0x00000020, + WGPUTextureFormat_RGBA16Sint = 0x00000021, + WGPUTextureFormat_RGBA16Float = 0x00000022, + WGPUTextureFormat_RGBA32Float = 0x00000023, + WGPUTextureFormat_RGBA32Uint = 0x00000024, + WGPUTextureFormat_RGBA32Sint = 0x00000025, + WGPUTextureFormat_Stencil8 = 0x00000026, + WGPUTextureFormat_Depth16Unorm = 0x00000027, + WGPUTextureFormat_Depth24Plus = 0x00000028, + WGPUTextureFormat_Depth24PlusStencil8 = 0x00000029, + WGPUTextureFormat_Depth32Float = 0x0000002A, + WGPUTextureFormat_Depth32FloatStencil8 = 0x0000002B, + WGPUTextureFormat_BC1RGBAUnorm = 0x0000002C, + WGPUTextureFormat_BC1RGBAUnormSrgb = 0x0000002D, + WGPUTextureFormat_BC2RGBAUnorm = 0x0000002E, + WGPUTextureFormat_BC2RGBAUnormSrgb = 0x0000002F, + WGPUTextureFormat_BC3RGBAUnorm = 0x00000030, + WGPUTextureFormat_BC3RGBAUnormSrgb = 0x00000031, + WGPUTextureFormat_BC4RUnorm = 0x00000032, + WGPUTextureFormat_BC4RSnorm = 0x00000033, + WGPUTextureFormat_BC5RGUnorm = 0x00000034, + WGPUTextureFormat_BC5RGSnorm = 0x00000035, + WGPUTextureFormat_BC6HRGBUfloat = 0x00000036, + WGPUTextureFormat_BC6HRGBFloat = 0x00000037, + WGPUTextureFormat_BC7RGBAUnorm = 0x00000038, + WGPUTextureFormat_BC7RGBAUnormSrgb = 0x00000039, + WGPUTextureFormat_ETC2RGB8Unorm = 0x0000003A, + WGPUTextureFormat_ETC2RGB8UnormSrgb = 0x0000003B, + WGPUTextureFormat_ETC2RGB8A1Unorm = 0x0000003C, + WGPUTextureFormat_ETC2RGB8A1UnormSrgb = 0x0000003D, + WGPUTextureFormat_ETC2RGBA8Unorm = 0x0000003E, + WGPUTextureFormat_ETC2RGBA8UnormSrgb = 0x0000003F, + WGPUTextureFormat_EACR11Unorm = 0x00000040, + WGPUTextureFormat_EACR11Snorm = 0x00000041, + WGPUTextureFormat_EACRG11Unorm = 0x00000042, + WGPUTextureFormat_EACRG11Snorm = 0x00000043, + WGPUTextureFormat_ASTC4x4Unorm = 0x00000044, + WGPUTextureFormat_ASTC4x4UnormSrgb = 0x00000045, + WGPUTextureFormat_ASTC5x4Unorm = 0x00000046, + 
WGPUTextureFormat_ASTC5x4UnormSrgb = 0x00000047, + WGPUTextureFormat_ASTC5x5Unorm = 0x00000048, + WGPUTextureFormat_ASTC5x5UnormSrgb = 0x00000049, + WGPUTextureFormat_ASTC6x5Unorm = 0x0000004A, + WGPUTextureFormat_ASTC6x5UnormSrgb = 0x0000004B, + WGPUTextureFormat_ASTC6x6Unorm = 0x0000004C, + WGPUTextureFormat_ASTC6x6UnormSrgb = 0x0000004D, + WGPUTextureFormat_ASTC8x5Unorm = 0x0000004E, + WGPUTextureFormat_ASTC8x5UnormSrgb = 0x0000004F, + WGPUTextureFormat_ASTC8x6Unorm = 0x00000050, + WGPUTextureFormat_ASTC8x6UnormSrgb = 0x00000051, + WGPUTextureFormat_ASTC8x8Unorm = 0x00000052, + WGPUTextureFormat_ASTC8x8UnormSrgb = 0x00000053, + WGPUTextureFormat_ASTC10x5Unorm = 0x00000054, + WGPUTextureFormat_ASTC10x5UnormSrgb = 0x00000055, + WGPUTextureFormat_ASTC10x6Unorm = 0x00000056, + WGPUTextureFormat_ASTC10x6UnormSrgb = 0x00000057, + WGPUTextureFormat_ASTC10x8Unorm = 0x00000058, + WGPUTextureFormat_ASTC10x8UnormSrgb = 0x00000059, + WGPUTextureFormat_ASTC10x10Unorm = 0x0000005A, + WGPUTextureFormat_ASTC10x10UnormSrgb = 0x0000005B, + WGPUTextureFormat_ASTC12x10Unorm = 0x0000005C, + WGPUTextureFormat_ASTC12x10UnormSrgb = 0x0000005D, + WGPUTextureFormat_ASTC12x12Unorm = 0x0000005E, + WGPUTextureFormat_ASTC12x12UnormSrgb = 0x0000005F, + WGPUTextureFormat_R16Unorm = 0x00000060, + WGPUTextureFormat_RG16Unorm = 0x00000061, + WGPUTextureFormat_RGBA16Unorm = 0x00000062, + WGPUTextureFormat_R16Snorm = 0x00000063, + WGPUTextureFormat_RG16Snorm = 0x00000064, + WGPUTextureFormat_RGBA16Snorm = 0x00000065, + WGPUTextureFormat_R8BG8Biplanar420Unorm = 0x00000066, + WGPUTextureFormat_R10X6BG10X6Biplanar420Unorm = 0x00000067, + WGPUTextureFormat_R8BG8A8Triplanar420Unorm = 0x00000068, WGPUTextureFormat_Force32 = 0x7FFFFFFF } WGPUTextureFormat WGPU_ENUM_ATTRIBUTE; @@ -842,6 +850,7 @@ typedef enum WGPUVertexFormat { WGPUVertexFormat_Sint32x2 = 0x0000001C, WGPUVertexFormat_Sint32x3 = 0x0000001D, WGPUVertexFormat_Sint32x4 = 0x0000001E, + WGPUVertexFormat_Unorm10_10_10_2 = 0x0000001F, 
WGPUVertexFormat_Force32 = 0x7FFFFFFF } WGPUVertexFormat WGPU_ENUM_ATTRIBUTE; @@ -852,6 +861,16 @@ typedef enum WGPUVertexStepMode { WGPUVertexStepMode_Force32 = 0x7FFFFFFF } WGPUVertexStepMode WGPU_ENUM_ATTRIBUTE; +typedef enum WGPUWaitStatus { + WGPUWaitStatus_Success = 0x00000000, + WGPUWaitStatus_TimedOut = 0x00000001, + WGPUWaitStatus_UnsupportedTimeout = 0x00000002, + WGPUWaitStatus_UnsupportedCount = 0x00000003, + WGPUWaitStatus_UnsupportedMixedSources = 0x00000004, + WGPUWaitStatus_Unknown = 0x00000005, + WGPUWaitStatus_Force32 = 0x7FFFFFFF +} WGPUWaitStatus WGPU_ENUM_ATTRIBUTE; + typedef enum WGPUBufferUsage { WGPUBufferUsage_None = 0x00000000, WGPUBufferUsage_MapRead = 0x00000001, @@ -910,6 +929,7 @@ typedef enum WGPUTextureUsage { typedef WGPUFlags WGPUTextureUsageFlags WGPU_ENUM_ATTRIBUTE; typedef void (*WGPUBufferMapCallback)(WGPUBufferMapAsyncStatus status, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef void (*WGPUCallback)(void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUCompilationInfoCallback)(WGPUCompilationInfoRequestStatus status, struct WGPUCompilationInfo const * compilationInfo, void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUCreateComputePipelineAsyncCallback)(WGPUCreatePipelineAsyncStatus status, WGPUComputePipeline pipeline, char const * message, void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUCreateRenderPipelineAsyncCallback)(WGPUCreatePipelineAsyncStatus status, WGPURenderPipeline pipeline, char const * message, void * userdata) WGPU_FUNCTION_ATTRIBUTE; @@ -975,6 +995,21 @@ typedef struct WGPUBufferDescriptor { WGPUBool mappedAtCreation; } WGPUBufferDescriptor WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUBufferDescriptor +typedef struct WGPUBufferHostMappedPointer { + WGPUChainedStruct chain; + void * pointer; + WGPUCallback disposeCallback; + void * userdata; +} WGPUBufferHostMappedPointer WGPU_STRUCTURE_ATTRIBUTE; + +typedef struct WGPUBufferMapCallbackInfo { + WGPUChainedStruct const * 
nextInChain; + WGPUCallbackMode mode; + WGPUBufferMapCallback callback; + void * userdata; +} WGPUBufferMapCallbackInfo WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPUColor { double r; double g; @@ -1005,11 +1040,11 @@ typedef struct WGPUCompilationMessage { uint64_t utf16Length; } WGPUCompilationMessage WGPU_STRUCTURE_ATTRIBUTE; -typedef struct WGPUComputePassTimestampWrite { +typedef struct WGPUComputePassTimestampWrites { WGPUQuerySet querySet; - uint32_t queryIndex; - WGPUComputePassTimestampLocation location; -} WGPUComputePassTimestampWrite WGPU_STRUCTURE_ATTRIBUTE; + uint32_t beginningOfPassWriteIndex; + uint32_t endOfPassWriteIndex; +} WGPUComputePassTimestampWrites WGPU_STRUCTURE_ATTRIBUTE; typedef struct WGPUConstantEntry { WGPUChainedStruct const * nextInChain; @@ -1053,6 +1088,13 @@ typedef struct WGPUDawnEncoderInternalUsageDescriptor { WGPUBool useInternalUsages; } WGPUDawnEncoderInternalUsageDescriptor WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUSupportedLimits +typedef struct WGPUDawnExperimentalSubgroupLimits { + WGPUChainedStructOut chain; + uint32_t minSubgroupSize; + uint32_t maxSubgroupSize; +} WGPUDawnExperimentalSubgroupLimits WGPU_STRUCTURE_ATTRIBUTE; + // Can be chained in WGPUMultisampleState typedef struct WGPUDawnMultisampleStateRenderToSingleSampled { WGPUChainedStruct chain; @@ -1088,6 +1130,12 @@ typedef struct WGPUDawnTogglesDescriptor { const char* const * disabledToggles; } WGPUDawnTogglesDescriptor WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUDepthStencilState +typedef struct WGPUDepthStencilStateDepthWriteDefinedDawn { + WGPUChainedStruct chain; + WGPUBool depthWriteDefined; +} WGPUDepthStencilStateDepthWriteDefinedDawn WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPUExtent2D { uint32_t width; uint32_t height; @@ -1110,9 +1158,15 @@ typedef struct WGPUExternalTextureBindingLayout { WGPUChainedStruct chain; } WGPUExternalTextureBindingLayout WGPU_STRUCTURE_ATTRIBUTE; -typedef struct WGPUInstanceDescriptor { +typedef 
struct WGPUFuture { + uint64_t id; +} WGPUFuture WGPU_STRUCTURE_ATTRIBUTE; + +typedef struct WGPUInstanceFeatures { WGPUChainedStruct const * nextInChain; -} WGPUInstanceDescriptor WGPU_STRUCTURE_ATTRIBUTE; + WGPUBool timedWaitAnyEnable; + size_t timedWaitAnyMaxCount; +} WGPUInstanceFeatures WGPU_STRUCTURE_ATTRIBUTE; typedef struct WGPULimits { uint32_t maxTextureDimension1D; @@ -1199,8 +1253,6 @@ typedef struct WGPUQuerySetDescriptor { WGPU_NULLABLE char const * label; WGPUQueryType type; uint32_t count; - WGPUPipelineStatisticName const * pipelineStatistics; - size_t pipelineStatisticCount; } WGPUQuerySetDescriptor WGPU_STRUCTURE_ATTRIBUTE; typedef struct WGPUQueueDescriptor { @@ -1208,6 +1260,13 @@ typedef struct WGPUQueueDescriptor { WGPU_NULLABLE char const * label; } WGPUQueueDescriptor WGPU_STRUCTURE_ATTRIBUTE; +typedef struct WGPUQueueWorkDoneCallbackInfo { + WGPUChainedStruct const * nextInChain; + WGPUCallbackMode mode; + WGPUQueueWorkDoneCallback callback; + void * userdata; +} WGPUQueueWorkDoneCallbackInfo WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPURenderBundleDescriptor { WGPUChainedStruct const * nextInChain; WGPU_NULLABLE char const * label; @@ -1242,11 +1301,18 @@ typedef struct WGPURenderPassDescriptorMaxDrawCount { uint64_t maxDrawCount; } WGPURenderPassDescriptorMaxDrawCount WGPU_STRUCTURE_ATTRIBUTE; -typedef struct WGPURenderPassTimestampWrite { +typedef struct WGPURenderPassTimestampWrites { WGPUQuerySet querySet; - uint32_t queryIndex; - WGPURenderPassTimestampLocation location; -} WGPURenderPassTimestampWrite WGPU_STRUCTURE_ATTRIBUTE; + uint32_t beginningOfPassWriteIndex; + uint32_t endOfPassWriteIndex; +} WGPURenderPassTimestampWrites WGPU_STRUCTURE_ATTRIBUTE; + +typedef struct WGPURequestAdapterCallbackInfo { + WGPUChainedStruct const * nextInChain; + WGPUCallbackMode mode; + WGPURequestAdapterCallback callback; + void * userdata; +} WGPURequestAdapterCallbackInfo WGPU_STRUCTURE_ATTRIBUTE; typedef struct WGPURequestAdapterOptions { 
WGPUChainedStruct const * nextInChain; @@ -1522,7 +1588,7 @@ typedef struct WGPUSurfaceDescriptorFromWindowsSwapChainPanel { typedef struct WGPUSurfaceDescriptorFromXlibWindow { WGPUChainedStruct chain; void * display; - uint32_t window; + uint64_t window; } WGPUSurfaceDescriptorFromXlibWindow WGPU_STRUCTURE_ATTRIBUTE; typedef struct WGPUSwapChainDescriptor { @@ -1542,6 +1608,12 @@ typedef struct WGPUTextureBindingLayout { WGPUBool multisampled; } WGPUTextureBindingLayout WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUTextureDescriptor +typedef struct WGPUTextureBindingViewDimensionDescriptor { + WGPUChainedStruct chain; + WGPUTextureViewDimension textureBindingViewDimension; +} WGPUTextureBindingViewDimensionDescriptor WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPUTextureDataLayout { WGPUChainedStruct const * nextInChain; uint64_t offset; @@ -1599,8 +1671,7 @@ typedef struct WGPUCompilationInfo { typedef struct WGPUComputePassDescriptor { WGPUChainedStruct const * nextInChain; WGPU_NULLABLE char const * label; - size_t timestampWriteCount; - WGPUComputePassTimestampWrite const * timestampWrites; + WGPU_NULLABLE WGPUComputePassTimestampWrites const * timestampWrites; } WGPUComputePassDescriptor WGPU_STRUCTURE_ATTRIBUTE; typedef struct WGPUDepthStencilState { @@ -1633,6 +1704,11 @@ typedef struct WGPUExternalTextureDescriptor { WGPUExternalTextureRotation rotation; } WGPUExternalTextureDescriptor WGPU_STRUCTURE_ATTRIBUTE; +typedef struct WGPUFutureWaitInfo { + WGPUFuture future; + WGPUBool completed; +} WGPUFutureWaitInfo WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPUImageCopyBuffer { WGPUChainedStruct const * nextInChain; WGPUTextureDataLayout layout; @@ -1654,10 +1730,15 @@ typedef struct WGPUImageCopyTexture { WGPUTextureAspect aspect; } WGPUImageCopyTexture WGPU_STRUCTURE_ATTRIBUTE; +typedef struct WGPUInstanceDescriptor { + WGPUChainedStruct const * nextInChain; + WGPUInstanceFeatures features; +} WGPUInstanceDescriptor WGPU_STRUCTURE_ATTRIBUTE; + // Can 
be chained in WGPUPipelineLayoutDescriptor typedef struct WGPUPipelineLayoutPixelLocalStorage { WGPUChainedStruct chain; - size_t totalPixelLocalStorageSize; + uint64_t totalPixelLocalStorageSize; size_t storageAttachmentCount; WGPUPipelineLayoutStorageAttachment const * storageAttachments; } WGPUPipelineLayoutPixelLocalStorage WGPU_STRUCTURE_ATTRIBUTE; @@ -1665,7 +1746,7 @@ typedef struct WGPUPipelineLayoutPixelLocalStorage { typedef struct WGPUProgrammableStageDescriptor { WGPUChainedStruct const * nextInChain; WGPUShaderModule module; - char const * entryPoint; + WGPU_NULLABLE char const * entryPoint; size_t constantCount; WGPUConstantEntry const * constants; } WGPUProgrammableStageDescriptor WGPU_STRUCTURE_ATTRIBUTE; @@ -1773,14 +1854,13 @@ typedef struct WGPURenderPassDescriptor { WGPURenderPassColorAttachment const * colorAttachments; WGPU_NULLABLE WGPURenderPassDepthStencilAttachment const * depthStencilAttachment; WGPU_NULLABLE WGPUQuerySet occlusionQuerySet; - size_t timestampWriteCount; - WGPURenderPassTimestampWrite const * timestampWrites; + WGPU_NULLABLE WGPURenderPassTimestampWrites const * timestampWrites; } WGPURenderPassDescriptor WGPU_STRUCTURE_ATTRIBUTE; // Can be chained in WGPURenderPassDescriptor typedef struct WGPURenderPassPixelLocalStorage { WGPUChainedStruct chain; - size_t totalPixelLocalStorageSize; + uint64_t totalPixelLocalStorageSize; size_t storageAttachmentCount; WGPURenderPassStorageAttachment const * storageAttachments; } WGPURenderPassPixelLocalStorage WGPU_STRUCTURE_ATTRIBUTE; @@ -1788,7 +1868,7 @@ typedef struct WGPURenderPassPixelLocalStorage { typedef struct WGPUVertexState { WGPUChainedStruct const * nextInChain; WGPUShaderModule module; - char const * entryPoint; + WGPU_NULLABLE char const * entryPoint; size_t constantCount; WGPUConstantEntry const * constants; size_t bufferCount; @@ -1798,7 +1878,7 @@ typedef struct WGPUVertexState { typedef struct WGPUFragmentState { WGPUChainedStruct const * nextInChain; WGPUShaderModule 
module; - char const * entryPoint; + WGPU_NULLABLE char const * entryPoint; size_t constantCount; WGPUConstantEntry const * constants; size_t targetCount; @@ -1824,6 +1904,7 @@ extern "C" { typedef void (*WGPUProcAdapterPropertiesFreeMembers)(WGPUAdapterProperties value) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUInstance (*WGPUProcCreateInstance)(WGPUInstanceDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUBool (*WGPUProcGetInstanceFeatures)(WGPUInstanceFeatures * features) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUProc (*WGPUProcGetProcAddress)(WGPUDevice device, char const * procName) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSharedTextureMemoryEndAccessStateFreeMembers)(WGPUSharedTextureMemoryEndAccessState value) WGPU_FUNCTION_ATTRIBUTE; @@ -1856,6 +1937,7 @@ typedef void * (*WGPUProcBufferGetMappedRange)(WGPUBuffer buffer, size_t offset, typedef uint64_t (*WGPUProcBufferGetSize)(WGPUBuffer buffer) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUBufferUsageFlags (*WGPUProcBufferGetUsage)(WGPUBuffer buffer) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcBufferMapAsync)(WGPUBuffer buffer, WGPUMapModeFlags mode, size_t offset, size_t size, WGPUBufferMapCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUFuture (*WGPUProcBufferMapAsyncF)(WGPUBuffer buffer, WGPUMapModeFlags mode, size_t offset, size_t size, WGPUBufferMapCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcBufferSetLabel)(WGPUBuffer buffer, char const * label) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcBufferUnmap)(WGPUBuffer buffer) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcBufferReference)(WGPUBuffer buffer) WGPU_FUNCTION_ATTRIBUTE; @@ -1961,6 +2043,8 @@ typedef void (*WGPUProcExternalTextureRelease)(WGPUExternalTexture externalTextu typedef WGPUSurface (*WGPUProcInstanceCreateSurface)(WGPUInstance instance, WGPUSurfaceDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcInstanceProcessEvents)(WGPUInstance 
instance) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcInstanceRequestAdapter)(WGPUInstance instance, WGPU_NULLABLE WGPURequestAdapterOptions const * options, WGPURequestAdapterCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUFuture (*WGPUProcInstanceRequestAdapterF)(WGPUInstance instance, WGPU_NULLABLE WGPURequestAdapterOptions const * options, WGPURequestAdapterCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUWaitStatus (*WGPUProcInstanceWaitAny)(WGPUInstance instance, size_t futureCount, WGPUFutureWaitInfo * futures, uint64_t timeoutNS) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcInstanceReference)(WGPUInstance instance) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcInstanceRelease)(WGPUInstance instance) WGPU_FUNCTION_ATTRIBUTE; @@ -1980,7 +2064,8 @@ typedef void (*WGPUProcQuerySetRelease)(WGPUQuerySet querySet) WGPU_FUNCTION_ATT // Procs of Queue typedef void (*WGPUProcQueueCopyExternalTextureForBrowser)(WGPUQueue queue, WGPUImageCopyExternalTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize, WGPUCopyTextureForBrowserOptions const * options) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcQueueCopyTextureForBrowser)(WGPUQueue queue, WGPUImageCopyTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize, WGPUCopyTextureForBrowserOptions const * options) WGPU_FUNCTION_ATTRIBUTE; -typedef void (*WGPUProcQueueOnSubmittedWorkDone)(WGPUQueue queue, uint64_t signalValue, WGPUQueueWorkDoneCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef void (*WGPUProcQueueOnSubmittedWorkDone)(WGPUQueue queue, WGPUQueueWorkDoneCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUFuture (*WGPUProcQueueOnSubmittedWorkDoneF)(WGPUQueue queue, WGPUQueueWorkDoneCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcQueueSetLabel)(WGPUQueue queue, char const * label) WGPU_FUNCTION_ATTRIBUTE; typedef 
void (*WGPUProcQueueSubmit)(WGPUQueue queue, size_t commandCount, WGPUCommandBuffer const * commands) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcQueueWriteBuffer)(WGPUQueue queue, WGPUBuffer buffer, uint64_t bufferOffset, void const * data, size_t size) WGPU_FUNCTION_ATTRIBUTE; @@ -2059,9 +2144,9 @@ typedef void (*WGPUProcSharedFenceReference)(WGPUSharedFence sharedFence) WGPU_F typedef void (*WGPUProcSharedFenceRelease)(WGPUSharedFence sharedFence) WGPU_FUNCTION_ATTRIBUTE; // Procs of SharedTextureMemory -typedef void (*WGPUProcSharedTextureMemoryBeginAccess)(WGPUSharedTextureMemory sharedTextureMemory, WGPUTexture texture, WGPUSharedTextureMemoryBeginAccessDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUBool (*WGPUProcSharedTextureMemoryBeginAccess)(WGPUSharedTextureMemory sharedTextureMemory, WGPUTexture texture, WGPUSharedTextureMemoryBeginAccessDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUTexture (*WGPUProcSharedTextureMemoryCreateTexture)(WGPUSharedTextureMemory sharedTextureMemory, WGPU_NULLABLE WGPUTextureDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; -typedef void (*WGPUProcSharedTextureMemoryEndAccess)(WGPUSharedTextureMemory sharedTextureMemory, WGPUTexture texture, WGPUSharedTextureMemoryEndAccessState * descriptor) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUBool (*WGPUProcSharedTextureMemoryEndAccess)(WGPUSharedTextureMemory sharedTextureMemory, WGPUTexture texture, WGPUSharedTextureMemoryEndAccessState * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSharedTextureMemoryGetProperties)(WGPUSharedTextureMemory sharedTextureMemory, WGPUSharedTextureMemoryProperties * properties) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSharedTextureMemorySetLabel)(WGPUSharedTextureMemory sharedTextureMemory, char const * label) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSharedTextureMemoryReference)(WGPUSharedTextureMemory sharedTextureMemory) WGPU_FUNCTION_ATTRIBUTE; @@ -2104,8 +2189,9 @@ 
typedef void (*WGPUProcTextureViewRelease)(WGPUTextureView textureView) WGPU_FUN #if !defined(WGPU_SKIP_DECLARATIONS) WGPU_EXPORT void wgpuAdapterPropertiesFreeMembers(WGPUAdapterProperties value) WGPU_FUNCTION_ATTRIBUTE; -WGPU_EXPORT WGPUInstance wgpuCreateInstance(WGPUInstanceDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; -WGPU_EXPORT WGPUProc wgpuGetProcAddress(WGPUDevice device, char const * procName) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUInstance wgpuCreateInstance(WGPU_NULLABLE WGPUInstanceDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUBool wgpuGetInstanceFeatures(WGPUInstanceFeatures * features) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUProc wgpuGetProcAddress(WGPU_NULLABLE WGPUDevice device, char const * procName) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSharedTextureMemoryEndAccessStateFreeMembers(WGPUSharedTextureMemoryEndAccessState value) WGPU_FUNCTION_ATTRIBUTE; // Methods of Adapter @@ -2137,6 +2223,7 @@ WGPU_EXPORT void * wgpuBufferGetMappedRange(WGPUBuffer buffer, size_t offset, si WGPU_EXPORT uint64_t wgpuBufferGetSize(WGPUBuffer buffer) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUBufferUsageFlags wgpuBufferGetUsage(WGPUBuffer buffer) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuBufferMapAsync(WGPUBuffer buffer, WGPUMapModeFlags mode, size_t offset, size_t size, WGPUBufferMapCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUFuture wgpuBufferMapAsyncF(WGPUBuffer buffer, WGPUMapModeFlags mode, size_t offset, size_t size, WGPUBufferMapCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuBufferSetLabel(WGPUBuffer buffer, char const * label) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuBufferUnmap(WGPUBuffer buffer) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuBufferReference(WGPUBuffer buffer) WGPU_FUNCTION_ATTRIBUTE; @@ -2242,6 +2329,8 @@ WGPU_EXPORT void wgpuExternalTextureRelease(WGPUExternalTexture externalTexture) WGPU_EXPORT WGPUSurface 
wgpuInstanceCreateSurface(WGPUInstance instance, WGPUSurfaceDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuInstanceProcessEvents(WGPUInstance instance) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuInstanceRequestAdapter(WGPUInstance instance, WGPU_NULLABLE WGPURequestAdapterOptions const * options, WGPURequestAdapterCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUFuture wgpuInstanceRequestAdapterF(WGPUInstance instance, WGPU_NULLABLE WGPURequestAdapterOptions const * options, WGPURequestAdapterCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUWaitStatus wgpuInstanceWaitAny(WGPUInstance instance, size_t futureCount, WGPUFutureWaitInfo * futures, uint64_t timeoutNS) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuInstanceReference(WGPUInstance instance) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuInstanceRelease(WGPUInstance instance) WGPU_FUNCTION_ATTRIBUTE; @@ -2261,7 +2350,8 @@ WGPU_EXPORT void wgpuQuerySetRelease(WGPUQuerySet querySet) WGPU_FUNCTION_ATTRIB // Methods of Queue WGPU_EXPORT void wgpuQueueCopyExternalTextureForBrowser(WGPUQueue queue, WGPUImageCopyExternalTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize, WGPUCopyTextureForBrowserOptions const * options) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuQueueCopyTextureForBrowser(WGPUQueue queue, WGPUImageCopyTexture const * source, WGPUImageCopyTexture const * destination, WGPUExtent3D const * copySize, WGPUCopyTextureForBrowserOptions const * options) WGPU_FUNCTION_ATTRIBUTE; -WGPU_EXPORT void wgpuQueueOnSubmittedWorkDone(WGPUQueue queue, uint64_t signalValue, WGPUQueueWorkDoneCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT void wgpuQueueOnSubmittedWorkDone(WGPUQueue queue, WGPUQueueWorkDoneCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUFuture wgpuQueueOnSubmittedWorkDoneF(WGPUQueue queue, WGPUQueueWorkDoneCallbackInfo 
callbackInfo) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuQueueSetLabel(WGPUQueue queue, char const * label) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuQueueSubmit(WGPUQueue queue, size_t commandCount, WGPUCommandBuffer const * commands) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuQueueWriteBuffer(WGPUQueue queue, WGPUBuffer buffer, uint64_t bufferOffset, void const * data, size_t size) WGPU_FUNCTION_ATTRIBUTE; @@ -2340,9 +2430,9 @@ WGPU_EXPORT void wgpuSharedFenceReference(WGPUSharedFence sharedFence) WGPU_FUNC WGPU_EXPORT void wgpuSharedFenceRelease(WGPUSharedFence sharedFence) WGPU_FUNCTION_ATTRIBUTE; // Methods of SharedTextureMemory -WGPU_EXPORT void wgpuSharedTextureMemoryBeginAccess(WGPUSharedTextureMemory sharedTextureMemory, WGPUTexture texture, WGPUSharedTextureMemoryBeginAccessDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUBool wgpuSharedTextureMemoryBeginAccess(WGPUSharedTextureMemory sharedTextureMemory, WGPUTexture texture, WGPUSharedTextureMemoryBeginAccessDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUTexture wgpuSharedTextureMemoryCreateTexture(WGPUSharedTextureMemory sharedTextureMemory, WGPU_NULLABLE WGPUTextureDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; -WGPU_EXPORT void wgpuSharedTextureMemoryEndAccess(WGPUSharedTextureMemory sharedTextureMemory, WGPUTexture texture, WGPUSharedTextureMemoryEndAccessState * descriptor) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUBool wgpuSharedTextureMemoryEndAccess(WGPUSharedTextureMemory sharedTextureMemory, WGPUTexture texture, WGPUSharedTextureMemoryEndAccessState * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSharedTextureMemoryGetProperties(WGPUSharedTextureMemory sharedTextureMemory, WGPUSharedTextureMemoryProperties * properties) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSharedTextureMemorySetLabel(WGPUSharedTextureMemory sharedTextureMemory, char const * label) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void 
wgpuSharedTextureMemoryReference(WGPUSharedTextureMemory sharedTextureMemory) WGPU_FUNCTION_ATTRIBUTE; diff --git a/src/runtime/webgpu.cpp b/src/runtime/webgpu.cpp index 82bef2be3843..b889ed5e7385 100644 --- a/src/runtime/webgpu.cpp +++ b/src/runtime/webgpu.cpp @@ -472,7 +472,7 @@ WEAK int halide_webgpu_device_sync(void *user_context, halide_buffer_t *) { __atomic_test_and_set(&result.complete, __ATOMIC_RELAXED); wgpuQueueOnSubmittedWorkDone( - context.queue, 0, + context.queue, [](WGPUQueueWorkDoneStatus status, void *userdata) { WorkDoneResult *result = (WorkDoneResult *)userdata; result->status = status; From 2b23e07f3d5287a7b2a88ad8c2417d72c2b4dd45 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Tue, 28 Nov 2023 08:05:52 -0800 Subject: [PATCH 14/15] Return values from stub functions in Deserialization (#7963) Needed to prevent "error: non-void function does not return a value" --- src/Deserialization.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index 9923e9d1c89c..cb492709904b 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -1574,14 +1574,17 @@ Pipeline deserialize_pipeline(const std::vector &buffer, const std::map std::map deserialize_parameters(const std::string &filename) { user_error << "Deserialization is not supported in this build of Halide; try rebuilding with WITH_SERIALIZATION=ON."; + return {}; } std::map deserialize_parameters(std::istream &in) { user_error << "Deserialization is not supported in this build of Halide; try rebuilding with WITH_SERIALIZATION=ON."; + return {}; } std::map deserialize_parameters(const std::vector &buffer) { user_error << "Deserialization is not supported in this build of Halide; try rebuilding with WITH_SERIALIZATION=ON."; + return {}; } } // namespace Halide From 5175d169c6127f1aa8cc7f7589b171af47222842 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 28 Nov 2023 13:59:21 -0800 Subject: [PATCH 15/15] Make the fast inverse test 
throughput-limited rather than latency-limited (#7958) Co-authored-by: Steven Johnson --- test/performance/fast_inverse.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/performance/fast_inverse.cpp b/test/performance/fast_inverse.cpp index 6ddfa4f620d7..cfd9b7c796c2 100644 --- a/test/performance/fast_inverse.cpp +++ b/test/performance/fast_inverse.cpp @@ -14,6 +14,8 @@ int main(int argc, char **argv) { if (target.arch == Target::ARM && target.os == Target::OSX) { + // vrecpe, vrecps, fmul have inverse throughputs of 1, 0.25, 0.25 + // respectively, while fdiv has inverse throughput of 1. printf("[SKIP] Apple M1 chips have division performance roughly on par with the reciprocal instruction\n"); return 0; } @@ -31,13 +33,16 @@ int main(int argc, char **argv) { slow(x) = p / (slow(x) + 1) + 0 * r; fast(x) = fast_inverse((fast(x) + 1) + 0 * r); - slow.update().vectorize(x, 4); - fast.update().vectorize(x, 4); + // Use wide vectors to ensure we're throughput-limited rather than latency-limited. + const int vec = 32; + + slow.update().vectorize(x, vec); + fast.update().vectorize(x, vec); slow.compile_jit(); fast.compile_jit(); - Buffer out_fast(8), out_slow(8); + Buffer out_fast(vec), out_slow(vec); double slow_time = benchmark([&]() { slow.realize(out_slow); }); double fast_time = benchmark([&]() { fast.realize(out_fast); });