// Patch summary and review notes. This text sits ahead of the first "diff --git"
// header, where patch tools skip leading free-form text; the patch itself is unchanged.
//
// What the patch does:
//   * apps/bilateral_grid, apps/local_laplacian, apps/nl_means, apps/stencil_chain
//     (CMakeLists.txt): on each <app>_auto_schedule library, the line
//     "AUTOSCHEDULER Halide::Mullapudi2016)" is replaced by the same argument followed by
//     "PARAMS autoscheduler.last_level_cache_size=<N>" and a closing parenthesis on its
//     own line. N is 20000, 30000, 20000 and 15000 respectively.
//   * src/autoschedulers/mullapudi2016/AutoSchedule.cpp: in the ArchParams constructor the
//     has_gpu_feature value of last_level_cache_size changes from 48 * 1024 to 35 * 1024.
//     The non-GPU value (16 * 1024 * 1024), parallelism and balance are untouched.
//
// NOTE(review): the unchanged context comment directly above the constructor still ends
//   with "CACHE_SIZE to 48 KB." while the code now uses 35 * 1024. The comment looks stale;
//   the rest of it is outside this hunk, so confirm its wording and update it in this change.
// NOTE(review): the added CMake comment says the limit is for target=host-cuda or
//   host-metal, but no target condition is visible in these hunks. Presumably the PARAMS
//   override also applies to CPU-only builds, replacing the 16 * 1024 * 1024 default;
//   confirm that is intended.
// NOTE(review): the per-app values (20000 / 30000 / 20000 / 15000) have no stated
//   derivation or unit in the patch. Presumably bytes, matching the 48 * 1024 "48 KB"
//   convention; confirm and say so in the CMake comment.
// NOTE(review): in this copy the diff's line breaks appear collapsed (file headers, hunk
//   headers and +/- lines share physical lines). Regenerate the patch from the commit
//   before applying it.
diff --git a/apps/bilateral_grid/CMakeLists.txt b/apps/bilateral_grid/CMakeLists.txt index fd221fd74231..769c54148017 100644 --- a/apps/bilateral_grid/CMakeLists.txt +++ b/apps/bilateral_grid/CMakeLists.txt @@ -25,7 +25,11 @@ add_halide_library(bilateral_grid_auto_schedule FROM bilateral_grid.generator GENERATOR bilateral_grid STMT bilateral_grid_auto_schedule_STMT SCHEDULE bilateral_grid_auto_schedule_SCHEDULE - AUTOSCHEDULER Halide::Mullapudi2016) + AUTOSCHEDULER Halide::Mullapudi2016 + # When target=host-cuda or host-metal, limit the GPU shared + # memory per block to avoid gpu kernel launch failure. + PARAMS autoscheduler.last_level_cache_size=20000 + ) # Main executable add_executable(bilateral_grid_process filter.cpp) diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt index 3c52c1c2a41d..2841be871816 100644 --- a/apps/local_laplacian/CMakeLists.txt +++ b/apps/local_laplacian/CMakeLists.txt @@ -20,7 +20,11 @@ add_halide_generator(local_laplacian.generator add_halide_library(local_laplacian FROM local_laplacian.generator) add_halide_library(local_laplacian_auto_schedule FROM local_laplacian.generator GENERATOR local_laplacian - AUTOSCHEDULER Halide::Mullapudi2016) + AUTOSCHEDULER Halide::Mullapudi2016 + # When target=host-cuda or host-metal, limit the GPU shared + # memory per block to avoid gpu kernel launch failure. 
+ PARAMS autoscheduler.last_level_cache_size=30000 + ) # Main executable add_executable(local_laplacian_process process.cpp) diff --git a/apps/nl_means/CMakeLists.txt b/apps/nl_means/CMakeLists.txt index 6653cfe7b8b3..16f2b587304c 100644 --- a/apps/nl_means/CMakeLists.txt +++ b/apps/nl_means/CMakeLists.txt @@ -18,7 +18,11 @@ add_halide_generator(nl_means.generator SOURCES nl_means_generator.cpp) add_halide_library(nl_means FROM nl_means.generator) add_halide_library(nl_means_auto_schedule FROM nl_means.generator GENERATOR nl_means - AUTOSCHEDULER Halide::Mullapudi2016) + AUTOSCHEDULER Halide::Mullapudi2016 + # When target=host-cuda or host-metal, limit the GPU shared + # memory per block to avoid gpu kernel launch failure. + PARAMS autoscheduler.last_level_cache_size=20000 + ) # Main executable add_executable(nl_means_process process.cpp) diff --git a/apps/stencil_chain/CMakeLists.txt b/apps/stencil_chain/CMakeLists.txt index 2a64a719209f..c00e12f9d60b 100644 --- a/apps/stencil_chain/CMakeLists.txt +++ b/apps/stencil_chain/CMakeLists.txt @@ -18,7 +18,11 @@ add_halide_generator(stencil_chain.generator SOURCES stencil_chain_generator.cpp add_halide_library(stencil_chain FROM stencil_chain.generator) add_halide_library(stencil_chain_auto_schedule FROM stencil_chain.generator GENERATOR stencil_chain - AUTOSCHEDULER Halide::Mullapudi2016) + AUTOSCHEDULER Halide::Mullapudi2016 + # When target=host-cuda or host-metal, limit the GPU shared + # memory per block to avoid gpu kernel launch failure. + PARAMS autoscheduler.last_level_cache_size=15000 + ) # Main executable add_executable(stencil_chain_process process.cpp) diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp index db3231c328c1..db118fd4ec5c 100644 --- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp +++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp @@ -42,7 +42,7 @@ struct ArchParams { * CACHE_SIZE to 48 KB. 
*/ constexpr ArchParams(bool has_gpu_feature) - : parallelism(has_gpu_feature ? 128 : 16), last_level_cache_size(has_gpu_feature ? 48 * 1024 : 16 * 1024 * 1024), + : parallelism(has_gpu_feature ? 128 : 16), last_level_cache_size(has_gpu_feature ? 35 * 1024 : 16 * 1024 * 1024), balance(has_gpu_feature ? 20 : 40) { } };