Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vulkan: improve im2col and RDNA1 performance #11826

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1423,6 +1423,49 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
return supported;
}

// Per-GPU pipeline tuning entry: which devices it applies to and which
// required subgroup sizes to use when compiling specific pipelines on them.
struct GpuPipelineConfig {
    // List of all aliases for a given GPU.
    // For example, this can include names like "NAVI10", "RX 5700", etc.
    // An entry applies when any alias is a substring of the reported device name.
    std::vector<std::string> device_names;

    // Mapping of pipeline names to their specific subgroup sizes.
    // Example: {"soft_max_f32", 64}.
    std::unordered_map<std::string, uint32_t> pipelines;

    // Default subgroup size for this GPU, used for pipelines that have no
    // explicit entry above. 0 means "no override" (driver picks the size).
    uint32_t default_subgroup_size = 0;
};

// Tuning table, matched against the Vulkan device name at shader-load time.
// RDNA1 parts (Navi 10/14) prefer wave64 for these pipelines.
static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
    {
        {"NAVI10", "NAVI14", "RX 5700", "RX 5600", "RX 5500"},
        {
            {"soft_max_f32", 64}, {"soft_max_f32_wg512", 64},
            {"soft_max_f32_f16", 64}, {"soft_max_f32_f16_wg512", 64},
            {"im2col_f32", 64}, {"im2col_f32_f16", 64},
        },
        32
    },
};

// Look up the required subgroup size for a pipeline on a given device.
// Returns the pipeline-specific override if one is set (non-zero), otherwise
// the matching device's default, and 0 when no table entry covers the device.
static uint32_t get_subgroup_size(const std::string &pipeline_name, const std::string &device_name) {
    for (const auto &cfg : gpu_pipeline_configs) {
        // Substring match: driver-reported names embed codenames/marketing names.
        bool device_matches = false;
        for (const auto &alias : cfg.device_names) {
            if (device_name.find(alias) != std::string::npos) {
                device_matches = true;
                break;
            }
        }
        if (!device_matches) {
            continue;
        }
        const auto it = cfg.pipelines.find(pipeline_name);
        if (it != cfg.pipelines.end() && it->second != 0) {
            return it->second;
        }
        return cfg.default_subgroup_size;
    }
    // No configuration covers this device.
    return 0;
}

static void ggml_vk_load_shaders(vk_device& device) {
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");

Expand Down Expand Up @@ -1543,11 +1586,17 @@ static void ggml_vk_load_shaders(vk_device& device) {
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
}

vk::PhysicalDeviceProperties2 props2;
device->physical_device.getProperties2(&props2);
std::string device_name = props2.properties.deviceName.data();

std::vector<std::future<void>> compiles;
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {

required_subgroup_size = get_subgroup_size(name, device_name);

if (!pipeline) {
pipeline = std::make_shared<vk_pipeline_struct>();
pipeline->name = name;
Expand Down
49 changes: 31 additions & 18 deletions ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,20 @@ void main() {
const uint batch = gl_GlobalInvocationID.z / p.IC;
const uint ic = gl_GlobalInvocationID.z % p.IC;

const uint src_base = ic * p.offset_delta + batch * p.batch_offset;
const uint dst_base = ((batch * p.OH + oh) * p.OW) * p.CHW + ic * (p.KW * p.KH);
const int oh_s1 = int(oh) * p.s1;
const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);

const uint base_linear_idx = gidx * NUM_ITER;

const uint max_ky = ksize / p.OW;

uint current_kx = base_linear_idx / ksize;
const uint rem = base_linear_idx - (current_kx * ksize);
uint current_ky = rem / p.OW;
uint current_ix = rem % p.OW;

A_TYPE values[NUM_ITER];
uint offset_dst[NUM_ITER];
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
Expand All @@ -48,36 +62,35 @@ void main() {

[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {

const uint i = gidx * NUM_ITER + idx;
const uint linear_idx = base_linear_idx + idx;

const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
const uint kx = i / ksize;
const uint kd = kx * ksize;
const uint ky = (i - kd) / p.OW;
const uint ix = i % p.OW;
if (linear_idx >= p.pelements) {
continue;
}

const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
const uint iiw = current_ix * p.s0 + current_kx * p.d0 - p.p0;
const uint iih = oh_s1 + current_ky * p.d1 - p.p1;

offset_dst[idx] =
((batch * p.OH + oh) * p.OW + ix) * p.CHW +
(ic * (p.KW * p.KH) + ky * p.KW + kx);
offset_dst[idx] = dst_base + current_ix * p.CHW + current_ky * p.KW + current_kx;

if (i >= p.pelements) {
continue;
if ((iih < p.IH) && (iiw < p.IW)) {
values[idx] = data_a[src_base + iih * p.IW + iiw];
}

if (iih < p.IH && iiw < p.IW) {
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
values[idx] = data_a[offset_src + iih * p.IW + iiw];
if (++current_ix == p.OW) {
current_ix = 0;
if (++current_ky == max_ky) {
current_ky = 0;
current_kx++;
}
}
}

[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {

const uint i = gidx * NUM_ITER + idx;
const uint linear_idx = base_linear_idx + idx;

if (i >= p.pelements) {
if (linear_idx >= p.pelements) {
continue;
}

Expand Down
Loading