[libclc] Refine generic __clc_get_sub_group_size with fast full sub-group path (#188895)

Add a fast path for the common case where the total work-group size is a
multiple of the max sub-group size.

The fallback path is ported from amdgpu/workitem/clc_get_sub_group_size.cl.

The compiler can generate predicated instructions for the fallback path to
avoid branches.
This commit is contained in:
Wenju He
2026-04-13 08:16:06 +08:00
committed by GitHub
parent 00328f10ac
commit 7b94b9ae13

View File

@@ -6,21 +6,21 @@
//
//===----------------------------------------------------------------------===//
#include "clc/shared/clc_min.h"
#include "clc/workitem/clc_get_local_linear_id.h"
#include "clc/workitem/clc_get_local_size.h"
#include "clc/workitem/clc_get_max_sub_group_size.h"
#include "clc/workitem/clc_get_num_sub_groups.h"
#include "clc/workitem/clc_get_sub_group_id.h"
#include "clc/workitem/clc_get_sub_group_size.h"
// Returns the number of work-items in the calling work-item's sub-group.
//
// Every sub-group holds __clc_get_max_sub_group_size() work-items, except
// possibly the last one, which holds whatever is left over when the flattened
// work-group size is not a multiple of the maximum sub-group size.
//
// NOTE(review): this relies on two properties that are not checked here:
//   * the maximum sub-group size is a power of two (needed by the bit masks);
//   * sub-groups are formed from consecutive local linear IDs in chunks of
//     the maximum sub-group size.
// Confirm both for every target that uses this generic implementation.
_CLC_OVERLOAD _CLC_DEF uint __clc_get_sub_group_size() {
  // Flattened work-group size, computed in 32-bit arithmetic.
  uint local_linear_size = (uint)__clc_get_local_size(0) *
                           (uint)__clc_get_local_size(1) *
                           (uint)__clc_get_local_size(2);
  uint max_sg_size = __clc_get_max_sub_group_size();

  // Assume max_sg_size is power of 2, so masking with (max_sg_size - 1) is
  // the same as taking the value modulo max_sg_size.
  uint remainder = local_linear_size & (max_sg_size - 1);

  // Fast path: the work-group divides evenly, so every sub-group is full.
  if (remainder == 0)
    return max_sg_size;

  // Fallback: round this work-item's linear ID down to the first ID of its
  // sub-group. The work-items from there to the end of the work-group, capped
  // at max_sg_size, are the members of this sub-group.
  uint lid = (uint)__clc_get_local_linear_id();
  uint sg_first_lid = lid & ~(max_sg_size - 1);
  return __clc_min(max_sg_size, local_linear_size - sg_first_lid);
}