Files
llvm-project/clang/test/CodeGenHLSL/BasicFeatures/VectorElementwiseCast.hlsl
apple-fcloutier d28a8b0b3f [CodeGen] Change -O0 bool load codegen to have nonzero model (#193783)
The main follow-up item to
https://github.com/llvm/llvm-project/pull/160790 was changing -O0
codegen to convert in-memory i8 bool values to i1 with the `nonzero`
rule (`icmp ne i8 %val, 0`) rather than the `truncate` rule (`trunc i8
%val to i1`).

Bool values can only be `true` or `false`. While they are notionally a
single bit, the smallest addressable unit is CHAR_BIT bits large, and
CHAR_BIT is typically 8. Programming errors (such as memcpying a random
byte to a `bool`) can cause the 8-bit storage for a `bool` value to have
a bit pattern that is different from `true` or `false`, which then leads
to undefined behavior.

Clang has historically taken advantage of this in optimized builds
(everything other than -O0) by attaching range metadata to `bool` loads
to assume that the value loaded can only be 0 or 1. This leads to
exploitable security issues, and the correct behavior is not always easy
to explain to C developers. To remedy this situation, Clang accepted a
[-fstrict-bool](https://discourse.llvm.org/t/defining-what-happens-when-a-bool-isn-t-0-or-1/86778)
switch to control whether it can assume that loaded bool values are
always necessarily 0 or 1. By default, it does (maintaining the status
quo), and users must specify `-fno-strict-bool` to opt out of that
behavior.

When opting out, users can optionally request that bool i8 values are
converted to i1 either by truncation or by comparing to 0. The default
is comparing to 0. However, since `-O0` alone _technically_ uses
-fstrict-bool, unoptimized builds convert i8 bool values to i1 with a
`trunc` operation, whereas `-O1 -fno-strict-bool` converts i8 bool
values to i1 with `icmp ne 0`. This is a surprising inconsistency.

This PR changes -O0 codegen to align with -fno-strict-bool. This is
achieved with a single-line change:

```
   bool isConvertingBoolWithCmp0() const {
     switch (getLoadBoolFromMem()) {
     case BoolFromMem::Strict:
+      return !isOptimizedBuild();
     case BoolFromMem::Truncate:
```

However, it impacts a _very large_ number of tests, so we agreed to move
it out of the -fstrict-bool PR to reduce the chances we would have to
back out the whole thing for this secondary item.

This PR does the change and modifies the tests accordingly. I expect
that it will go stale rather quickly. If this needs more discussion,
I'll only update it once we reach consensus.
2026-04-27 17:07:20 -04:00

240 lines
13 KiB
HLSL

// Two RUN configurations exercise both matrix memory layouts. Checks shared by
// both layouts use the CHECK prefix; layout-specific checks use the COL-CHECK
// and ROW-CHECK prefixes respectively.
// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COL-CHECK
// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROW-CHECK
// vector flat cast from array
// Verifies that casting a [2][1] int array to int2 copies the source into a
// temporary, then builds the result vector element-by-element with
// GEP + load + insertelement. The initial load of [[Tmp2]] reads the
// still-uninitialized flat-cast temporary; both lanes are overwritten before
// the final store.
// CHECK-LABEL: define void {{.*}}call2
// CHECK: [[A:%.*]] = alloca [2 x [1 x i32]], align 4
// CHECK-NEXT: [[B:%.*]] = alloca <2 x i32>, align 4
// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x [1 x i32]], align 4
// CHECK-NEXT: [[Tmp2:%.*]] = alloca <2 x i32>, align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[Tmp]], i32 0, i32 0, i32 0
// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr [[Tmp]], i32 0, i32 1, i32 0
// CHECK-NEXT: [[C:%.*]] = load <2 x i32>, ptr [[Tmp2]], align 4
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G1]], align 4
// CHECK-NEXT: [[D:%.*]] = insertelement <2 x i32> [[C]], i32 [[L]], i64 0
// CHECK-NEXT: [[L2:%.*]] = load i32, ptr [[G2]], align 4
// CHECK-NEXT: [[E:%.*]] = insertelement <2 x i32> [[D]], i32 [[L2]], i64 1
// CHECK-NEXT: store <2 x i32> [[E]], ptr [[B]], align 4
export void call2() {
int A[2][1] = {{1},{2}};
int2 B = (int2)A;
}
// Mixed int/float struct used by call3 (struct -> vector flat cast) and
// call5 (struct -> scalar truncation). The checks below depend on this exact
// field order, so do not reorder the members.
struct S {
int X;
float Y;
};
// vector flat cast from struct
// Same element-by-element pattern as call2, but sourced from struct fields:
// each field is addressed with a struct GEP, and the float field Y is
// converted to i32 with fptosi before insertion into the result vector.
// CHECK-LABEL: define void {{.*}}call3
// CHECK: [[s:%.*]] = alloca %struct.S, align 1
// CHECK-NEXT: [[A:%.*]] = alloca <2 x i32>, align 4
// CHECK-NEXT: [[Tmp:%.*]] = alloca %struct.S, align 1
// CHECK-NEXT: [[Tmp2:%.*]] = alloca <2 x i32>, align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[s]], ptr align 1 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[Tmp]], ptr align 1 [[s]], i32 8, i1 false)
// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0, i32 0
// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0, i32 1
// CHECK-NEXT: [[B:%.*]] = load <2 x i32>, ptr [[Tmp2]], align 4
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G1]], align 4
// CHECK-NEXT: [[C:%.*]] = insertelement <2 x i32> [[B]], i32 [[L]], i64 0
// CHECK-NEXT: [[L2:%.*]] = load float, ptr [[G2]], align 4
// CHECK-NEXT: [[D:%.*]] = fptosi float [[L2]] to i32
// CHECK-NEXT: [[E:%.*]] = insertelement <2 x i32> [[C]], i32 [[D]], i64 1
// CHECK-NEXT: store <2 x i32> [[E]], ptr [[A]], align 4
export void call3() {
S s = {1, 2.0};
int2 A = (int2)s;
}
// truncate array to scalar
// Casting a 2-element array to a scalar keeps only the first element: GEPs
// for both elements are still emitted, but only [[G1]] is loaded and stored.
// CHECK-LABEL: define void {{.*}}call4
// CHECK: [[A:%.*]] = alloca [2 x i32], align 4
// CHECK-NEXT: [[B:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0, i32 0
// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [2 x i32], ptr [[Tmp]], i32 0, i32 1
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G1]], align 4
// CHECK-NEXT: store i32 [[L]], ptr [[B]], align 4
export void call4() {
int A[2] = {1,2};
int B = (int)A;
}
// truncate struct to scalar
// Same truncation pattern as call4, sourced from struct S: only the first
// field (X) is loaded and stored; the GEP for Y is emitted but unused.
// CHECK-LABEL: define void {{.*}}call5
// CHECK: [[s:%.*]] = alloca %struct.S, align 1
// CHECK-NEXT: [[A:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[Tmp:%.*]] = alloca %struct.S, align 1
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[s]], ptr align 1 {{.*}}, i32 8, i1 false)
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[Tmp]], ptr align 1 [[s]], i32 8, i1 false)
// CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0, i32 0
// CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds %struct.S, ptr [[Tmp]], i32 0, i32 1
// CHECK-NEXT: [[L:%.*]] = load i32, ptr [[G1]], align 4
// CHECK-NEXT: store i32 [[L]], ptr [[A]], align 4
export void call5() {
S s = {1, 2.0};
int A = (int)s;
}
// Struct with a named 15-bit bitfield and an unnamed 8-bit padding bitfield;
// call6's checks depend on this exact layout (the i24 bitfield storage unit
// and the shl/ashr extraction widths below follow from it).
struct BFields {
double D;
int E: 15;
int : 8;
float F;
};
// Derived struct used by call6 to cover base-class traversal in flat casts.
struct Derived : BFields {
int G;
};
// vector flat cast from derived struct with bitfield
// Covers the base-subobject GEP (index 0 into the Derived -> BFields base)
// and signed bitfield extraction: E lives in an i24 storage unit and is
// extracted with shl 9 / ashr 9 (24 - 15 = 9 bits), then sign-extended to
// i32. Floating-point fields are converted with fptosi.
// CHECK-LABEL: call6
// CHECK: [[A:%.*]] = alloca <4 x i32>, align 4
// CHECK-NEXT: [[Tmp:%.*]] = alloca %struct.Derived, align 1
// CHECK-NEXT: [[FlatTmp:%.*]] = alloca <4 x i32>, align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[Tmp]], ptr align 1 %D, i32 19, i1 false)
// CHECK-NEXT: [[Gep:%.*]] = getelementptr inbounds %struct.Derived, ptr [[Tmp]], i32 0, i32 0
// CHECK-NEXT: [[E:%.*]] = getelementptr inbounds nuw %struct.BFields, ptr [[Gep]], i32 0, i32 1
// CHECK-NEXT: [[Gep1:%.*]] = getelementptr inbounds %struct.Derived, ptr [[Tmp]], i32 0, i32 0, i32 0
// CHECK-NEXT: [[Gep2:%.*]] = getelementptr inbounds %struct.Derived, ptr [[Tmp]], i32 0, i32 0, i32 2
// CHECK-NEXT: [[Gep3:%.*]] = getelementptr inbounds %struct.Derived, ptr [[Tmp]], i32 0, i32 1
// CHECK-NEXT: [[Z:%.*]] = load <4 x i32>, ptr [[FlatTmp]], align 4
// CHECK-NEXT: [[Y:%.*]] = load double, ptr [[Gep1]], align 8
// CHECK-NEXT: [[C:%.*]] = fptosi double [[Y]] to i32
// CHECK-NEXT: [[X:%.*]] = insertelement <4 x i32> [[Z]], i32 [[C]], i64 0
// CHECK-NEXT: [[BFL:%.*]] = load i24, ptr [[E]], align 1
// CHECK-NEXT: [[BFShl:%.*]] = shl i24 [[BFL]], 9
// CHECK-NEXT: [[BFAshr:%.*]] = ashr i24 [[BFShl]], 9
// CHECK-NEXT: [[BFC:%.*]] = sext i24 [[BFAshr]] to i32
// CHECK-NEXT: [[W:%.*]] = insertelement <4 x i32> [[X]], i32 [[BFC]], i64 1
// CHECK-NEXT: [[V:%.*]] = load float, ptr [[Gep2]], align 4
// CHECK-NEXT: [[C4:%.*]] = fptosi float [[V]] to i32
// CHECK-NEXT: [[U:%.*]] = insertelement <4 x i32> [[W]], i32 [[C4]], i64 2
// CHECK-NEXT: [[T:%.*]] = load i32, ptr [[Gep3]], align 4
// CHECK-NEXT: [[S:%.*]] = insertelement <4 x i32> [[U]], i32 [[T]], i64 3
// CHECK-NEXT: store <4 x i32> [[S]], ptr [[A]], align 4
// CHECK-NEXT: ret void
export void call6(Derived D) {
int4 A = (int4)D;
}
// vector flat cast from matrix of same size (float)
// The only layout-dependent difference is element traversal order: the middle
// two extractelement indices are swapped (2,1 for column-major vs 1,2 for
// row-major), while the first and last lanes (0 and 3) are identical in both.
// CHECK-LABEL: call7
// CHECK: [[M_ADDR:%.*]] = alloca [2 x <2 x float>], align 4
// CHECK-NEXT: [[V:%.*]] = alloca <4 x float>, align 4
// CHECK-NEXT: [[HLSL_EWCAST_SRC:%.*]] = alloca [2 x <2 x float>], align 4
// CHECK-NEXT: [[FLATCAST_TMP:%.*]] = alloca <4 x float>, align 4
// CHECK-NEXT: store <4 x float> %M, ptr [[M_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[M_ADDR]], align 4
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[HLSL_EWCAST_SRC]], align 4
// CHECK-NEXT: [[MATRIX_GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[HLSL_EWCAST_SRC]], i32 0
// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[FLATCAST_TMP]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
// CHECK-NEXT: [[MATRIXEXT:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
// CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP1]], float [[MATRIXEXT]], i64 0
// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
// COL-CHECK-NEXT: [[MATRIXEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
// ROW-CHECK-NEXT: [[MATRIXEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
// CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP3]], float [[MATRIXEXT1]], i64 1
// CHECK-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
// COL-CHECK-NEXT: [[MATRIXEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 1
// ROW-CHECK-NEXT: [[MATRIXEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
// CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float [[MATRIXEXT2]], i64 2
// CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[MATRIX_GEP]], align 4
// CHECK-NEXT: [[MATRIXEXT3:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
// CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP7]], float [[MATRIXEXT3]], i64 3
// CHECK-NEXT: store <4 x float> [[TMP9]], ptr [[V]], align 4
// CHECK-NEXT: ret void
export void call7(float2x2 M) {
float4 V = (float4)M;
}
// vector flat cast from matrix of same size (int)
// For a single-row/single-column matrix (3x1), only the alloca array shapes
// differ between layouts ([1 x <3 x i32>] vs [3 x <1 x i32>]); the element
// extraction order is the same either way (indices 0, 1, 2).
// CHECK-LABEL: call8
// COL-CHECK: [[M_ADDR:%.*]] = alloca [1 x <3 x i32>], align 4
// ROW-CHECK: [[M_ADDR:%.*]] = alloca [3 x <1 x i32>], align 4
// CHECK-NEXT: [[V:%.*]] = alloca <3 x i32>, align 4
// COL-CHECK-NEXT: [[HLSL_EWCAST_SRC:%.*]] = alloca [1 x <3 x i32>], align 4
// ROW-CHECK-NEXT: [[HLSL_EWCAST_SRC:%.*]] = alloca [3 x <1 x i32>], align 4
// CHECK-NEXT: [[FLATCAST_TMP:%.*]] = alloca <3 x i32>, align 4
// CHECK-NEXT: store <3 x i32> %M, ptr [[M_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[M_ADDR]], align 4
// CHECK-NEXT: store <3 x i32> [[TMP0]], ptr [[HLSL_EWCAST_SRC]], align 4
// CHECK-NEXT: [[MATRIX_GEP:%.*]] = getelementptr inbounds <3 x i32>, ptr [[HLSL_EWCAST_SRC]], i32 0
// CHECK-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[FLATCAST_TMP]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr [[MATRIX_GEP]], align 4
// CHECK-NEXT: [[MATRIXEXT:%.*]] = extractelement <3 x i32> [[TMP2]], i32 0
// CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i32> [[TMP1]], i32 [[MATRIXEXT]], i64 0
// CHECK-NEXT: [[TMP4:%.*]] = load <3 x i32>, ptr [[MATRIX_GEP]], align 4
// CHECK-NEXT: [[MATRIXEXT1:%.*]] = extractelement <3 x i32> [[TMP4]], i32 1
// CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[MATRIXEXT1]], i64 1
// CHECK-NEXT: [[TMP6:%.*]] = load <3 x i32>, ptr [[MATRIX_GEP]], align 4
// CHECK-NEXT: [[MATRIXEXT2:%.*]] = extractelement <3 x i32> [[TMP6]], i32 2
// CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x i32> [[TMP5]], i32 [[MATRIXEXT2]], i64 2
// CHECK-NEXT: store <3 x i32> [[TMP7]], ptr [[V]], align 4
// CHECK-NEXT: ret void
export void call8(int3x1 M) {
int3 V = (int3)M;
}
// vector flat cast from matrix of same size (bool)
// Bool matrices are stored as i32 in memory (zext on entry) and converted
// back to i1 per element with `icmp ne ... 0` rather than a truncation —
// the nonzero conversion rule these checks pin down — then re-zext'd for
// the final i32 store.
// CHECK-LABEL: call9
// COL-CHECK: [[M_ADDR:%.*]] = alloca [2 x <1 x i32>], align 4
// ROW-CHECK: [[M_ADDR:%.*]] = alloca [1 x <2 x i32>], align 4
// CHECK-NEXT: [[V:%.*]] = alloca <2 x i32>, align 4
// COL-CHECK-NEXT: [[HLSL_EWCAST_SRC:%.*]] = alloca [2 x <1 x i32>], align 4
// ROW-CHECK-NEXT: [[HLSL_EWCAST_SRC:%.*]] = alloca [1 x <2 x i32>], align 4
// CHECK-NEXT: [[FLATCAST_TMP:%.*]] = alloca <2 x i1>, align 4
// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i1> %M to <2 x i32>
// CHECK-NEXT: store <2 x i32> [[TMP0]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[M_ADDR]], align 4
// CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[HLSL_EWCAST_SRC]], align 4
// CHECK-NEXT: [[MATRIX_GEP:%.*]] = getelementptr inbounds <2 x i32>, ptr [[HLSL_EWCAST_SRC]], i32 0
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i1>, ptr [[FLATCAST_TMP]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[MATRIX_GEP]], align 4
// CHECK-NEXT: [[MATRIXEXT:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
// CHECK-NEXT: [[LOADEDV:%.*]] = icmp ne i32 [[MATRIXEXT]], 0
// CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i1> [[TMP2]], i1 [[LOADEDV]], i64 0
// CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[MATRIX_GEP]], align 4
// CHECK-NEXT: [[MATRIXEXT1:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
// CHECK-NEXT: [[LOADEDV2:%.*]] = icmp ne i32 [[MATRIXEXT1]], 0
// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i1> [[TMP4]], i1 [[LOADEDV2]], i64 1
// CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i1> [[TMP6]] to <2 x i32>
// CHECK-NEXT: store <2 x i32> [[TMP7]], ptr [[V]], align 4
// CHECK-NEXT: ret void
export void call9(bool1x2 M) {
bool2 V = (bool2)M;
}
// Struct wrapping a bool vector, used by call10 to cover the bool
// memory-representation conversion when flat-casting out of an aggregate.
struct BoolVecStruct {
bool2 V;
};
// vector flat cast from struct containing bool vector
// Same nonzero bool conversion as call9 (`icmp ne i32 ... 0` per element,
// then zext back to <2 x i32> for storage), but the i32-backed source lives
// in a struct field reached through a struct GEP.
// CHECK-LABEL: call10
// CHECK: [[V:%.*]] = alloca <2 x i32>, align 4
// CHECK-NEXT: [[AGG_TEMP:%.*]] = alloca %struct.BoolVecStruct, align 1
// CHECK-NEXT: [[FLATCAST_TMP:%.*]] = alloca <2 x i1>, align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[AGG_TEMP]], ptr align 1 %s, i32 8, i1 false)
// CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr inbounds %struct.BoolVecStruct, ptr [[AGG_TEMP]], i32 0, i32 0
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i1>, ptr [[FLATCAST_TMP]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[VECTOR_GEP]], align 4
// CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
// CHECK-NEXT: [[LOADEDV:%.*]] = icmp ne i32 [[VECEXT]], 0
// CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i1> [[TMP0]], i1 [[LOADEDV]], i64 0
// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[VECTOR_GEP]], align 4
// CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
// CHECK-NEXT: [[LOADEDV2:%.*]] = icmp ne i32 [[VECEXT1]], 0
// CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i1> [[TMP2]], i1 [[LOADEDV2]], i64 1
// CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i1> [[TMP4]] to <2 x i32>
// CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[V]], align 4
// CHECK-NEXT: ret void
export void call10(BoolVecStruct s) {
bool2 V = (bool2)s;
}