diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp index b3586b45a23f..b1864c3dc9ee 100644 --- a/llvm/lib/IR/ReplaceConstant.cpp +++ b/llvm/lib/IR/ReplaceConstant.cpp @@ -22,9 +22,9 @@ static bool isExpandableUser(User *U) { return isa<ConstantExpr>(U) || isa<ConstantAggregate>(U); } -static SmallVector<Instruction *, 4> expandUser(BasicBlock::iterator InsertPt, - Constant *C) { - SmallVector<Instruction *, 4> NewInsts; +static void expandUser(BasicBlock::iterator InsertPt, Constant *C, + SmallVector<Instruction *, 4> &NewInsts) { + NewInsts.clear(); if (auto *CE = dyn_cast<ConstantExpr>(C)) { Instruction *ConstInst = CE->getAsInstruction(); ConstInst->insertBefore(*InsertPt->getParent(), InsertPt); @@ -46,7 +46,6 @@ static SmallVector<Instruction *, 4> expandUser(BasicBlock::iterator InsertPt, } else { llvm_unreachable("Not an expandable user"); } - return NewInsts; } bool llvm::convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts, @@ -91,6 +90,11 @@ bool llvm::convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts, // Replace those expandable operands with instructions bool Changed = false; + // We need to cache the instructions we've already expanded to avoid expanding + // the same constant multiple times in the same basic block, which is + // problematic when the same constant is used in a phi node multiple times. + DenseMap<std::pair<Constant *, BasicBlock *>, SmallVector<Instruction *, 4>> + ConstantToInstructionMap; while (!InstructionWorklist.empty()) { Instruction *I = InstructionWorklist.pop_back_val(); DebugLoc Loc = I->getDebugLoc(); @@ -105,7 +109,14 @@ bool llvm::convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts, if (auto *C = dyn_cast<Constant>(U.get())) { if (ExpandableUsers.contains(C)) { Changed = true; - auto NewInsts = expandUser(BI, C); + SmallVector<Instruction *, 4> &NewInsts = + ConstantToInstructionMap[std::make_pair(C, BI->getParent())]; + // If the cached instruction is after the insertion point, we need to + // create a new one. We can't simply move the cached instruction + // because its operands (also expanded instructions) might not + // dominate the new position. 
+ if (NewInsts.empty() || BI->comesBefore(NewInsts.front())) + expandUser(BI, C, NewInsts); for (auto *NI : NewInsts) NI->setDebugLoc(Loc); InstructionWorklist.insert_range(NewInsts); diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll index 4fef9624d8ad..459615139d74 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll @@ -14,13 +14,13 @@ ; Use constant from different kernels ;. -; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t poison, align 2 -; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t poison, align 2 -; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t poison, align 4 -; CHECK: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t poison, align 16 -; CHECK: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t poison, align 2 -; CHECK: @llvm.amdgcn.kernel.k5.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k5.lds.t poison, align 16 -; CHECK: @llvm.amdgcn.kernel.k6.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k6.lds.t poison, align 16 +; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t poison, align 2, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t poison, align 2, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t poison, align 4, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t poison, align 16, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t poison, align 2, !absolute_symbol !0 +; 
CHECK: @llvm.amdgcn.kernel.k5.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k5.lds.t poison, align 16, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.kernel.k6.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k6.lds.t poison, align 16, !absolute_symbol !0 ;. define amdgpu_kernel void @k0(i64 %x) { ; CHECK-LABEL: @k0( @@ -67,7 +67,7 @@ define amdgpu_kernel void @k3(i64 %x) { ; CHECK-LABEL: @k3( ; CHECK-NEXT: %1 = getelementptr inbounds [32 x i8], ptr addrspace(3) @llvm.amdgcn.kernel.k3.lds, i32 0, i32 16 ; CHECK-NEXT: %ptr1 = addrspacecast ptr addrspace(3) %1 to ptr -; CHECK-NEXT: store i64 1, ptr %ptr1, align 1 +; CHECK-NEXT: store i64 1, ptr %ptr1, align 16 ; CHECK-NEXT: %2 = getelementptr inbounds [32 x i8], ptr addrspace(3) @llvm.amdgcn.kernel.k3.lds, i32 0, i32 24 ; CHECK-NEXT: %ptr2 = addrspacecast ptr addrspace(3) %2 to ptr ; CHECK-NEXT: store i64 2, ptr %ptr2, align 8 @@ -98,9 +98,9 @@ define amdgpu_kernel void @k4(i64 %x) { ; Multiple constexpr use in a same instruction. define amdgpu_kernel void @k5() { ; CHECK-LABEL: @k5( -; CHECK-NEXT: %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k5.lds to ptr -; CHECK-NEXT: %2 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k5.lds to ptr -; CHECK-NEXT: call void poison(ptr %1, ptr %2) +; CHECK-NEXT: %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k5.lds to ptr +; CHECK-NEXT: call void poison(ptr %1, ptr %1) +; CHECK-NEXT: ret void ; call void poison(ptr addrspacecast (ptr addrspace(3) @lds.4 to ptr), ptr addrspacecast (ptr addrspace(3) @lds.4 to ptr)) ret void @@ -113,13 +113,22 @@ define amdgpu_kernel void @k5() { ; expression operands of store should be replaced by equivalent instruction sequences. 
define amdgpu_kernel void @k6() { ; CHECK-LABEL: @k6( - -; CHECK-NEXT: %1 = getelementptr inbounds [4 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.k6.lds, i32 0, i32 2 -; CHECK-NEXT: %2 = ptrtoint ptr addrspace(3) %1 to i32 -; CHECK-NEXT: %3 = getelementptr inbounds [4 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.k6.lds, i32 0, i32 2 -; CHECK-NEXT: store i32 %2, ptr addrspace(3) %3, align 8 -; CHECK-NEXT: ret void +; CHECK-NEXT: %1 = getelementptr inbounds [4 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.k6.lds, i32 0, i32 2 +; CHECK-NEXT: %2 = ptrtoint ptr addrspace(3) %1 to i32 +; CHECK-NEXT: %3 = getelementptr inbounds [4 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.k6.lds, i32 0, i32 2 +; CHECK-NEXT: store i32 %2, ptr addrspace(3) %3, align 8 +; CHECK-NEXT: ret void ; + store i32 ptrtoint (ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @lds.5, i32 0, i32 2) to i32), ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @lds.5, i32 0, i32 2) ret void } +;. +; CHECK: attributes #0 = { "amdgpu-lds-size"="2" } +; CHECK: attributes #1 = { "amdgpu-lds-size"="4" } +; CHECK: attributes #2 = { "amdgpu-lds-size"="32" } +; CHECK: attributes #3 = { "amdgpu-lds-size"="2020" } +; CHECK: attributes #4 = { "amdgpu-lds-size"="16" } +;. +; CHECK: !0 = !{i32 0, i32 1} +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll index a2761193c2d6..deb2d00e8bd8 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s @@ -9,73 +10,78 @@ @kern = addrspace(3) global float poison, align 4 ; @a_func is only used from a non-kernel function so is rewritten -; CHECK-NOT: @a_func ; @b_both is used from a non-kernel function so is rewritten -; CHECK-NOT: @b_both ; sorted both < func, so @b_both at null and @a_func at 4 @b_both = addrspace(3) global float poison, align 4 -; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4 -; CHECK: @llvm.amdgcn.kernel.timestwo.lds = internal addrspace(3) global %llvm.amdgcn.kernel.timestwo.lds.t poison, align 4 -; CHECK-LABEL: @get_func() -; CHECK: %0 = addrspacecast ptr addrspace(3) @llvm.amdgcn.module.lds to ptr -; CHECK: %1 = ptrtoint ptr %0 to i64 -; CHECK: %2 = addrspacecast ptr addrspace(3) @llvm.amdgcn.module.lds to ptr -; CHECK: %3 = ptrtoint ptr %2 to i64 -; CHECK: %4 = add i64 %1, %3 -; CHECK: %5 = inttoptr i64 %4 to ptr -; CHECK: %6 = load i32, ptr %5, align 4 -; CHECK: ret i32 %6 define i32 @get_func() local_unnamed_addr #0 { +; CHECK-LABEL: define i32 @get_func() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(3) @llvm.amdgcn.module.lds to ptr +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] 
= inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 +; CHECK-NEXT: ret i32 [[TMP4]] +; entry: %0 = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @a_func to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @a_func to ptr) to i64)) to ptr), align 4 ret i32 %0 } -; CHECK-LABEL: @set_func(i32 %x) -; CHECK: %0 = addrspacecast ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.module.lds.t, ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1) to ptr -; CHECK: %1 = ptrtoint ptr %0 to i64 -; CHECK: %2 = addrspacecast ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.module.lds.t, ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1) to ptr -; CHECK: %3 = ptrtoint ptr %2 to i64 -; CHECK: %4 = add i64 %1, %3 -; CHECK: %5 = inttoptr i64 %4 to ptr -; CHECK: store i32 %x, ptr %5, align 4 -; CHECK: ret void define void @set_func(i32 %x) { +; CHECK-LABEL: define void @set_func( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1) to ptr +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: store i32 [[X]], ptr [[TMP3]], align 4 +; CHECK-NEXT: ret void +; entry: store i32 %x, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @b_both to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @b_both to ptr) to i64)) to ptr), align 4 ret void } -; CHECK-LABEL: @timestwo() #0 -; CHECK-NOT: call void @llvm.donothing() -; CHECK: %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.timestwo.lds to ptr -; CHECK: %2 = ptrtoint ptr %1 to i64 -; CHECK: %3 = addrspacecast ptr addrspace(3) getelementptr inbounds 
(%llvm.amdgcn.kernel.timestwo.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 1) to ptr -; CHECK: %4 = ptrtoint ptr %3 to i64 -; CHECK: %5 = add i64 %2, %4 -; CHECK: %6 = inttoptr i64 %5 to ptr -; CHECK: %ld = load i32, ptr %6, align 4 -; CHECK: %mul = mul i32 %ld, 2 -; CHECK: %7 = addrspacecast ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 1) to ptr -; CHECK: %8 = ptrtoint ptr %7 to i64 -; CHECK: %9 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.timestwo.lds to ptr -; CHECK: %10 = ptrtoint ptr %9 to i64 -; CHECK: %11 = add i64 %8, %10 -; CHECK: %12 = inttoptr i64 %11 to ptr -; CHECK: store i32 %mul, ptr %12, align 4 -; CHECK: ret void define amdgpu_kernel void @timestwo() { +; CHECK-LABEL: define amdgpu_kernel void @timestwo( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.timestwo.lds to ptr +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_TIMESTWO_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 1) to ptr +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_TIMESTWO_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 1) to ptr +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.timestwo.lds to ptr +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 
[[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: store i32 [[MUL]], ptr [[TMP12]], align 4 +; CHECK-NEXT: ret void +; %ld = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @b_both to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @kern to ptr) to i64)) to ptr), align 4 %mul = mul i32 %ld, 2 store i32 %mul, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @kern to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @b_both to ptr) to i64)) to ptr), align 4 ret void } -; CHECK-LABEL: @through_functions() #0 define amdgpu_kernel void @through_functions() { +; CHECK-LABEL: define amdgpu_kernel void @through_functions( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: [[LD:%.*]] = call i32 @get_func() +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 4 +; CHECK-NEXT: call void @set_func(i32 [[MUL]]) +; CHECK-NEXT: ret void +; %ld = call i32 @get_func() %mul = mul i32 %ld, 4 call void @set_func(i32 %mul) diff --git a/llvm/test/CodeGen/AMDGPU/same-lds-variable-multiple-use-in-one-phi-node.ll b/llvm/test/CodeGen/AMDGPU/same-lds-variable-multiple-use-in-one-phi-node.ll new file mode 100644 index 000000000000..35a9bee03411 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/same-lds-variable-multiple-use-in-one-phi-node.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-module-lds %s -o - | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-module-lds %s -o - | FileCheck %s + +@lds = internal unnamed_addr addrspace(3) global [6144 x half] poison, align 2 + +define amdgpu_kernel void @test(ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_kernel void @test( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) 
#[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: switch i32 0, label %[[BB_3:.*]] [ +; CHECK-NEXT: i32 18, label %[[BB_2:.*]] +; CHECK-NEXT: i32 1, label %[[BB_2]] +; CHECK-NEXT: i32 0, label %[[BB_3]] +; CHECK-NEXT: ] +; CHECK: [[BB_1:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.test.lds to ptr +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +; CHECK-NEXT: switch i32 0, label %[[BB_3]] [ +; CHECK-NEXT: i32 18, label %[[BB_2]] +; CHECK-NEXT: i32 1, label %[[BB_2]] +; CHECK-NEXT: i32 0, label %[[BB_3]] +; CHECK-NEXT: ] +; CHECK: [[BB_2]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[TMP1]], %[[BB_1]] ], [ [[TMP1]], %[[BB_1]] ], [ 10, %[[ENTRY]] ], [ 10, %[[ENTRY]] ] +; CHECK-NEXT: store i64 [[PHI]], ptr addrspace(1) [[OUT]], align 8 +; CHECK-NEXT: br label %[[BB_3]] +; CHECK: [[BB_3]]: +; CHECK-NEXT: ret void +; +entry: + switch i32 0, label %bb.3 [ + i32 18, label %bb.2 + i32 1, label %bb.2 + i32 0, label %bb.3 + ] +bb.1: + switch i32 0, label %bb.3 [ + i32 18, label %bb.2 + i32 1, label %bb.2 + i32 0, label %bb.3 + ] + +bb.2: + %phi = phi i64 [ ptrtoint (ptr addrspacecast (ptr addrspace(3) @lds to ptr) to i64), %bb.1 ], [ ptrtoint (ptr addrspacecast (ptr addrspace(3) @lds to ptr) to i64), %bb.1 ], [10, %entry], [10, %entry] + store i64 %phi, ptr addrspace(1) %out, align 8 + br label %bb.3 + +bb.3: + ret void +}