diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-vectorizable-call-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-vectorizable-call-operand.ll
deleted file mode 100644
index 08a096e42d15..000000000000
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-vectorizable-call-operand.ll
+++ /dev/null
@@ -1,110 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt < %s -passes=slp-vectorizer -slp-threshold=-999 -S -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-
-declare double @user_func(double)
-declare double @another_user_func(double, double)
-
-; Two consecutive non-vectorizable calls feeding stores to non-consecutive
-; addresses. The fmul operand chain is only reachable through the calls, so
-; without operand-chain seeding from non-vectorizable calls the loads/fmuls
-; remain scalar.
-define void @test_two_calls_with_vectorizable_operands(ptr %p, ptr %out0, ptr %out1) {
-; CHECK-LABEL: define void @test_two_calls_with_vectorizable_operands(
-; CHECK-SAME: ptr [[P:%.*]], ptr [[OUT0:%.*]], ptr [[OUT1:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[A0:%.*]] = load double, ptr [[P]], align 8
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr double, ptr [[P]], i64 1
-; CHECK-NEXT:    [[A1:%.*]] = load double, ptr [[P1]], align 8
-; CHECK-NEXT:    [[M0:%.*]] = fmul double [[A0]], 2.000000e+00
-; CHECK-NEXT:    [[M1:%.*]] = fmul double [[A1]], 3.000000e+00
-; CHECK-NEXT:    [[R0:%.*]] = call double @user_func(double [[M0]])
-; CHECK-NEXT:    [[R1:%.*]] = call double @user_func(double [[M1]])
-; CHECK-NEXT:    store double [[R0]], ptr [[OUT0]], align 8
-; CHECK-NEXT:    store double [[R1]], ptr [[OUT1]], align 8
-; CHECK-NEXT:    ret void
-;
-  %a0 = load double, ptr %p, align 8
-  %p1 = getelementptr double, ptr %p, i64 1
-  %a1 = load double, ptr %p1, align 8
-  %m0 = fmul double %a0, 2.000000e+00
-  %m1 = fmul double %a1, 3.000000e+00
-  %r0 = call double @user_func(double %m0)
-  %r1 = call double @user_func(double %m1)
-  store double %r0, ptr %out0, align 8
-  store double %r1, ptr %out1, align 8
-  ret void
-}
-
-; Two non-vectorizable calls with paired operands across calls. Both arguments
-; should be vectorized as <2 x double> bundles.
-define void @test_two_calls_paired_operands(ptr %p, ptr %q, ptr %out0, ptr %out1) {
-; CHECK-LABEL: define void @test_two_calls_paired_operands(
-; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[OUT0:%.*]], ptr [[OUT1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[A0:%.*]] = load double, ptr [[P]], align 8
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr double, ptr [[P]], i64 1
-; CHECK-NEXT:    [[A1:%.*]] = load double, ptr [[P1]], align 8
-; CHECK-NEXT:    [[B0:%.*]] = load double, ptr [[Q]], align 8
-; CHECK-NEXT:    [[Q1:%.*]] = getelementptr double, ptr [[Q]], i64 1
-; CHECK-NEXT:    [[B1:%.*]] = load double, ptr [[Q1]], align 8
-; CHECK-NEXT:    [[M0:%.*]] = fmul double [[A0]], 2.000000e+00
-; CHECK-NEXT:    [[M1:%.*]] = fmul double [[A1]], 3.000000e+00
-; CHECK-NEXT:    [[N0:%.*]] = fadd double [[B0]], 4.000000e+00
-; CHECK-NEXT:    [[N1:%.*]] = fadd double [[B1]], 5.000000e+00
-; CHECK-NEXT:    [[R0:%.*]] = call double @another_user_func(double [[M0]], double [[N0]])
-; CHECK-NEXT:    [[R1:%.*]] = call double @another_user_func(double [[M1]], double [[N1]])
-; CHECK-NEXT:    store double [[R0]], ptr [[OUT0]], align 8
-; CHECK-NEXT:    store double [[R1]], ptr [[OUT1]], align 8
-; CHECK-NEXT:    ret void
-;
-  %a0 = load double, ptr %p, align 8
-  %p1 = getelementptr double, ptr %p, i64 1
-  %a1 = load double, ptr %p1, align 8
-  %b0 = load double, ptr %q, align 8
-  %q1 = getelementptr double, ptr %q, i64 1
-  %b1 = load double, ptr %q1, align 8
-  %m0 = fmul double %a0, 2.000000e+00
-  %m1 = fmul double %a1, 3.000000e+00
-  %n0 = fadd double %b0, 4.000000e+00
-  %n1 = fadd double %b1, 5.000000e+00
-  %r0 = call double @another_user_func(double %m0, double %n0)
-  %r1 = call double @another_user_func(double %m1, double %n1)
-  store double %r0, ptr %out0, align 8
-  store double %r1, ptr %out1, align 8
-  ret void
-}
-
-; Memory intrinsics must not be added to the post-process call list.
-define void @test_skip_memcpy(ptr %dst, ptr %src) {
-; CHECK-LABEL: define void @test_skip_memcpy(
-; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 16, i1 false)
-; CHECK-NEXT:    ret void
-;
-  call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 16, i1 false)
-  ret void
-}
-
-; Trivially vectorizable intrinsics are still handled by the regular path -
-; this confirms the existing behavior is preserved.
-define void @test_trivially_vectorizable_intrinsic(ptr %p, ptr %out) {
-; CHECK-LABEL: define void @test_trivially_vectorizable_intrinsic(
-; CHECK-SAME: ptr [[P:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[P]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]])
-; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[OUT]], align 8
-; CHECK-NEXT:    ret void
-;
-  %a0 = load double, ptr %p, align 8
-  %p1 = getelementptr double, ptr %p, i64 1
-  %a1 = load double, ptr %p1, align 8
-  %r0 = call double @llvm.fabs.f64(double %a0)
-  %r1 = call double @llvm.fabs.f64(double %a1)
-  store double %r0, ptr %out, align 8
-  %out1 = getelementptr double, ptr %out, i64 1
-  store double %r1, ptr %out1, align 8
-  ret void
-}
-
-declare double @llvm.fabs.f64(double)
-declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-vectorizable-inst-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-vectorizable-inst-operand.ll
new file mode 100644
index 000000000000..dc91352d0f2b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-vectorizable-inst-operand.ll
@@ -0,0 +1,282 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=slp-vectorizer -slp-threshold=-999 -S -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+declare double @user_func(double)
+declare double @another_user_func(double, double)
+declare i32 @__gxx_personality_v0(...)
+
+; Two consecutive non-vectorizable calls feeding stores to non-consecutive
+; addresses. The fmul operand chain is only reachable through the calls, so
+; without operand-chain seeding from non-vectorizable calls the loads/fmuls
+; remain scalar.
+define void @test_two_calls_with_vectorizable_operands(ptr %p, ptr %out0, ptr %out1) {
+; CHECK-LABEL: define void @test_two_calls_with_vectorizable_operands(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[OUT0:%.*]], ptr [[OUT1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[A0:%.*]] = load double, ptr [[P]], align 8
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr double, ptr [[P]], i64 1
+; CHECK-NEXT:    [[A1:%.*]] = load double, ptr [[P1]], align 8
+; CHECK-NEXT:    [[M0:%.*]] = fmul double [[A0]], 2.000000e+00
+; CHECK-NEXT:    [[M1:%.*]] = fmul double [[A1]], 3.000000e+00
+; CHECK-NEXT:    [[R0:%.*]] = call double @user_func(double [[M0]])
+; CHECK-NEXT:    [[R1:%.*]] = call double @user_func(double [[M1]])
+; CHECK-NEXT:    store double [[R0]], ptr [[OUT0]], align 8
+; CHECK-NEXT:    store double [[R1]], ptr [[OUT1]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a0 = load double, ptr %p, align 8
+  %p1 = getelementptr double, ptr %p, i64 1
+  %a1 = load double, ptr %p1, align 8
+  %m0 = fmul double %a0, 2.000000e+00
+  %m1 = fmul double %a1, 3.000000e+00
+  %r0 = call double @user_func(double %m0)
+  %r1 = call double @user_func(double %m1)
+  store double %r0, ptr %out0, align 8
+  store double %r1, ptr %out1, align 8
+  ret void
+}
+
+; Two non-vectorizable calls with paired operands across calls. Both arguments
+; should be vectorized as <2 x double> bundles.
+define void @test_two_calls_paired_operands(ptr %p, ptr %q, ptr %out0, ptr %out1) {
+; CHECK-LABEL: define void @test_two_calls_paired_operands(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[OUT0:%.*]], ptr [[OUT1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A0:%.*]] = load double, ptr [[P]], align 8
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr double, ptr [[P]], i64 1
+; CHECK-NEXT:    [[A1:%.*]] = load double, ptr [[P1]], align 8
+; CHECK-NEXT:    [[B0:%.*]] = load double, ptr [[Q]], align 8
+; CHECK-NEXT:    [[Q1:%.*]] = getelementptr double, ptr [[Q]], i64 1
+; CHECK-NEXT:    [[B1:%.*]] = load double, ptr [[Q1]], align 8
+; CHECK-NEXT:    [[M0:%.*]] = fmul double [[A0]], 2.000000e+00
+; CHECK-NEXT:    [[M1:%.*]] = fmul double [[A1]], 3.000000e+00
+; CHECK-NEXT:    [[N0:%.*]] = fadd double [[B0]], 4.000000e+00
+; CHECK-NEXT:    [[N1:%.*]] = fadd double [[B1]], 5.000000e+00
+; CHECK-NEXT:    [[R0:%.*]] = call double @another_user_func(double [[M0]], double [[N0]])
+; CHECK-NEXT:    [[R1:%.*]] = call double @another_user_func(double [[M1]], double [[N1]])
+; CHECK-NEXT:    store double [[R0]], ptr [[OUT0]], align 8
+; CHECK-NEXT:    store double [[R1]], ptr [[OUT1]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a0 = load double, ptr %p, align 8
+  %p1 = getelementptr double, ptr %p, i64 1
+  %a1 = load double, ptr %p1, align 8
+  %b0 = load double, ptr %q, align 8
+  %q1 = getelementptr double, ptr %q, i64 1
+  %b1 = load double, ptr %q1, align 8
+  %m0 = fmul double %a0, 2.000000e+00
+  %m1 = fmul double %a1, 3.000000e+00
+  %n0 = fadd double %b0, 4.000000e+00
+  %n1 = fadd double %b1, 5.000000e+00
+  %r0 = call double @another_user_func(double %m0, double %n0)
+  %r1 = call double @another_user_func(double %m1, double %n1)
+  store double %r0, ptr %out0, align 8
+  store double %r1, ptr %out1, align 8
+  ret void
+}
+
+; Memory intrinsics must not be added to the post-process call list.
+define void @test_skip_memcpy(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @test_skip_memcpy(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 16, i1 false)
+  ret void
+}
+
+; Trivially vectorizable intrinsics are still handled by the regular path -
+; this confirms the existing behavior is preserved.
+define void @test_trivially_vectorizable_intrinsic(ptr %p, ptr %out) {
+; CHECK-LABEL: define void @test_trivially_vectorizable_intrinsic(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[P]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]])
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a0 = load double, ptr %p, align 8
+  %p1 = getelementptr double, ptr %p, i64 1
+  %a1 = load double, ptr %p1, align 8
+  %r0 = call double @llvm.fabs.f64(double %a0)
+  %r1 = call double @llvm.fabs.f64(double %a1)
+  store double %r0, ptr %out, align 8
+  %out1 = getelementptr double, ptr %out, i64 1
+  store double %r1, ptr %out1, align 8
+  ret void
+}
+
+define void @test_two_invokes_with_vectorizable_operands(ptr %p, ptr %out0, ptr %out1) personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: define void @test_two_invokes_with_vectorizable_operands(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[OUT0:%.*]], ptr [[OUT1:%.*]]) #[[ATTR0]] personality ptr @__gxx_personality_v0 {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[A0:%.*]] = load double, ptr [[P]], align 8
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr double, ptr [[P]], i64 1
+; CHECK-NEXT:    [[A1:%.*]] = load double, ptr [[P1]], align 8
+; CHECK-NEXT:    [[M0:%.*]] = fmul double [[A0]], 2.000000e+00
+; CHECK-NEXT:    [[M1:%.*]] = fmul double [[A1]], 3.000000e+00
+; CHECK-NEXT:    [[R0:%.*]] = invoke double @user_func(double [[M0]])
+; CHECK-NEXT:            to label %[[CONT1:.*]] unwind label %[[EH:.*]]
+; CHECK:       [[CONT1]]:
+; CHECK-NEXT:    [[R1:%.*]] = invoke double @user_func(double [[M1]])
+; CHECK-NEXT:            to label %[[CONT2:.*]] unwind label %[[EH]]
+; CHECK:       [[CONT2]]:
+; CHECK-NEXT:    store double [[R0]], ptr [[OUT0]], align 8
+; CHECK-NEXT:    store double [[R1]], ptr [[OUT1]], align 8
+; CHECK-NEXT:    ret void
+; CHECK:       [[EH]]:
+; CHECK-NEXT:    [[LP:%.*]] = landingpad { ptr, i32 }
+; CHECK-NEXT:            cleanup
+; CHECK-NEXT:    resume { ptr, i32 } [[LP]]
+;
+entry:
+  %a0 = load double, ptr %p, align 8
+  %p1 = getelementptr double, ptr %p, i64 1
+  %a1 = load double, ptr %p1, align 8
+  %m0 = fmul double %a0, 2.000000e+00
+  %m1 = fmul double %a1, 3.000000e+00
+  %r0 = invoke double @user_func(double %m0) to label %cont1 unwind label %eh
+cont1:
+  %r1 = invoke double @user_func(double %m1) to label %cont2 unwind label %eh
+cont2:
+  store double %r0, ptr %out0, align 8
+  store double %r1, ptr %out1, align 8
+  ret void
+eh:
+  %lp = landingpad { ptr, i32 } cleanup
+  resume { ptr, i32 } %lp
+}
+
+define void @test_atomicrmw_value_chain(ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %src) {
+; CHECK-LABEL: define void @test_atomicrmw_value_chain(
+; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]], ptr [[P3:%.*]], ptr [[P4:%.*]], ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A0:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[SP1:%.*]] = getelementptr i32, ptr [[SRC]], i64 1
+; CHECK-NEXT:    [[A1:%.*]] = load i32, ptr [[SP1]], align 4
+; CHECK-NEXT:    [[SP2:%.*]] = getelementptr i32, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[A2:%.*]] = load i32, ptr [[SP2]], align 4
+; CHECK-NEXT:    [[SP3:%.*]] = getelementptr i32, ptr [[SRC]], i64 3
+; CHECK-NEXT:    [[A3:%.*]] = load i32, ptr [[SP3]], align 4
+; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[A0]], 7
+; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[A1]], 7
+; CHECK-NEXT:    [[V2:%.*]] = mul i32 [[A2]], 7
+; CHECK-NEXT:    [[V3:%.*]] = mul i32 [[A3]], 7
+; CHECK-NEXT:    [[OLD0:%.*]] = atomicrmw add ptr [[P1]], i32 [[V0]] seq_cst, align 4
+; CHECK-NEXT:    [[OLD1:%.*]] = atomicrmw add ptr [[P2]], i32 [[V1]] seq_cst, align 4
+; CHECK-NEXT:    [[OLD2:%.*]] = atomicrmw add ptr [[P3]], i32 [[V2]] seq_cst, align 4
+; CHECK-NEXT:    [[OLD3:%.*]] = atomicrmw add ptr [[P4]], i32 [[V3]] seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  %a0 = load i32, ptr %src, align 4
+  %sp1 = getelementptr i32, ptr %src, i64 1
+  %a1 = load i32, ptr %sp1, align 4
+  %sp2 = getelementptr i32, ptr %src, i64 2
+  %a2 = load i32, ptr %sp2, align 4
+  %sp3 = getelementptr i32, ptr %src, i64 3
+  %a3 = load i32, ptr %sp3, align 4
+  %v0 = mul i32 %a0, 7
+  %v1 = mul i32 %a1, 7
+  %v2 = mul i32 %a2, 7
+  %v3 = mul i32 %a3, 7
+  %old0 = atomicrmw add ptr %p1, i32 %v0 seq_cst
+  %old1 = atomicrmw add ptr %p2, i32 %v1 seq_cst
+  %old2 = atomicrmw add ptr %p3, i32 %v2 seq_cst
+  %old3 = atomicrmw add ptr %p4, i32 %v3 seq_cst
+  ret void
+}
+
+define void @test_cmpxchg_value_chain(ptr %p1, ptr %p2, ptr %src_a, ptr %src_b) {
+; CHECK-LABEL: define void @test_cmpxchg_value_chain(
+; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]], ptr [[SRC_A:%.*]], ptr [[SRC_B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A0:%.*]] = load i32, ptr [[SRC_A]], align 4
+; CHECK-NEXT:    [[AP1:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 1
+; CHECK-NEXT:    [[A1:%.*]] = load i32, ptr [[AP1]], align 4
+; CHECK-NEXT:    [[B0:%.*]] = load i32, ptr [[SRC_B]], align 4
+; CHECK-NEXT:    [[BP1:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 1
+; CHECK-NEXT:    [[B1:%.*]] = load i32, ptr [[BP1]], align 4
+; CHECK-NEXT:    [[CMP0:%.*]] = mul i32 [[A0]], 11
+; CHECK-NEXT:    [[CMP1:%.*]] = mul i32 [[A1]], 11
+; CHECK-NEXT:    [[NEW0:%.*]] = mul i32 [[B0]], 13
+; CHECK-NEXT:    [[NEW1:%.*]] = mul i32 [[B1]], 13
+; CHECK-NEXT:    [[R0:%.*]] = cmpxchg ptr [[P1]], i32 [[CMP0]], i32 [[NEW0]] seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[R1:%.*]] = cmpxchg ptr [[P2]], i32 [[CMP1]], i32 [[NEW1]] seq_cst seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  %a0 = load i32, ptr %src_a, align 4
+  %ap1 = getelementptr i32, ptr %src_a, i64 1
+  %a1 = load i32, ptr %ap1, align 4
+  %b0 = load i32, ptr %src_b, align 4
+  %bp1 = getelementptr i32, ptr %src_b, i64 1
+  %b1 = load i32, ptr %bp1, align 4
+  %cmp0 = mul i32 %a0, 11
+  %cmp1 = mul i32 %a1, 11
+  %new0 = mul i32 %b0, 13
+  %new1 = mul i32 %b1, 13
+  %r0 = cmpxchg ptr %p1, i32 %cmp0, i32 %new0 seq_cst seq_cst
+  %r1 = cmpxchg ptr %p2, i32 %cmp1, i32 %new1 seq_cst seq_cst
+  ret void
+}
+
+define i32 @test_return_operand_chain(i1 %cond, i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @test_return_operand_chain(
+; CHECK-SAME: i1 [[COND:%.*]], i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[A:.*]], label %[[B:.*]]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    [[M0:%.*]] = mul i32 [[X]], 5
+; CHECK-NEXT:    ret i32 [[M0]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    [[M1:%.*]] = mul i32 [[Y]], 5
+; CHECK-NEXT:    ret i32 [[M1]]
+;
+entry:
+  br i1 %cond, label %a, label %b
+a:
+  %m0 = mul i32 %x, 5
+  ret i32 %m0
+b:
+  %m1 = mul i32 %y, 5
+  ret i32 %m1
+}
+
+define void @test_scattered_stores(ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %src) {
+; CHECK-LABEL: define void @test_scattered_stores(
+; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]], ptr [[P3:%.*]], ptr [[P4:%.*]], ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A0:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT:    [[SP1:%.*]] = getelementptr double, ptr [[SRC]], i64 1
+; CHECK-NEXT:    [[A1:%.*]] = load double, ptr [[SP1]], align 8
+; CHECK-NEXT:    [[SP2:%.*]] = getelementptr double, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[A2:%.*]] = load double, ptr [[SP2]], align 8
+; CHECK-NEXT:    [[SP3:%.*]] = getelementptr double, ptr [[SRC]], i64 3
+; CHECK-NEXT:    [[A3:%.*]] = load double, ptr [[SP3]], align 8
+; CHECK-NEXT:    [[V0:%.*]] = fmul double [[A0]], 2.000000e+00
+; CHECK-NEXT:    [[V1:%.*]] = fmul double [[A1]], 3.000000e+00
+; CHECK-NEXT:    [[V2:%.*]] = fmul double [[A2]], 5.000000e+00
+; CHECK-NEXT:    [[V3:%.*]] = fmul double [[A3]], 7.000000e+00
+; CHECK-NEXT:    store double [[V0]], ptr [[P1]], align 8
+; CHECK-NEXT:    store double [[V1]], ptr [[P2]], align 8
+; CHECK-NEXT:    store double [[V2]], ptr [[P3]], align 8
+; CHECK-NEXT:    store double [[V3]], ptr [[P4]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a0 = load double, ptr %src, align 8
+  %sp1 = getelementptr double, ptr %src, i64 1
+  %a1 = load double, ptr %sp1, align 8
+  %sp2 = getelementptr double, ptr %src, i64 2
+  %a2 = load double, ptr %sp2, align 8
+  %sp3 = getelementptr double, ptr %src, i64 3
+  %a3 = load double, ptr %sp3, align 8
+  %v0 = fmul double %a0, 2.000000e+00
+  %v1 = fmul double %a1, 3.000000e+00
+  %v2 = fmul double %a2, 5.000000e+00
+  %v3 = fmul double %a3, 7.000000e+00
+  store double %v0, ptr %p1, align 8
+  store double %v1, ptr %p2, align 8
+  store double %v2, ptr %p3, align 8
+  store double %v3, ptr %p4, align 8
+  ret void
+}
+
+declare double @llvm.fabs.f64(double)
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)