From 8991ce9cff7b4e1b72c19e202b7bfe3d36499aba Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 2 Apr 2026 17:41:34 +0100 Subject: [PATCH] [AMDGPU] Add basic clmul test coverage (#190205) --- llvm/test/CodeGen/AMDGPU/clmul.ll | 4371 +++++++++++++++++++++++++++++ 1 file changed, 4371 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/clmul.ll diff --git a/llvm/test/CodeGen/AMDGPU/clmul.ll b/llvm/test/CodeGen/AMDGPU/clmul.ll new file mode 100644 index 000000000000..03cf3da9bde1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/clmul.ll @@ -0,0 +1,4371 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1250 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX1250 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s + +define amdgpu_kernel void @test_clmul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: test_clmul_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_readfirstlane_b32 s4, v0 +; SI-NEXT: s_and_b32 s6, s5, 2 +; SI-NEXT: s_and_b32 s7, s5, 1 +; SI-NEXT: s_and_b32 s8, s5, 4 +; SI-NEXT: s_mul_i32 s6, s4, s6 +; SI-NEXT: s_mul_i32 s7, s4, s7 +; SI-NEXT: s_and_b32 s9, s5, 8 +; SI-NEXT: s_mul_i32 s8, s4, s8 +; SI-NEXT: s_xor_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s10, s5, 16 +; SI-NEXT: s_mul_i32 s9, s4, s9 +; SI-NEXT: s_xor_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s11, s5, 32 +; SI-NEXT: s_mul_i32 s10, s4, s10 +; SI-NEXT: s_xor_b32 s6, s6, s9 +; SI-NEXT: s_and_b32 s12, s5, 64 +; SI-NEXT: s_mul_i32 s11, s4, s11 +; SI-NEXT: s_xor_b32 s6, s6, s10 +; SI-NEXT: s_and_b32 s13, s5, 0x80 +; SI-NEXT: s_mul_i32 s12, s4, s12 +; SI-NEXT: s_xor_b32 s6, s6, s11 +; SI-NEXT: s_and_b32 s14, s5, 0x100 +; SI-NEXT: s_mul_i32 s13, s4, s13 +; SI-NEXT: s_xor_b32 s6, s6, s12 +; SI-NEXT: s_and_b32 s15, s5, 0x200 +; SI-NEXT: s_mul_i32 s14, s4, s14 +; SI-NEXT: s_xor_b32 s6, s6, s13 +; SI-NEXT: s_and_b32 s16, s5, 0x400 +; SI-NEXT: s_mul_i32 s15, s4, s15 +; SI-NEXT: s_xor_b32 s6, s6, s14 +; SI-NEXT: s_and_b32 s17, s5, 0x800 +; SI-NEXT: s_mul_i32 s16, s4, s16 +; SI-NEXT: s_xor_b32 s6, s6, s15 +; SI-NEXT: s_and_b32 s18, s5, 0x1000 +; SI-NEXT: s_mul_i32 s17, s4, s17 +; SI-NEXT: s_xor_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s19, s5, 0x2000 +; SI-NEXT: s_mul_i32 s18, s4, s18 +; SI-NEXT: s_xor_b32 s6, s6, s17 +; SI-NEXT: s_and_b32 s20, s5, 0x4000 +; SI-NEXT: s_mul_i32 s19, s4, s19 +; SI-NEXT: s_xor_b32 s6, s6, s18 +; SI-NEXT: s_and_b32 s21, s5, 0x8000 +; SI-NEXT: s_mul_i32 s20, s4, s20 +; SI-NEXT: s_xor_b32 s6, s6, s19 +; SI-NEXT: s_and_b32 s22, s5, 0x10000 +; SI-NEXT: s_mul_i32 s21, s4, s21 +; SI-NEXT: s_xor_b32 s6, s6, s20 +; SI-NEXT: s_and_b32 s23, s5, 0x20000 +; SI-NEXT: s_mul_i32 s22, s4, s22 +; SI-NEXT: s_xor_b32 s6, s6, s21 +; SI-NEXT: s_and_b32 s24, s5, 0x40000 +; SI-NEXT: s_mul_i32 s23, s4, s23 +; SI-NEXT: s_xor_b32 s6, s6, s22 +; SI-NEXT: s_and_b32 s25, s5, 0x80000 +; SI-NEXT: s_mul_i32 s24, s4, s24 +; SI-NEXT: s_xor_b32 s6, s6, s23 +; SI-NEXT: s_and_b32 s26, s5, 0x100000 +; SI-NEXT: s_mul_i32 s25, s4, s25 +; SI-NEXT: s_xor_b32 s6, s6, s24 +; SI-NEXT: s_and_b32 s27, s5, 0x200000 +; SI-NEXT: s_mul_i32 s26, s4, s26 +; SI-NEXT: s_xor_b32 s6, s6, s25 +; SI-NEXT: s_and_b32 s28, s5, 0x400000 +; SI-NEXT: s_mul_i32 s27, s4, s27 +; SI-NEXT: s_xor_b32 s6, s6, s26 +; SI-NEXT: s_and_b32 s29, s5, 0x800000 +; SI-NEXT: s_mul_i32 s28, s4, s28 +; SI-NEXT: s_xor_b32 s6, s6, s27 +; SI-NEXT: s_and_b32 s30, s5, 0x1000000 +; SI-NEXT: s_mul_i32 s29, s4, s29 +; SI-NEXT: s_xor_b32 s6, s6, s28 +; SI-NEXT: s_and_b32 s31, s5, 0x2000000 +; SI-NEXT: s_mul_i32 s30, s4, s30 +; SI-NEXT: s_xor_b32 s6, s6, s29 +; SI-NEXT: s_and_b32 s33, s5, 0x4000000 +; SI-NEXT: s_mul_i32 s31, s4, s31 +; SI-NEXT: s_xor_b32 s6, s6, s30 +; SI-NEXT: s_and_b32 s34, s5, 0x8000000 +; SI-NEXT: s_mul_i32 s33, s4, s33 +; SI-NEXT: s_xor_b32 s6, s6, s31 +; SI-NEXT: s_and_b32 s35, s5, 0x10000000 +; SI-NEXT: s_mul_i32 s34, s4, s34 +; SI-NEXT: s_xor_b32 s6, s6, s33 +; SI-NEXT: s_and_b32 s36, s5, 0x20000000 +; SI-NEXT: s_mul_i32 s35, s4, s35 +; SI-NEXT: s_xor_b32 s6, s6, s34 +; SI-NEXT: s_and_b32 s37, s5, 2.0 +; SI-NEXT: s_mul_i32 s36, s4, s36 +; SI-NEXT: s_xor_b32 s6, s6, s35 +; SI-NEXT: s_and_b32 s5, s5, 0x80000000 +; SI-NEXT: s_mul_i32 s37, s4, s37 +; SI-NEXT: s_xor_b32 s6, s6, s36 +; SI-NEXT: s_xor_b32 s6, s6, s37 +; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_xor_b32 s4, s6, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_clmul_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_readfirstlane_b32 s5, v1 +; VI-NEXT: v_readfirstlane_b32 s4, v0 +; VI-NEXT: s_and_b32 s6, s5, 2 +; VI-NEXT: s_and_b32 s7, s5, 1 +; VI-NEXT: s_and_b32 s8, s5, 4 +; VI-NEXT: s_mul_i32 s6, s4, s6 +; VI-NEXT: s_mul_i32 s7, s4, s7 +; VI-NEXT: s_and_b32 s9, s5, 8 +; VI-NEXT: s_mul_i32 s8, s4, s8 +; VI-NEXT: s_xor_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s10, s5, 16 +; VI-NEXT: s_mul_i32 s9, s4, s9 +; VI-NEXT: s_xor_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s11, s5, 32 +; VI-NEXT: s_mul_i32 s10, s4, s10 +; VI-NEXT: s_xor_b32 s6, s6, s9 +; VI-NEXT: s_and_b32 s12, s5, 64 +; VI-NEXT: s_mul_i32 s11, s4, s11 +; VI-NEXT: s_xor_b32 s6, s6, s10 +; VI-NEXT: s_and_b32 s13, s5, 0x80 +; VI-NEXT: s_mul_i32 s12, s4, s12 +; VI-NEXT: s_xor_b32 s6, s6, s11 +; VI-NEXT: s_and_b32 s14, s5, 0x100 +; VI-NEXT: s_mul_i32 s13, s4, s13 +; VI-NEXT: s_xor_b32 s6, s6, s12 +; VI-NEXT: s_and_b32 s15, s5, 0x200 +; VI-NEXT: s_mul_i32 s14, s4, s14 +; VI-NEXT: s_xor_b32 s6, s6, s13 +; VI-NEXT: s_and_b32 s16, s5, 0x400 +; VI-NEXT: s_mul_i32 s15, s4, s15 +; VI-NEXT: s_xor_b32 s6, s6, s14 +; VI-NEXT: s_and_b32 s17, s5, 0x800 +; VI-NEXT: s_mul_i32 s16, s4, s16 +; VI-NEXT: s_xor_b32 s6, s6, s15 +; VI-NEXT: s_and_b32 s18, s5, 0x1000 +; VI-NEXT: s_mul_i32 s17, s4, s17 +; VI-NEXT: s_xor_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s19, s5, 0x2000 +; VI-NEXT: s_mul_i32 s18, s4, s18 +; VI-NEXT: s_xor_b32 s6, s6, s17 +; VI-NEXT: s_and_b32 s20, s5, 0x4000 +; VI-NEXT: s_mul_i32 s19, s4, s19 +; VI-NEXT: s_xor_b32 s6, s6, s18 +; VI-NEXT: s_and_b32 s21, s5, 0x8000 +; VI-NEXT: s_mul_i32 s20, s4, s20 +; VI-NEXT: s_xor_b32 s6, s6, s19 +; VI-NEXT: s_and_b32 s22, s5, 0x10000 +; VI-NEXT: s_mul_i32 s21, s4, s21 +; VI-NEXT: s_xor_b32 s6, s6, s20 +; VI-NEXT: s_and_b32 s23, s5, 0x20000 +; VI-NEXT: s_mul_i32 s22, s4, s22 +; VI-NEXT: s_xor_b32 s6, s6, s21 +; VI-NEXT: s_and_b32 s24, s5, 0x40000 +; VI-NEXT: s_mul_i32 s23, s4, s23 +; VI-NEXT: s_xor_b32 s6, s6, s22 +; VI-NEXT: s_and_b32 s25, s5, 0x80000 +; VI-NEXT: s_mul_i32 s24, s4, s24 +; VI-NEXT: s_xor_b32 s6, s6, s23 +; VI-NEXT: s_and_b32 s26, s5, 0x100000 +; VI-NEXT: s_mul_i32 s25, s4, s25 +; VI-NEXT: s_xor_b32 s6, s6, s24 +; VI-NEXT: s_and_b32 s27, s5, 0x200000 +; VI-NEXT: s_mul_i32 s26, s4, s26 +; VI-NEXT: s_xor_b32 s6, s6, s25 +; VI-NEXT: s_and_b32 s28, s5, 0x400000 +; VI-NEXT: s_mul_i32 s27, s4, s27 +; VI-NEXT: s_xor_b32 s6, s6, s26 +; VI-NEXT: s_and_b32 s29, s5, 0x800000 +; VI-NEXT: s_mul_i32 s28, s4, s28 +; VI-NEXT: s_xor_b32 s6, s6, s27 +; VI-NEXT: s_and_b32 s30, s5, 0x1000000 +; VI-NEXT: s_mul_i32 s29, s4, s29 +; VI-NEXT: s_xor_b32 s6, s6, s28 +; VI-NEXT: s_and_b32 s31, s5, 0x2000000 +; VI-NEXT: s_mul_i32 s30, s4, s30 +; VI-NEXT: s_xor_b32 s6, s6, s29 +; VI-NEXT: s_and_b32 s33, s5, 0x4000000 +; VI-NEXT: s_mul_i32 s31, s4, s31 +; VI-NEXT: s_xor_b32 s6, s6, s30 +; VI-NEXT: s_and_b32 s34, s5, 0x8000000 +; VI-NEXT: s_mul_i32 s33, s4, s33 +; VI-NEXT: s_xor_b32 s6, s6, s31 +; VI-NEXT: s_and_b32 s35, s5, 0x10000000 +; VI-NEXT: s_mul_i32 s34, s4, s34 +; VI-NEXT: s_xor_b32 s6, s6, s33 +; VI-NEXT: s_and_b32 s36, s5, 0x20000000 +; VI-NEXT: s_mul_i32 s35, s4, s35 +; VI-NEXT: s_xor_b32 s6, s6, s34 +; VI-NEXT: s_and_b32 s37, s5, 2.0 +; VI-NEXT: s_mul_i32 s36, s4, s36 +; VI-NEXT: s_xor_b32 s6, s6, s35 +; VI-NEXT: s_and_b32 s5, s5, 0x80000000 +; VI-NEXT: s_mul_i32 s37, s4, s37 +; VI-NEXT: s_xor_b32 s6, s6, s36 +; VI-NEXT: s_xor_b32 s6, s6, s37 +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_xor_b32 s4, s6, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_clmul_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_and_b32 s6, s5, 2 +; GFX9-NEXT: s_and_b32 s7, s5, 1 +; GFX9-NEXT: s_and_b32 s8, s5, 4 +; GFX9-NEXT: s_mul_i32 s6, s4, s6 +; GFX9-NEXT: s_mul_i32 s7, s4, s7 +; GFX9-NEXT: s_and_b32 s9, s5, 8 +; GFX9-NEXT: s_mul_i32 s8, s4, s8 +; GFX9-NEXT: s_xor_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s10, s5, 16 +; GFX9-NEXT: s_mul_i32 s9, s4, s9 +; GFX9-NEXT: s_xor_b32 s6, s6, s8 +; GFX9-NEXT: s_and_b32 s11, s5, 32 +; GFX9-NEXT: s_mul_i32 s10, s4, s10 +; GFX9-NEXT: s_xor_b32 s6, s6, s9 +; GFX9-NEXT: s_and_b32 s12, s5, 64 +; GFX9-NEXT: s_mul_i32 s11, s4, s11 +; GFX9-NEXT: s_xor_b32 s6, s6, s10 +; GFX9-NEXT: s_and_b32 s13, s5, 0x80 +; GFX9-NEXT: s_mul_i32 s12, s4, s12 +; GFX9-NEXT: s_xor_b32 s6, s6, s11 +; GFX9-NEXT: s_and_b32 s14, s5, 0x100 +; GFX9-NEXT: s_mul_i32 s13, s4, s13 +; GFX9-NEXT: s_xor_b32 s6, s6, s12 +; GFX9-NEXT: s_and_b32 s15, s5, 0x200 +; GFX9-NEXT: s_mul_i32 s14, s4, s14 +; GFX9-NEXT: s_xor_b32 s6, s6, s13 +; GFX9-NEXT: s_and_b32 s16, s5, 0x400 +; GFX9-NEXT: s_mul_i32 s15, s4, s15 +; GFX9-NEXT: s_xor_b32 s6, s6, s14 +; GFX9-NEXT: s_and_b32 s17, s5, 0x800 +; GFX9-NEXT: s_mul_i32 s16, s4, s16 +; GFX9-NEXT: s_xor_b32 s6, s6, s15 +; GFX9-NEXT: s_and_b32 s18, s5, 0x1000 +; GFX9-NEXT: s_mul_i32 s17, s4, s17 +; GFX9-NEXT: s_xor_b32 s6, s6, s16 +; GFX9-NEXT: s_and_b32 s19, s5, 0x2000 +; GFX9-NEXT: s_mul_i32 s18, s4, s18 +; GFX9-NEXT: s_xor_b32 s6, s6, s17 +; GFX9-NEXT: s_and_b32 s20, s5, 0x4000 +; GFX9-NEXT: s_mul_i32 s19, s4, s19 +; GFX9-NEXT: s_xor_b32 s6, s6, s18 +; GFX9-NEXT: s_and_b32 s21, s5, 0x8000 +; GFX9-NEXT: s_mul_i32 s20, s4, s20 +; GFX9-NEXT: s_xor_b32 s6, s6, s19 +; GFX9-NEXT: s_and_b32 s22, s5, 0x10000 +; GFX9-NEXT: s_mul_i32 s21, s4, s21 +; GFX9-NEXT: s_xor_b32 s6, s6, s20 +; GFX9-NEXT: s_and_b32 s23, s5, 0x20000 +; GFX9-NEXT: s_mul_i32 s22, s4, s22 +; GFX9-NEXT: s_xor_b32 s6, s6, s21 +; GFX9-NEXT: s_and_b32 s24, s5, 0x40000 +; GFX9-NEXT: s_mul_i32 s23, s4, s23 +; GFX9-NEXT: s_xor_b32 s6, s6, s22 +; GFX9-NEXT: s_and_b32 s25, s5, 0x80000 +; GFX9-NEXT: s_mul_i32 s24, s4, s24 +; GFX9-NEXT: s_xor_b32 s6, s6, s23 +; GFX9-NEXT: s_and_b32 s26, s5, 0x100000 +; GFX9-NEXT: s_mul_i32 s25, s4, s25 +; GFX9-NEXT: s_xor_b32 s6, s6, s24 +; GFX9-NEXT: s_and_b32 s27, s5, 0x200000 +; GFX9-NEXT: s_mul_i32 s26, s4, s26 +; GFX9-NEXT: s_xor_b32 s6, s6, s25 +; GFX9-NEXT: s_and_b32 s28, s5, 0x400000 +; GFX9-NEXT: s_mul_i32 s27, s4, s27 +; GFX9-NEXT: s_xor_b32 s6, s6, s26 +; GFX9-NEXT: s_and_b32 s29, s5, 0x800000 +; GFX9-NEXT: s_mul_i32 s28, s4, s28 +; GFX9-NEXT: s_xor_b32 s6, s6, s27 +; GFX9-NEXT: s_and_b32 s30, s5, 0x1000000 +; GFX9-NEXT: s_mul_i32 s29, s4, s29 +; GFX9-NEXT: s_xor_b32 s6, s6, s28 +; GFX9-NEXT: s_and_b32 s31, s5, 0x2000000 +; GFX9-NEXT: s_mul_i32 s30, s4, s30 +; GFX9-NEXT: s_xor_b32 s6, s6, s29 +; GFX9-NEXT: s_and_b32 s33, s5, 0x4000000 +; GFX9-NEXT: s_mul_i32 s31, s4, s31 +; GFX9-NEXT: s_xor_b32 s6, s6, s30 +; GFX9-NEXT: s_and_b32 s34, s5, 0x8000000 +; GFX9-NEXT: s_mul_i32 s33, s4, s33 +; GFX9-NEXT: s_xor_b32 s6, s6, s31 +; GFX9-NEXT: s_and_b32 s35, s5, 0x10000000 +; GFX9-NEXT: s_mul_i32 s34, s4, s34 +; GFX9-NEXT: s_xor_b32 s6, s6, s33 +; GFX9-NEXT: s_and_b32 s36, s5, 0x20000000 +; GFX9-NEXT: s_mul_i32 s35, s4, s35 +; GFX9-NEXT: s_xor_b32 s6, s6, s34 +; GFX9-NEXT: s_and_b32 s37, s5, 2.0 +; GFX9-NEXT: s_mul_i32 s36, s4, s36 +; GFX9-NEXT: s_xor_b32 s6, s6, s35 +; GFX9-NEXT: s_and_b32 s5, s5, 0x80000000 +; GFX9-NEXT: s_mul_i32 s37, s4, s37 +; GFX9-NEXT: s_xor_b32 s6, s6, s36 +; GFX9-NEXT: s_xor_b32 s6, s6, s37 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_xor_b32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: test_clmul_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-NEXT: v_readfirstlane_b32 s3, v0 +; GFX10-NEXT: s_and_b32 s4, s2, 2 +; GFX10-NEXT: s_and_b32 s5, s2, 1 +; GFX10-NEXT: s_and_b32 s8, s2, 4 +; GFX10-NEXT: s_mul_i32 s4, s3, s4 +; GFX10-NEXT: s_mul_i32 s5, s3, s5 +; GFX10-NEXT: s_and_b32 s9, s2, 8 +; GFX10-NEXT: s_mul_i32 s8, s3, s8 +; GFX10-NEXT: s_xor_b32 s4, s5, s4 +; GFX10-NEXT: s_and_b32 s10, s2, 16 +; GFX10-NEXT: s_mul_i32 s5, s3, s9 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s11, s2, 32 +; GFX10-NEXT: s_mul_i32 s8, s3, s10 +; GFX10-NEXT: s_xor_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s12, s2, 64 +; GFX10-NEXT: s_mul_i32 s5, s3, s11 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s13, s2, 0x80 +; GFX10-NEXT: s_mul_i32 s8, s3, s12 +; GFX10-NEXT: s_xor_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s14, s2, 0x100 +; GFX10-NEXT: s_mul_i32 s5, s3, s13 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s15, s2, 0x200 +; GFX10-NEXT: s_mul_i32 s8, s3, s14 +; GFX10-NEXT: s_xor_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s16, s2, 0x400 +; GFX10-NEXT: s_mul_i32 s5, s3, s15 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s17, s2, 0x800 +; GFX10-NEXT: s_mul_i32 s8, s3, s16 +; GFX10-NEXT: s_xor_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s18, s2, 0x1000 +; GFX10-NEXT: s_mul_i32 s5, s3, s17 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s19, s2, 0x2000 +; GFX10-NEXT: s_mul_i32 s8, s3, s18 +; GFX10-NEXT: s_xor_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s20, s2, 0x4000 +; GFX10-NEXT: s_mul_i32 s5, s3, s19 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s21, s2, 0x8000 +; GFX10-NEXT: s_mul_i32 s8, s3, s20 +; GFX10-NEXT: s_xor_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s22, s2, 0x10000 +; GFX10-NEXT: s_mul_i32 s5, s3, s21 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s23, s2, 0x20000 +; GFX10-NEXT: s_mul_i32 s8, s3, s22 +; GFX10-NEXT: s_xor_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s24, s2, 0x40000 +; GFX10-NEXT: s_mul_i32 s5, s3, s23 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s25, s2, 0x80000 +; GFX10-NEXT: s_mul_i32 s8, s3, s24 +; GFX10-NEXT: s_xor_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s26, s2, 0x100000 +; GFX10-NEXT: s_mul_i32 s5, s3, s25 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s27, s2, 0x200000 +; GFX10-NEXT: s_mul_i32 s8, s3, s26 +; GFX10-NEXT: s_xor_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s28, s2, 0x400000 +; GFX10-NEXT: s_mul_i32 s5, s3, s27 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s29, s2, 0x800000 +; GFX10-NEXT: s_mul_i32 s8, s3, s28 +; GFX10-NEXT: s_xor_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s30, s2, 0x1000000 +; GFX10-NEXT: s_mul_i32 s5, s3, s29 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s31, s2, 0x2000000 +; GFX10-NEXT: s_mul_i32 s8, s3, s30 +; GFX10-NEXT: s_xor_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s33, s2, 0x4000000 +; GFX10-NEXT: s_mul_i32 s5, s3, s31 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s34, s2, 0x8000000 +; GFX10-NEXT: s_mul_i32 s8, s3, s33 +; GFX10-NEXT: s_xor_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s35, s2, 0x10000000 +; GFX10-NEXT: s_mul_i32 s5, s3, s34 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s36, s2, 0x20000000 +; GFX10-NEXT: s_mul_i32 s8, s3, s35 +; GFX10-NEXT: s_xor_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s37, s2, 2.0 +; GFX10-NEXT: s_mul_i32 s5, s3, s36 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_mul_i32 s8, s3, s37 +; GFX10-NEXT: s_xor_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX10-NEXT: s_xor_b32 s4, s4, s8 +; GFX10-NEXT: s_mul_i32 s3, s3, s2 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_xor_b32 s2, s4, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_clmul_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-NEXT: s_and_b32 s4, s2, 2 +; GFX11-NEXT: s_and_b32 s5, s2, 1 +; GFX11-NEXT: s_and_b32 s8, s2, 4 +; GFX11-NEXT: s_mul_i32 s4, s3, s4 +; GFX11-NEXT: s_mul_i32 s5, s3, s5 +; GFX11-NEXT: s_and_b32 s9, s2, 8 +; GFX11-NEXT: s_mul_i32 s8, s3, s8 +; GFX11-NEXT: s_xor_b32 s4, s5, s4 +; GFX11-NEXT: s_and_b32 s10, s2, 16 +; GFX11-NEXT: s_mul_i32 s5, s3, s9 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_and_b32 s11, s2, 32 +; GFX11-NEXT: s_mul_i32 s8, s3, s10 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s12, s2, 64 +; GFX11-NEXT: s_mul_i32 s5, s3, s11 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_and_b32 s13, s2, 0x80 +; GFX11-NEXT: s_mul_i32 s8, s3, s12 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s14, s2, 0x100 +; GFX11-NEXT: s_mul_i32 s5, s3, s13 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_and_b32 s15, s2, 0x200 +; GFX11-NEXT: s_mul_i32 s8, s3, s14 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s16, s2, 0x400 +; GFX11-NEXT: s_mul_i32 s5, s3, s15 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_and_b32 s17, s2, 0x800 +; GFX11-NEXT: s_mul_i32 s8, s3, s16 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s18, s2, 0x1000 +; GFX11-NEXT: s_mul_i32 s5, s3, s17 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_and_b32 s19, s2, 0x2000 +; GFX11-NEXT: s_mul_i32 s8, s3, s18 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s20, s2, 0x4000 +; GFX11-NEXT: s_mul_i32 s5, s3, s19 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_and_b32 s21, s2, 0x8000 +; GFX11-NEXT: s_mul_i32 s8, s3, s20 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s22, s2, 0x10000 +; GFX11-NEXT: s_mul_i32 s5, s3, s21 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_and_b32 s23, s2, 0x20000 +; GFX11-NEXT: s_mul_i32 s8, s3, s22 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s24, s2, 0x40000 +; GFX11-NEXT: s_mul_i32 s5, s3, s23 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_and_b32 s25, s2, 0x80000 +; GFX11-NEXT: s_mul_i32 s8, s3, s24 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s26, s2, 0x100000 +; GFX11-NEXT: s_mul_i32 s5, s3, s25 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_and_b32 s27, s2, 0x200000 +; GFX11-NEXT: s_mul_i32 s8, s3, s26 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s28, s2, 0x400000 +; GFX11-NEXT: s_mul_i32 s5, s3, s27 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_and_b32 s29, s2, 0x800000 +; GFX11-NEXT: s_mul_i32 s8, s3, s28 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s30, s2, 0x1000000 +; GFX11-NEXT: s_mul_i32 s5, s3, s29 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_and_b32 s31, s2, 0x2000000 +; GFX11-NEXT: s_mul_i32 s8, s3, s30 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s33, s2, 0x4000000 +; GFX11-NEXT: s_mul_i32 s5, s3, s31 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_and_b32 s34, s2, 0x8000000 +; GFX11-NEXT: s_mul_i32 s8, s3, s33 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s35, s2, 0x10000000 +; GFX11-NEXT: s_mul_i32 s5, s3, s34 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_and_b32 s36, s2, 0x20000000 +; GFX11-NEXT: s_mul_i32 s8, s3, s35 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s37, s2, 2.0 +; GFX11-NEXT: s_mul_i32 s5, s3, s36 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_mul_i32 s8, s3, s37 +; GFX11-NEXT: s_xor_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_xor_b32 s4, s4, s8 +; GFX11-NEXT: s_mul_i32 s3, s3, s2 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_xor_b32 s2, s4, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_clmul_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12-NEXT: s_and_b32 s4, s2, 2 +; GFX12-NEXT: s_and_b32 s5, s2, 1 +; GFX12-NEXT: s_and_b32 s8, s2, 4 +; GFX12-NEXT: s_mul_i32 s4, s3, s4 +; GFX12-NEXT: s_mul_i32 s5, s3, s5 +; GFX12-NEXT: s_and_b32 s9, s2, 8 +; GFX12-NEXT: s_mul_i32 s8, s3, s8 +; GFX12-NEXT: s_xor_b32 s4, s5, s4 +; GFX12-NEXT: s_and_b32 s10, s2, 16 +; GFX12-NEXT: s_mul_i32 s5, s3, s9 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_and_b32 s11, s2, 32 +; GFX12-NEXT: s_mul_i32 s8, s3, s10 +; GFX12-NEXT: s_xor_b32 s4, s4, s5 +; GFX12-NEXT: s_and_b32 s12, s2, 64 +; GFX12-NEXT: s_mul_i32 s5, s3, s11 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_and_b32 s13, s2, 0x80 +; GFX12-NEXT: s_mul_i32 s8, s3, s12 +; GFX12-NEXT: s_xor_b32 s4, s4, s5 +; GFX12-NEXT: s_and_b32 s14, s2, 0x100 +; GFX12-NEXT: s_mul_i32 s5, s3, s13 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_and_b32 s15, s2, 0x200 +; GFX12-NEXT: s_mul_i32 s8, s3, s14 +; GFX12-NEXT: s_xor_b32 s4, s4, s5 +; GFX12-NEXT: s_and_b32 s16, s2, 0x400 +; GFX12-NEXT: s_mul_i32 s5, s3, s15 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_and_b32 s17, s2, 0x800 +; GFX12-NEXT: s_mul_i32 s8, s3, s16 +; GFX12-NEXT: s_xor_b32 s4, s4, s5 +; GFX12-NEXT: s_and_b32 s18, s2, 0x1000 +; GFX12-NEXT: s_mul_i32 s5, s3, s17 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_and_b32 s19, s2, 0x2000 +; GFX12-NEXT: s_mul_i32 s8, s3, s18 +; GFX12-NEXT: s_xor_b32 s4, s4, s5 +; GFX12-NEXT: s_and_b32 s20, s2, 0x4000 +; GFX12-NEXT: s_mul_i32 s5, s3, s19 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_and_b32 s21, s2, 0x8000 +; GFX12-NEXT: s_mul_i32 s8, s3, s20 +; GFX12-NEXT: s_xor_b32 s4, s4, s5 +; GFX12-NEXT: s_and_b32 s22, s2, 0x10000 +; GFX12-NEXT: s_mul_i32 s5, s3, s21 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_and_b32 s23, s2, 0x20000 +; GFX12-NEXT: s_mul_i32 s8, s3, s22 +; GFX12-NEXT: s_xor_b32 s4, s4, s5 +; GFX12-NEXT: s_and_b32 s24, s2, 0x40000 +; GFX12-NEXT: s_mul_i32 s5, s3, s23 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_and_b32 s25, s2, 0x80000 +; GFX12-NEXT: s_mul_i32 s8, s3, s24 +; GFX12-NEXT: s_xor_b32 s4, s4, s5 +; GFX12-NEXT: s_and_b32 s26, s2, 0x100000 +; GFX12-NEXT: s_mul_i32 s5, s3, s25 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_and_b32 s27, s2, 0x200000 +; GFX12-NEXT: s_mul_i32 s8, s3, s26 +; GFX12-NEXT: s_xor_b32 s4, s4, s5 +; GFX12-NEXT: s_and_b32 s28, s2, 0x400000 +; GFX12-NEXT: s_mul_i32 s5, s3, s27 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_and_b32 s29, s2, 0x800000 +; GFX12-NEXT: s_mul_i32 s8, s3, s28 +; GFX12-NEXT: s_xor_b32 s4, s4, s5 +; GFX12-NEXT: s_and_b32 s30, s2, 0x1000000 +; GFX12-NEXT: s_mul_i32 s5, s3, s29 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_and_b32 s31, s2, 0x2000000 +; GFX12-NEXT: s_mul_i32 s8, s3, s30 +; GFX12-NEXT: s_xor_b32 s4, s4, s5 +; GFX12-NEXT: s_and_b32 s33, s2, 0x4000000 +; GFX12-NEXT: s_mul_i32 s5, s3, s31 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_and_b32 s34, s2, 0x8000000 +; GFX12-NEXT: s_mul_i32 s8, s3, s33 +; GFX12-NEXT: s_xor_b32 s4, s4, s5 +; GFX12-NEXT: s_and_b32 s35, s2, 0x10000000 +; GFX12-NEXT: s_mul_i32 s5, s3, s34 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_and_b32 s36, s2, 0x20000000 +; GFX12-NEXT: s_mul_i32 s8, s3, s35 +; GFX12-NEXT: s_xor_b32 s4, s4, s5 +; GFX12-NEXT: s_and_b32 s37, s2, 2.0 +; GFX12-NEXT: s_mul_i32 s5, s3, s36 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_mul_i32 s8, s3, s37 +; GFX12-NEXT: s_xor_b32 s4, s4, s5 +; GFX12-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX12-NEXT: s_xor_b32 s4, s4, s8 +; GFX12-NEXT: s_mul_i32 s3, s3, s2 +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_xor_b32 s2, s4, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: test_clmul_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s10, s6 +; GFX1250-NEXT: s_mov_b32 s11, s7 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s8, s2 +; GFX1250-NEXT: s_mov_b32 s9, s3 +; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1250-NEXT: s_and_b32 s4, s2, 2 +; GFX1250-NEXT: s_and_b32 s5, s2, 1 +; GFX1250-NEXT: s_and_b32 s8, s2, 4 +; GFX1250-NEXT: s_mul_i32 s4, s3, s4 +; GFX1250-NEXT: s_mul_i32 s5, s3, s5 +; GFX1250-NEXT: s_and_b32 s9, s2, 8 +; GFX1250-NEXT: s_mul_i32 s8, s3, s8 +; GFX1250-NEXT: s_xor_b32 s4, s5, s4 +; GFX1250-NEXT: s_and_b32 s10, s2, 16 +; GFX1250-NEXT: s_mul_i32 s5, s3, s9 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_and_b32 s11, s2, 32 +; GFX1250-NEXT: s_mul_i32 s8, s3, s10 +; GFX1250-NEXT: s_xor_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s12, s2, 64 +; GFX1250-NEXT: s_mul_i32 s5, s3, s11 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_and_b32 s13, s2, 0x80 +; GFX1250-NEXT: s_mul_i32 s8, s3, s12 +; GFX1250-NEXT: s_xor_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s14, s2, 0x100 +; GFX1250-NEXT: s_mul_i32 s5, s3, s13 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_and_b32 s15, s2, 0x200 +; GFX1250-NEXT: s_mul_i32 s8, s3, s14 +; GFX1250-NEXT: s_xor_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s16, s2, 0x400 +; GFX1250-NEXT: s_mul_i32 s5, s3, s15 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_and_b32 s17, s2, 0x800 +; GFX1250-NEXT: s_mul_i32 s8, s3, s16 +; GFX1250-NEXT: s_xor_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s18, s2, 0x1000 +; GFX1250-NEXT: s_mul_i32 s5, s3, s17 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_and_b32 s19, s2, 0x2000 +; GFX1250-NEXT: s_mul_i32 s8, s3, s18 +; GFX1250-NEXT: s_xor_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s20, s2, 0x4000 +; GFX1250-NEXT: s_mul_i32 s5, s3, s19 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_and_b32 s21, s2, 0x8000 +; GFX1250-NEXT: s_mul_i32 s8, s3, s20 +; GFX1250-NEXT: s_xor_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s22, s2, 0x10000 +; GFX1250-NEXT: s_mul_i32 s5, s3, s21 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_and_b32 s23, s2, 0x20000 +; GFX1250-NEXT: s_mul_i32 s8, s3, s22 +; GFX1250-NEXT: s_xor_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s24, s2, 0x40000 +; GFX1250-NEXT: s_mul_i32 s5, s3, s23 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_and_b32 s25, s2, 0x80000 +; GFX1250-NEXT: s_mul_i32 s8, s3, s24 +; GFX1250-NEXT: s_xor_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s26, s2, 0x100000 +; GFX1250-NEXT: s_mul_i32 s5, s3, s25 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_and_b32 s27, s2, 0x200000 +; GFX1250-NEXT: s_mul_i32 s8, s3, s26 +; GFX1250-NEXT: s_xor_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s28, s2, 0x400000 +; GFX1250-NEXT: s_mul_i32 s5, s3, s27 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_and_b32 s29, s2, 0x800000 +; GFX1250-NEXT: s_mul_i32 s8, s3, s28 +; GFX1250-NEXT: s_xor_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s30, s2, 0x1000000 +; GFX1250-NEXT: s_mul_i32 s5, s3, s29 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_and_b32 s31, s2, 0x2000000 +; GFX1250-NEXT: s_mul_i32 s8, s3, s30 +; GFX1250-NEXT: s_xor_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s33, s2, 0x4000000 +; GFX1250-NEXT: s_mul_i32 s5, s3, s31 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_and_b32 s34, s2, 0x8000000 +; GFX1250-NEXT: s_mul_i32 s8, s3, s33 +; GFX1250-NEXT: s_xor_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s35, s2, 0x10000000 +; GFX1250-NEXT: s_mul_i32 s5, s3, s34 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_and_b32 s36, s2, 0x20000000 +; GFX1250-NEXT: s_mul_i32 s8, s3, s35 +; GFX1250-NEXT: s_xor_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s37, s2, 2.0 +; GFX1250-NEXT: s_mul_i32 s5, s3, s36 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_mul_i32 s8, s3, s37 +; GFX1250-NEXT: s_xor_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX1250-NEXT: s_xor_b32 s4, s4, s8 +; GFX1250-NEXT: s_mul_i32 s3, s3, s2 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_xor_b32 s2, s4, s3 +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX1250-NEXT: s_endpgm +; +; EG-LABEL: test_clmul_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 116, @9, KC0[], KC1[] +; EG-NEXT: ALU 10, @126, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT T0.W, T0.Y, 1, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: MULLO_INT * T1.X, T0.X, T1.W, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T1.W, T0.Z, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.W, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 256(3.587324e-43), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 512(7.174648e-43), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 1024(1.434930e-42), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 2048(2.869859e-42), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 4096(5.739719e-42), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 8192(1.147944e-41), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 16384(2.295887e-41), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 32768(4.591775e-41), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 65536(9.183550e-41), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 131072(1.836710e-40), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 262144(3.673420e-40), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 524288(7.346840e-40), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 1048576(1.469368e-39), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 2097152(2.938736e-39), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 4194304(5.877472e-39), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 16777216(2.350989e-38), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 33554432(9.403955e-38), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 67108864(1.504633e-36), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 134217728(3.851860e-34), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 268435456(2.524355e-29), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Z, T0.X, PV.Z, +; EG-NEXT: 536870912(1.084202e-19), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.Z, T0.Y, literal.x, +; EG-NEXT: 1073741824(2.000000e+00), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 126: +; EG-NEXT: XOR_INT T0.W, T0.W, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: MULLO_INT * T0.Z, T0.X, T1.Z, +; EG-NEXT: AND_INT T1.Z, T0.Y, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.Y, T0.X, T2.Z, +; EG-NEXT: -2147483648(-0.000000e+00), 0(0.000000e+00) +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.X, PV.Z, +; EG-NEXT: XOR_INT T0.X, PV.W, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 + %a = load i32, ptr addrspace(1) %in + %b = load i32, ptr addrspace(1) %b_ptr + %res = call i32 @llvm.clmul.i32(i32 %a, i32 %b) + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_clmulr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: test_clmulr_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s7, 0 +; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s17, s7 +; SI-NEXT: s_mov_b32 s19, s7 +; SI-NEXT: s_mov_b32 s23, s7 +; SI-NEXT: s_mov_b32 s25, s7 +; SI-NEXT: s_mov_b32 s27, s7 +; SI-NEXT: s_mov_b32 s29, s7 +; SI-NEXT: s_mov_b32 s31, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s37, s7 +; SI-NEXT: s_mov_b32 s39, s7 +; SI-NEXT: s_mov_b32 s41, s7 +; SI-NEXT: s_mov_b32 s43, s7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s33, v1 +; SI-NEXT: s_and_b32 s20, s33, 2 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_bfe_i32 s8, s33, 0x10000 +; SI-NEXT: v_cmp_eq_u64_e64 s[20:21], s[20:21], 0 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 +; SI-NEXT: s_and_b32 s8, s8, s6 +; SI-NEXT: s_and_b64 s[20:21], s[20:21], exec +; SI-NEXT: s_cselect_b32 s21, 0, s5 +; SI-NEXT: s_cselect_b32 s20, 0, s4 +; SI-NEXT: s_and_b32 s14, s33, 4 +; SI-NEXT: s_xor_b64 s[20:21], s[8:9], s[20:21] +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 2 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s15, 0, s15 +; SI-NEXT: s_cselect_b32 s14, 0, s14 +; SI-NEXT: s_and_b32 s10, s33, 8 +; SI-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0 +; SI-NEXT: s_xor_b64 s[14:15], s[20:21], s[14:15] +; SI-NEXT: s_lshl_b64 s[20:21], s[6:7], 3 +; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec +; SI-NEXT: s_cselect_b32 s11, 0, s21 +; SI-NEXT: s_cselect_b32 s10, 0, s20 +; SI-NEXT: s_and_b32 s12, s33, 16 +; SI-NEXT: v_cmp_eq_u64_e64 s[12:13], s[12:13], 0 +; SI-NEXT: s_lshl_b64 s[20:21], s[6:7], 4 +; SI-NEXT: s_xor_b64 s[10:11], s[14:15], s[10:11] +; SI-NEXT: s_and_b64 s[12:13], s[12:13], exec +; SI-NEXT: s_cselect_b32 s13, 0, s21 +; SI-NEXT: s_cselect_b32 s12, 0, s20 +; SI-NEXT: s_and_b32 s16, s33, 32 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[16:17], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 5 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s18, s33, 64 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[18:19], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 6 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s22, s33, 0x80 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[22:23], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 7 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s24, s33, 0x100 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[24:25], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 8 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s26, s33, 0x200 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[26:27], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 9 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s28, s33, 0x400 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[28:29], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 10 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s30, s33, 0x800 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[30:31], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 11 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s34, s33, 0x1000 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[34:35], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 12 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s36, s33, 0x2000 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[36:37], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 13 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s38, s33, 0x4000 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[38:39], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 14 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s40, s33, 0x8000 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[40:41], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 15 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s42, s33, 0x10000 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[42:43], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 16 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s4, s33, 0x20000 +; SI-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 17 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_cselect_b32 s5, 0, s15 +; SI-NEXT: s_cselect_b32 s4, 0, s14 +; SI-NEXT: s_and_b32 s8, s33, 0x40000 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[12:13], s[6:7], 18 +; SI-NEXT: s_xor_b64 s[4:5], s[10:11], s[4:5] +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s13 +; SI-NEXT: s_cselect_b32 s8, 0, s12 +; SI-NEXT: s_and_b32 s10, s33, 0x80000 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[10:11], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 19 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x100000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 20 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x200000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 21 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x400000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 22 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x800000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 23 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x1000000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 24 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x2000000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 25 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x4000000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 26 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x8000000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 27 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x10000000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 28 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x20000000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 29 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 2.0 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 30 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 31 +; SI-NEXT: s_cmp_gt_i32 s33, -1 +; SI-NEXT: s_cselect_b32 s7, 0, s7 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_clmulr_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s7, 0 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s17, s7 +; VI-NEXT: s_mov_b32 s19, s7 +; VI-NEXT: s_mov_b32 s21, s7 +; VI-NEXT: s_mov_b32 s23, s7 +; VI-NEXT: s_mov_b32 s25, s7 +; VI-NEXT: s_mov_b32 s27, s7 +; VI-NEXT: s_mov_b32 s29, s7 +; VI-NEXT: s_mov_b32 s31, s7 +; VI-NEXT: s_mov_b32 s35, s7 +; VI-NEXT: s_mov_b32 s37, s7 +; VI-NEXT: s_mov_b32 s39, s7 +; VI-NEXT: s_mov_b32 s41, s7 +; VI-NEXT: s_mov_b32 s43, s7 +; VI-NEXT: s_mov_b32 s45, s7 +; VI-NEXT: s_mov_b32 s47, s7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_bfe_i32 s5, s4, 0x10000 +; VI-NEXT: s_lshl_b64 s[48:49], s[6:7], 1 +; VI-NEXT: s_and_b32 s10, s4, 2 +; VI-NEXT: s_and_b32 s8, s5, s6 +; VI-NEXT: s_cmp_eq_u64 s[10:11], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s49 +; VI-NEXT: s_cselect_b32 s10, 0, s48 +; VI-NEXT: s_lshl_b64 s[48:49], s[6:7], 2 +; VI-NEXT: s_and_b32 s12, s4, 4 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s49 +; VI-NEXT: s_cselect_b32 s10, 0, s48 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 3 +; VI-NEXT: s_and_b32 s14, s4, 8 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[14:15], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 4 +; VI-NEXT: s_and_b32 s16, s4, 16 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[16:17], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 5 +; VI-NEXT: s_and_b32 s18, s4, 32 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[18:19], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 6 +; VI-NEXT: s_and_b32 s20, s4, 64 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[20:21], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 7 +; VI-NEXT: s_and_b32 s22, s4, 0x80 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[22:23], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 8 +; VI-NEXT: s_and_b32 s24, s4, 0x100 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[24:25], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 9 +; VI-NEXT: s_and_b32 s26, s4, 0x200 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[26:27], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 10 +; VI-NEXT: s_and_b32 s28, s4, 0x400 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[28:29], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 11 +; VI-NEXT: s_and_b32 s30, s4, 0x800 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[30:31], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 12 +; VI-NEXT: s_and_b32 s34, s4, 0x1000 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[34:35], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 13 +; VI-NEXT: s_and_b32 s36, s4, 0x2000 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[36:37], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 14 +; VI-NEXT: s_and_b32 s38, s4, 0x4000 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[38:39], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 15 +; VI-NEXT: s_and_b32 s40, s4, 0x8000 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[40:41], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 16 +; VI-NEXT: s_and_b32 s42, s4, 0x10000 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[42:43], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 17 +; VI-NEXT: s_and_b32 s44, s4, 0x20000 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[44:45], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 18 +; VI-NEXT: s_and_b32 s46, s4, 0x40000 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[46:47], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 19 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_and_b32 s10, s4, 0x80000 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_cmp_eq_u64 s[10:11], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 20 +; VI-NEXT: s_and_b32 s12, s4, 0x100000 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 21 +; VI-NEXT: s_and_b32 s12, s4, 0x200000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 22 +; VI-NEXT: s_and_b32 s12, s4, 0x400000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 23 +; VI-NEXT: s_and_b32 s12, s4, 0x800000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 24 +; VI-NEXT: s_and_b32 s12, s4, 0x1000000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 25 +; VI-NEXT: s_and_b32 s12, s4, 0x2000000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 26 +; VI-NEXT: s_and_b32 s12, s4, 0x4000000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 27 +; VI-NEXT: s_and_b32 s12, s4, 0x8000000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 28 +; VI-NEXT: s_and_b32 s12, s4, 0x10000000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 29 +; VI-NEXT: s_and_b32 s12, s4, 0x20000000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 30 +; VI-NEXT: s_and_b32 s12, s4, 2.0 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 31 +; VI-NEXT: s_cmp_gt_i32 s4, -1 +; VI-NEXT: s_cselect_b32 s5, 0, s7 +; VI-NEXT: s_cselect_b32 s4, 0, s6 +; VI-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 31 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_clmulr_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s5, 0 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s19, s5 +; GFX9-NEXT: s_mov_b32 s21, s5 +; GFX9-NEXT: s_mov_b32 s23, s5 +; GFX9-NEXT: s_mov_b32 s25, s5 +; GFX9-NEXT: s_mov_b32 s27, s5 +; GFX9-NEXT: s_mov_b32 s29, s5 +; GFX9-NEXT: s_mov_b32 s31, s5 +; GFX9-NEXT: s_mov_b32 s35, s5 +; GFX9-NEXT: s_mov_b32 s37, s5 +; GFX9-NEXT: s_mov_b32 s39, s5 +; GFX9-NEXT: s_mov_b32 s41, s5 +; GFX9-NEXT: s_mov_b32 s43, s5 +; GFX9-NEXT: s_mov_b32 s45, s5 +; GFX9-NEXT: s_mov_b32 s47, s5 +; GFX9-NEXT: s_mov_b32 s1, s9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_bfe_i32 s6, s8, 0x10000 +; GFX9-NEXT: s_lshl_b64 s[48:49], s[4:5], 1 +; GFX9-NEXT: s_and_b32 s10, s8, 2 +; GFX9-NEXT: s_and_b32 s6, s6, s4 +; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s49 +; GFX9-NEXT: s_cselect_b32 s10, 0, s48 +; GFX9-NEXT: s_lshl_b64 s[48:49], s[4:5], 2 +; GFX9-NEXT: s_and_b32 s12, s8, 4 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s49 +; GFX9-NEXT: s_cselect_b32 s10, 0, s48 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 3 +; GFX9-NEXT: s_and_b32 s14, s8, 8 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 4 +; GFX9-NEXT: s_and_b32 s16, s8, 16 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 +; GFX9-NEXT: s_and_b32 s18, s8, 32 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[18:19], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 6 +; GFX9-NEXT: s_and_b32 s20, s8, 64 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[20:21], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 7 +; GFX9-NEXT: s_and_b32 s22, s8, 0x80 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[22:23], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 8 +; GFX9-NEXT: s_and_b32 s24, s8, 0x100 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[24:25], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 9 +; GFX9-NEXT: s_and_b32 s26, s8, 0x200 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[26:27], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 10 +; GFX9-NEXT: s_and_b32 s28, s8, 0x400 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[28:29], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 11 +; GFX9-NEXT: s_and_b32 s30, s8, 0x800 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[30:31], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 12 +; GFX9-NEXT: s_and_b32 s34, s8, 0x1000 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[34:35], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 13 +; GFX9-NEXT: s_and_b32 s36, s8, 0x2000 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[36:37], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 14 +; GFX9-NEXT: s_and_b32 s38, s8, 0x4000 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[38:39], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 15 +; GFX9-NEXT: s_and_b32 s40, s8, 0x8000 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[40:41], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 16 +; GFX9-NEXT: s_and_b32 s42, s8, 0x10000 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[42:43], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 17 +; GFX9-NEXT: s_and_b32 s44, s8, 0x20000 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[44:45], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 18 +; GFX9-NEXT: s_and_b32 s46, s8, 0x40000 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[46:47], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 19 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_and_b32 s10, s8, 0x80000 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 20 +; GFX9-NEXT: s_and_b32 s12, s8, 0x100000 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 21 +; GFX9-NEXT: s_and_b32 s12, s8, 0x200000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 22 +; GFX9-NEXT: s_and_b32 s12, s8, 0x400000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 23 +; GFX9-NEXT: s_and_b32 s12, s8, 0x800000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 24 +; GFX9-NEXT: s_and_b32 s12, s8, 0x1000000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 25 +; GFX9-NEXT: s_and_b32 s12, s8, 0x2000000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 26 +; GFX9-NEXT: s_and_b32 s12, s8, 0x4000000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 27 +; GFX9-NEXT: s_and_b32 s12, s8, 0x8000000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 28 +; GFX9-NEXT: s_and_b32 s12, s8, 0x10000000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 29 +; GFX9-NEXT: s_and_b32 s12, s8, 0x20000000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 30 +; GFX9-NEXT: s_and_b32 s12, s8, 2.0 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 31 +; GFX9-NEXT: s_cmp_gt_i32 s8, -1 +; GFX9-NEXT: s_cselect_b32 s5, 0, s5 +; GFX9-NEXT: s_cselect_b32 s4, 0, s4 +; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5] +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: test_clmulr_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: s_bfe_i32 s5, s4, 0x10000 +; GFX10-NEXT: s_and_b32 s10, s4, 2 +; GFX10-NEXT: s_lshl_b64 s[12:13], s[2:3], 1 +; GFX10-NEXT: s_and_b32 s8, s5, s2 +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_cselect_b32 s13, 0, s13 +; GFX10-NEXT: s_cselect_b32 s12, 0, s12 +; GFX10-NEXT: s_and_b32 s10, s4, 4 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 2 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 8 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 3 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 16 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 4 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 32 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 5 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 64 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 6 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x80 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 7 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x100 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 8 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x200 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 9 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x400 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 10 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x800 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 11 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x1000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 12 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x2000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 13 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x4000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 14 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x8000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 15 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x10000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 16 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x20000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 17 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x40000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 18 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x80000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 19 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x100000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 20 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x200000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 21 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x400000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 22 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x800000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 23 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x1000000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 24 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x2000000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 25 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x4000000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 26 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x8000000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 27 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x10000000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 28 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x20000000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 29 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 2.0 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 30 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s11, 0, s15 +; GFX10-NEXT: s_cselect_b32 s10, 0, s14 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 31 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; GFX10-NEXT: s_cmp_gt_i32 s4, -1 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_cselect_b32 s3, 0, s3 +; GFX10-NEXT: s_cselect_b32 s2, 0, s2 +; GFX10-NEXT: s_xor_b64 s[2:3], s[8:9], s[2:3] +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_clmulr_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s11, s3 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-NEXT: s_bfe_i32 s5, s4, 0x10000 +; GFX11-NEXT: s_and_b32 s10, s4, 2 +; GFX11-NEXT: s_lshl_b64 s[12:13], s[2:3], 1 +; GFX11-NEXT: s_and_b32 s8, s5, s2 +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_cselect_b32 s13, 0, s13 +; GFX11-NEXT: s_cselect_b32 s12, 0, s12 +; GFX11-NEXT: s_and_b32 s10, s4, 4 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 2 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 8 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 3 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 16 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 4 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 32 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 5 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 64 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 6 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x80 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 7 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x100 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 8 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x200 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 9 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x400 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 10 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x800 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 11 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x1000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 12 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x2000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 13 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x4000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 14 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x8000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 15 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x10000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 16 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x20000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 17 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x40000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 18 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x80000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 19 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x100000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 20 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x200000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 21 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x400000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 22 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x800000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 23 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x1000000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 24 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x2000000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 25 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x4000000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 26 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x8000000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 27 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x10000000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 28 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x20000000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 29 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 2.0 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 30 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s11, 0, s15 +; GFX11-NEXT: s_cselect_b32 s10, 0, s14 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 31 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; GFX11-NEXT: s_cmp_gt_i32 s4, -1 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_cselect_b32 s3, 0, s3 +; GFX11-NEXT: s_cselect_b32 s2, 0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b64 s[2:3], s[8:9], s[2:3] +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_clmulr_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s3 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s11, s3 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s15, s3 +; GFX12-NEXT: s_mov_b32 s17, s3 +; GFX12-NEXT: s_mov_b32 s19, s3 +; GFX12-NEXT: s_mov_b32 s21, s3 +; GFX12-NEXT: s_mov_b32 s23, s3 +; GFX12-NEXT: s_mov_b32 s25, s3 +; GFX12-NEXT: s_mov_b32 s27, s3 +; GFX12-NEXT: s_mov_b32 s29, s3 +; GFX12-NEXT: s_mov_b32 s31, s3 +; GFX12-NEXT: s_mov_b32 s35, s3 +; GFX12-NEXT: s_mov_b32 s37, s3 +; GFX12-NEXT: s_mov_b32 s39, s3 +; GFX12-NEXT: s_mov_b32 s41, s3 +; GFX12-NEXT: s_mov_b32 s43, s3 +; GFX12-NEXT: s_mov_b32 s45, s3 +; GFX12-NEXT: s_mov_b32 s47, s3 +; GFX12-NEXT: s_mov_b32 s49, s3 +; GFX12-NEXT: s_mov_b32 s51, s3 +; GFX12-NEXT: s_mov_b32 s53, s3 +; GFX12-NEXT: s_mov_b32 s55, s3 +; GFX12-NEXT: s_mov_b32 s57, s3 +; GFX12-NEXT: s_mov_b32 s59, s3 +; GFX12-NEXT: s_mov_b32 s61, s3 +; GFX12-NEXT: s_mov_b32 s63, s3 +; GFX12-NEXT: s_mov_b32 s65, s3 +; GFX12-NEXT: s_mov_b32 s67, s3 +; GFX12-NEXT: s_mov_b32 s69, s3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s33, v1 +; GFX12-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-NEXT: s_and_b32 s4, s33, 2 +; GFX12-NEXT: s_and_b32 s8, s33, 1 +; GFX12-NEXT: s_and_b32 s10, s33, 4 +; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5] +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[8:9] +; GFX12-NEXT: s_and_b32 s12, s33, 8 +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[10:11] +; GFX12-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX12-NEXT: s_and_b32 s14, s33, 16 +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[12:13] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_and_b32 s16, s33, 32 +; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[14:15] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_and_b32 s18, s33, 64 +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[16:17] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX12-NEXT: s_and_b32 s20, s33, 0x80 +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[18:19] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_and_b32 s22, s33, 0x100 +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[20:21] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_and_b32 s24, s33, 0x200 +; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[22:23] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_and_b32 s26, s33, 0x400 +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[24:25] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX12-NEXT: s_and_b32 s28, s33, 0x800 +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[26:27] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_and_b32 s30, s33, 0x1000 +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[28:29] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_and_b32 s34, s33, 0x2000 +; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[30:31] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_and_b32 s36, s33, 0x4000 +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[34:35] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX12-NEXT: s_and_b32 s38, s33, 0x8000 +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[36:37] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_and_b32 s40, s33, 0x10000 +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[38:39] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_and_b32 s42, s33, 0x20000 +; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[40:41] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_and_b32 s44, s33, 0x40000 +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[42:43] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX12-NEXT: s_and_b32 s46, s33, 0x80000 +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[44:45] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_and_b32 s48, s33, 0x100000 +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[46:47] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_and_b32 s50, s33, 0x200000 +; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[48:49] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_and_b32 s52, s33, 0x400000 +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[50:51] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX12-NEXT: s_and_b32 s54, s33, 0x800000 +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[52:53] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_and_b32 s56, s33, 0x1000000 +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[54:55] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_and_b32 s58, s33, 0x2000000 +; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[56:57] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_and_b32 s60, s33, 0x4000000 +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[58:59] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX12-NEXT: s_and_b32 s62, s33, 0x8000000 +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[60:61] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_and_b32 s64, s33, 0x10000000 +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[62:63] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_and_b32 s66, s33, 0x20000000 +; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[64:65] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_and_b32 s68, s33, 2.0 +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[66:67] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[68:69] +; GFX12-NEXT: s_and_b32 s12, s33, 0x80000000 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[12:13] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0 +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX12-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: test_clmulr_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s10, s6 +; GFX1250-NEXT: s_mov_b32 s11, s7 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s8, s2 +; GFX1250-NEXT: s_mov_b32 s9, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null +; GFX1250-NEXT: s_mov_b32 s5, s3 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b64 s[8:9], 0x80000000 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-NEXT: s_and_b64 s[10:11], s[4:5], 2 +; GFX1250-NEXT: s_and_b64 s[12:13], s[4:5], 1 +; GFX1250-NEXT: s_and_b64 s[14:15], s[4:5], 4 +; GFX1250-NEXT: s_mul_u64 s[10:11], s[2:3], s[10:11] +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[12:13] +; GFX1250-NEXT: s_and_b64 s[16:17], s[4:5], 8 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[14:15] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[12:13], s[10:11] +; GFX1250-NEXT: s_and_b64 s[18:19], s[4:5], 16 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[16:17] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[20:21], s[4:5], 32 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[18:19] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[22:23], s[4:5], 64 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[20:21] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[24:25], s[4:5], 0x80 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[22:23] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[26:27], s[4:5], 0x100 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[24:25] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[28:29], s[4:5], 0x200 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[26:27] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[30:31], s[4:5], 0x400 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[28:29] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[34:35], s[4:5], 0x800 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[30:31] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[36:37], s[4:5], 0x1000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[34:35] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[38:39], s[4:5], 0x2000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[36:37] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[40:41], s[4:5], 0x4000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[38:39] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[42:43], s[4:5], 0x8000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[40:41] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[44:45], s[4:5], 0x10000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[42:43] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[46:47], s[4:5], 0x20000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[44:45] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[48:49], s[4:5], 0x40000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[46:47] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[50:51], s[4:5], 0x80000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[48:49] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[52:53], s[4:5], 0x100000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[50:51] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[54:55], s[4:5], 0x200000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[52:53] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[56:57], s[4:5], 0x400000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[54:55] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[58:59], s[4:5], 0x800000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[56:57] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[60:61], s[4:5], 0x1000000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[58:59] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[62:63], s[4:5], 0x2000000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[60:61] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[64:65], s[4:5], 0x4000000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[62:63] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[66:67], s[4:5], 0x8000000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[64:65] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[68:69], s[4:5], 0x10000000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[66:67] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[70:71], s[4:5], 0x20000000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[68:69] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[72:73], s[4:5], 0x40000000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[70:71] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[72:73] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] +; GFX1250-NEXT: s_xor_b64 s[8:9], s[10:11], s[14:15] +; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], s[4:5] +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_xor_b64 s[2:3], s[8:9], s[2:3] +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX1250-NEXT: s_endpgm +; +; EG-LABEL: test_clmulr_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @8 +; EG-NEXT: ALU 98, @11, KC0[], KC1[] +; EG-NEXT: ALU 110, @110, KC0[], KC1[] +; EG-NEXT: ALU 12, @221, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 8: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: LSHR * T0.W, T0.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, PV.W, literal.x, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: LSHR * T1.W, T0.Y, literal.y, +; EG-NEXT: 65280(9.147676e-41), 8(1.121039e-44) +; EG-NEXT: AND_INT T1.X, PS, literal.x, +; EG-NEXT: LSHR T1.Y, T0.Y, literal.y, +; EG-NEXT: LSHL T1.Z, PV.W, literal.z, +; EG-NEXT: LSHL T0.W, T0.Y, literal.y, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, +; EG-NEXT: 65280(9.147676e-41), 24(3.363116e-44) +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T2.X, T0.X, literal.x, +; EG-NEXT: LSHL T0.Y, PS, literal.y, +; EG-NEXT: LSHL T2.Z, T0.X, literal.x, +; EG-NEXT: OR_INT T0.W, PV.W, PV.Z, +; EG-NEXT: OR_INT * T1.W, PV.X, PV.Y, +; EG-NEXT: 24(3.363116e-44), 8(1.121039e-44) +; EG-NEXT: OR_INT T1.Z, PV.W, PS, +; EG-NEXT: OR_INT T0.W, PV.Z, PV.Y, +; EG-NEXT: OR_INT * T1.W, T0.Z, PV.X, +; EG-NEXT: OR_INT T0.Z, PV.W, PS, +; EG-NEXT: AND_INT T0.W, PV.Z, literal.x, +; EG-NEXT: LSHR * T1.W, PV.Z, literal.y, +; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45) +; EG-NEXT: AND_INT T0.Y, PS, literal.x, +; EG-NEXT: LSHL T1.Z, PV.W, literal.y, +; EG-NEXT: AND_INT T0.W, PV.Z, literal.x, +; EG-NEXT: LSHR * T1.W, PV.Z, literal.y, +; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45) +; EG-NEXT: AND_INT T0.Z, PS, literal.x, +; EG-NEXT: LSHL T0.W, PV.W, literal.y, +; EG-NEXT: OR_INT * T1.W, PV.Y, PV.Z, +; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45) +; EG-NEXT: AND_INT T1.Z, PS, literal.x, +; EG-NEXT: LSHR T1.W, PS, literal.y, +; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45) +; EG-NEXT: AND_INT T0.Y, PS, literal.x, +; EG-NEXT: LSHR T0.Z, PS, literal.y, +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: LSHL * T1.W, PV.Z, literal.y, +; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45) +; EG-NEXT: OR_INT T1.Z, PV.W, PS, +; EG-NEXT: AND_INT T0.W, PV.Z, literal.x, +; EG-NEXT: LSHL * T1.W, PV.Y, literal.y, +; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45) +; EG-NEXT: OR_INT T0.Z, PV.W, PS, +; EG-NEXT: AND_INT T0.W, PV.Z, literal.x, +; EG-NEXT: LSHR * T1.W, PV.Z, 1, +; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Y, PS, literal.x, +; EG-NEXT: LSHL T1.Z, PV.W, 1, +; EG-NEXT: AND_INT T0.W, PV.Z, literal.x, +; EG-NEXT: LSHR * T1.W, PV.Z, 1, +; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, PS, literal.x, +; EG-NEXT: LSHL T0.W, PV.W, 1, +; EG-NEXT: OR_INT * T1.W, PV.Y, PV.Z, +; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PS, 1, +; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; EG-NEXT: AND_INT T3.W, T1.W, literal.x, +; EG-NEXT: MULLO_INT * T0.X, PS, PV.W, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, T1.W, literal.x, +; EG-NEXT: MULLO_INT * T0.Y, T0.W, PV.W, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, T0.X, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.W, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T2.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T2.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T2.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T2.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T2.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 256(3.587324e-43), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T2.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 512(7.174648e-43), 0(0.000000e+00) +; EG-NEXT: AND_INT * T1.Z, T1.W, literal.x, +; EG-NEXT: 1024(1.434930e-42), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 110: +; EG-NEXT: XOR_INT T3.W, T2.W, T0.X, +; EG-NEXT: MULLO_INT * T0.X, T0.W, T0.Z, +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, T1.Z, +; EG-NEXT: 2048(2.869859e-42), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 4096(5.739719e-42), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 8192(1.147944e-41), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 16384(2.295887e-41), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 32768(4.591775e-41), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 65536(9.183550e-41), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 131072(1.836710e-40), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 262144(3.673420e-40), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 524288(7.346840e-40), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 1048576(1.469368e-39), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 2097152(2.938736e-39), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 4194304(5.877472e-39), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 16777216(2.350989e-38), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 33554432(9.403955e-38), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T4.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 67108864(1.504633e-36), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T4.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 134217728(3.851860e-34), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T4.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 268435456(2.524355e-29), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T4.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 536870912(1.084202e-19), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T4.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 1073741824(2.000000e+00), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T1.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: -2147483648(-0.000000e+00), 0(0.000000e+00) +; EG-NEXT: XOR_INT T1.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: LSHR T0.Z, T3.W, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: AND_INT * T1.W, T3.W, literal.y, +; EG-NEXT: 8(1.121039e-44), 65280(9.147676e-41) +; EG-NEXT: LSHL T0.Y, PS, literal.x, +; EG-NEXT: LSHL T1.Z, T2.W, literal.y, +; EG-NEXT: LSHR T0.W, PV.W, literal.y, +; EG-NEXT: AND_INT * T1.W, PV.Z, literal.z, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: 65280(9.147676e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT T0.W, PS, PV.W, +; EG-NEXT: OR_INT * T1.W, PV.Z, PV.Y, +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: AND_INT T1.W, PV.W, literal.x, +; EG-NEXT: LSHR * T0.W, PV.W, literal.y, +; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45) +; EG-NEXT: AND_INT T0.W, PS, literal.x, +; EG-NEXT: LSHL * T1.W, PV.W, literal.y, +; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: AND_INT T1.W, PV.W, literal.x, +; EG-NEXT: LSHR * T0.W, PV.W, literal.y, +; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45) +; EG-NEXT: ALU clause starting at 221: +; EG-NEXT: AND_INT T0.W, T0.W, literal.x, +; EG-NEXT: LSHL * T1.W, T1.W, literal.y, +; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: AND_INT T1.W, PV.W, literal.x, +; EG-NEXT: LSHR * T0.W, PV.W, 1, +; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PS, literal.x, +; EG-NEXT: LSHL * T1.W, PV.W, 1, +; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00) +; EG-NEXT: OR_INT T0.X, PV.W, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 + %a = load i32, ptr addrspace(1) %in + %b = load i32, ptr addrspace(1) %b_ptr + %a.ext = zext i32 %a to i64 + %b.ext = zext i32 %b to i64 + %clmul = call i64 @llvm.clmul.i64(i64 %a.ext, i64 %b.ext) + %res.ext = lshr i64 %clmul, 31 + %res = trunc i64 %res.ext to i32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_clmulh_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: test_clmulh_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s7, 0 +; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s17, s7 +; SI-NEXT: s_mov_b32 s19, s7 +; SI-NEXT: s_mov_b32 s23, s7 +; SI-NEXT: s_mov_b32 s25, s7 +; SI-NEXT: s_mov_b32 s27, s7 +; SI-NEXT: s_mov_b32 s29, s7 +; SI-NEXT: s_mov_b32 s31, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s37, s7 +; SI-NEXT: s_mov_b32 s39, s7 +; SI-NEXT: s_mov_b32 s41, s7 +; SI-NEXT: s_mov_b32 s43, s7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s33, v1 +; SI-NEXT: s_and_b32 s20, s33, 2 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_bfe_i32 s8, s33, 0x10000 +; SI-NEXT: v_cmp_eq_u64_e64 s[20:21], s[20:21], 0 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 +; SI-NEXT: s_and_b32 s8, s8, s6 +; SI-NEXT: s_and_b64 s[20:21], s[20:21], exec +; SI-NEXT: s_cselect_b32 s21, 0, s5 +; SI-NEXT: s_cselect_b32 s20, 0, s4 +; SI-NEXT: s_and_b32 s14, s33, 4 +; SI-NEXT: s_xor_b64 s[20:21], s[8:9], s[20:21] +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 2 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s15, 0, s15 +; SI-NEXT: s_cselect_b32 s14, 0, s14 +; SI-NEXT: s_and_b32 s10, s33, 8 +; SI-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0 +; SI-NEXT: s_xor_b64 s[14:15], s[20:21], s[14:15] +; SI-NEXT: s_lshl_b64 s[20:21], s[6:7], 3 +; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec +; SI-NEXT: s_cselect_b32 s11, 0, s21 +; SI-NEXT: s_cselect_b32 s10, 0, s20 +; SI-NEXT: s_and_b32 s12, s33, 16 +; SI-NEXT: v_cmp_eq_u64_e64 s[12:13], s[12:13], 0 +; SI-NEXT: s_lshl_b64 s[20:21], s[6:7], 4 +; SI-NEXT: s_xor_b64 s[10:11], s[14:15], s[10:11] +; SI-NEXT: s_and_b64 s[12:13], s[12:13], exec +; SI-NEXT: s_cselect_b32 s13, 0, s21 +; SI-NEXT: s_cselect_b32 s12, 0, s20 +; SI-NEXT: s_and_b32 s16, s33, 32 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[16:17], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 5 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s18, s33, 64 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[18:19], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 6 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s22, s33, 0x80 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[22:23], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 7 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s24, s33, 0x100 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[24:25], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 8 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s26, s33, 0x200 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[26:27], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 9 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s28, s33, 0x400 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[28:29], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 10 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s30, s33, 0x800 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[30:31], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 11 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s34, s33, 0x1000 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[34:35], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 12 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s36, s33, 0x2000 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[36:37], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 13 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s38, s33, 0x4000 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[38:39], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 14 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s40, s33, 0x8000 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[40:41], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 15 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s42, s33, 0x10000 +; SI-NEXT: v_cmp_eq_u64_e64 s[16:17], s[42:43], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 16 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], s[16:17], exec +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_cselect_b32 s13, 0, s15 +; SI-NEXT: s_cselect_b32 s12, 0, s14 +; SI-NEXT: s_and_b32 s4, s33, 0x20000 +; SI-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], 0 +; SI-NEXT: s_lshl_b64 s[14:15], s[6:7], 17 +; SI-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_cselect_b32 s5, 0, s15 +; SI-NEXT: s_cselect_b32 s4, 0, s14 +; SI-NEXT: s_and_b32 s8, s33, 0x40000 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[12:13], s[6:7], 18 +; SI-NEXT: s_xor_b64 s[4:5], s[10:11], s[4:5] +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s13 +; SI-NEXT: s_cselect_b32 s8, 0, s12 +; SI-NEXT: s_and_b32 s10, s33, 0x80000 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[10:11], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 19 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x100000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 20 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x200000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 21 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x400000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 22 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x800000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 23 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x1000000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 24 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x2000000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 25 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x4000000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 26 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x8000000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 27 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x10000000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 28 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 0x20000000 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 29 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_and_b32 s8, s33, 2.0 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: v_cmp_eq_u64_e64 s[8:9], s[8:9], 0 +; SI-NEXT: s_lshl_b64 s[10:11], s[6:7], 30 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cselect_b32 s9, 0, s11 +; SI-NEXT: s_cselect_b32 s8, 0, s10 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 31 +; SI-NEXT: s_cmp_gt_i32 s33, -1 +; SI-NEXT: s_cselect_b32 s7, 0, s7 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_clmulh_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s7, 0 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s17, s7 +; VI-NEXT: s_mov_b32 s19, s7 +; VI-NEXT: s_mov_b32 s21, s7 +; VI-NEXT: s_mov_b32 s23, s7 +; VI-NEXT: s_mov_b32 s25, s7 +; VI-NEXT: s_mov_b32 s27, s7 +; VI-NEXT: s_mov_b32 s29, s7 +; VI-NEXT: s_mov_b32 s31, s7 +; VI-NEXT: s_mov_b32 s35, s7 +; VI-NEXT: s_mov_b32 s37, s7 +; VI-NEXT: s_mov_b32 s39, s7 +; VI-NEXT: s_mov_b32 s41, s7 +; VI-NEXT: s_mov_b32 s43, s7 +; VI-NEXT: s_mov_b32 s45, s7 +; VI-NEXT: s_mov_b32 s47, s7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_bfe_i32 s5, s4, 0x10000 +; VI-NEXT: s_lshl_b64 s[48:49], s[6:7], 1 +; VI-NEXT: s_and_b32 s10, s4, 2 +; VI-NEXT: s_and_b32 s8, s5, s6 +; VI-NEXT: s_cmp_eq_u64 s[10:11], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s49 +; VI-NEXT: s_cselect_b32 s10, 0, s48 +; VI-NEXT: s_lshl_b64 s[48:49], s[6:7], 2 +; VI-NEXT: s_and_b32 s12, s4, 4 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s49 +; VI-NEXT: s_cselect_b32 s10, 0, s48 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 3 +; VI-NEXT: s_and_b32 s14, s4, 8 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[14:15], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 4 +; VI-NEXT: s_and_b32 s16, s4, 16 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[16:17], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 5 +; VI-NEXT: s_and_b32 s18, s4, 32 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[18:19], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 6 +; VI-NEXT: s_and_b32 s20, s4, 64 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[20:21], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 7 +; VI-NEXT: s_and_b32 s22, s4, 0x80 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[22:23], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 8 +; VI-NEXT: s_and_b32 s24, s4, 0x100 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[24:25], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 9 +; VI-NEXT: s_and_b32 s26, s4, 0x200 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[26:27], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 10 +; VI-NEXT: s_and_b32 s28, s4, 0x400 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[28:29], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 11 +; VI-NEXT: s_and_b32 s30, s4, 0x800 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[30:31], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 12 +; VI-NEXT: s_and_b32 s34, s4, 0x1000 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[34:35], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 13 +; VI-NEXT: s_and_b32 s36, s4, 0x2000 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[36:37], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 14 +; VI-NEXT: s_and_b32 s38, s4, 0x4000 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[38:39], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 15 +; VI-NEXT: s_and_b32 s40, s4, 0x8000 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[40:41], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 16 +; VI-NEXT: s_and_b32 s42, s4, 0x10000 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[42:43], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 17 +; VI-NEXT: s_and_b32 s44, s4, 0x20000 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[44:45], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 18 +; VI-NEXT: s_and_b32 s46, s4, 0x40000 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_cmp_eq_u64 s[46:47], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_lshl_b64 s[12:13], s[6:7], 19 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_and_b32 s10, s4, 0x80000 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_cmp_eq_u64 s[10:11], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s13 +; VI-NEXT: s_cselect_b32 s10, 0, s12 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 20 +; VI-NEXT: s_and_b32 s12, s4, 0x100000 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 21 +; VI-NEXT: s_and_b32 s12, s4, 0x200000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 22 +; VI-NEXT: s_and_b32 s12, s4, 0x400000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 23 +; VI-NEXT: s_and_b32 s12, s4, 0x800000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 24 +; VI-NEXT: s_and_b32 s12, s4, 0x1000000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 25 +; VI-NEXT: s_and_b32 s12, s4, 0x2000000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 26 +; VI-NEXT: s_and_b32 s12, s4, 0x4000000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 27 +; VI-NEXT: s_and_b32 s12, s4, 0x8000000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 28 +; VI-NEXT: s_and_b32 s12, s4, 0x10000000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 29 +; VI-NEXT: s_and_b32 s12, s4, 0x20000000 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[10:11], s[6:7], 30 +; VI-NEXT: s_and_b32 s12, s4, 2.0 +; VI-NEXT: s_cmp_eq_u64 s[12:13], 0 +; VI-NEXT: s_cselect_b32 s11, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 0, s10 +; VI-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 31 +; VI-NEXT: s_cmp_gt_i32 s4, -1 +; VI-NEXT: s_cselect_b32 s5, 0, s7 +; VI-NEXT: s_cselect_b32 s4, 0, s6 +; VI-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_clmulh_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s5, 0 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_mov_b32 s7, s5 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_mov_b32 s15, s5 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s19, s5 +; GFX9-NEXT: s_mov_b32 s21, s5 +; GFX9-NEXT: s_mov_b32 s23, s5 +; GFX9-NEXT: s_mov_b32 s25, s5 +; GFX9-NEXT: s_mov_b32 s27, s5 +; GFX9-NEXT: s_mov_b32 s29, s5 +; GFX9-NEXT: s_mov_b32 s31, s5 +; GFX9-NEXT: s_mov_b32 s35, s5 +; GFX9-NEXT: s_mov_b32 s37, s5 +; GFX9-NEXT: s_mov_b32 s39, s5 +; GFX9-NEXT: s_mov_b32 s41, s5 +; GFX9-NEXT: s_mov_b32 s43, s5 +; GFX9-NEXT: s_mov_b32 s45, s5 +; GFX9-NEXT: s_mov_b32 s47, s5 +; GFX9-NEXT: s_mov_b32 s1, s9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_bfe_i32 s6, s8, 0x10000 +; GFX9-NEXT: s_lshl_b64 s[48:49], s[4:5], 1 +; GFX9-NEXT: s_and_b32 s10, s8, 2 +; GFX9-NEXT: s_and_b32 s6, s6, s4 +; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s49 +; GFX9-NEXT: s_cselect_b32 s10, 0, s48 +; GFX9-NEXT: s_lshl_b64 s[48:49], s[4:5], 2 +; GFX9-NEXT: s_and_b32 s12, s8, 4 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s49 +; GFX9-NEXT: s_cselect_b32 s10, 0, s48 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 3 +; GFX9-NEXT: s_and_b32 s14, s8, 8 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 4 +; GFX9-NEXT: s_and_b32 s16, s8, 16 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 +; GFX9-NEXT: s_and_b32 s18, s8, 32 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[18:19], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 6 +; GFX9-NEXT: s_and_b32 s20, s8, 64 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[20:21], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 7 +; GFX9-NEXT: s_and_b32 s22, s8, 0x80 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[22:23], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 8 +; GFX9-NEXT: s_and_b32 s24, s8, 0x100 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[24:25], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 9 +; GFX9-NEXT: s_and_b32 s26, s8, 0x200 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[26:27], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 10 +; GFX9-NEXT: s_and_b32 s28, s8, 0x400 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[28:29], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 11 +; GFX9-NEXT: s_and_b32 s30, s8, 0x800 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[30:31], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 12 +; GFX9-NEXT: s_and_b32 s34, s8, 0x1000 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[34:35], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 13 +; GFX9-NEXT: s_and_b32 s36, s8, 0x2000 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[36:37], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 14 +; GFX9-NEXT: s_and_b32 s38, s8, 0x4000 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[38:39], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 15 +; GFX9-NEXT: s_and_b32 s40, s8, 0x8000 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[40:41], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 16 +; GFX9-NEXT: s_and_b32 s42, s8, 0x10000 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[42:43], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 17 +; GFX9-NEXT: s_and_b32 s44, s8, 0x20000 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[44:45], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 18 +; GFX9-NEXT: s_and_b32 s46, s8, 0x40000 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u64 s[46:47], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], 19 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_and_b32 s10, s8, 0x80000 +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s13 +; GFX9-NEXT: s_cselect_b32 s10, 0, s12 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 20 +; GFX9-NEXT: s_and_b32 s12, s8, 0x100000 +; GFX9-NEXT: s_mov_b32 s13, s5 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 21 +; GFX9-NEXT: s_and_b32 s12, s8, 0x200000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 22 +; GFX9-NEXT: s_and_b32 s12, s8, 0x400000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 23 +; GFX9-NEXT: s_and_b32 s12, s8, 0x800000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 24 +; GFX9-NEXT: s_and_b32 s12, s8, 0x1000000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 25 +; GFX9-NEXT: s_and_b32 s12, s8, 0x2000000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 26 +; GFX9-NEXT: s_and_b32 s12, s8, 0x4000000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 27 +; GFX9-NEXT: s_and_b32 s12, s8, 0x8000000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 28 +; GFX9-NEXT: s_and_b32 s12, s8, 0x10000000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 29 +; GFX9-NEXT: s_and_b32 s12, s8, 0x20000000 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], 30 +; GFX9-NEXT: s_and_b32 s12, s8, 2.0 +; GFX9-NEXT: s_cmp_eq_u64 s[12:13], 0 +; GFX9-NEXT: s_cselect_b32 s11, 0, s11 +; GFX9-NEXT: s_cselect_b32 s10, 0, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 31 +; GFX9-NEXT: s_cmp_gt_i32 s8, -1 +; GFX9-NEXT: s_cselect_b32 s5, 0, s5 +; GFX9-NEXT: s_cselect_b32 s4, 0, s4 +; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: test_clmulh_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: s_bfe_i32 s5, s4, 0x10000 +; GFX10-NEXT: s_and_b32 s10, s4, 2 +; GFX10-NEXT: s_lshl_b64 s[12:13], s[2:3], 1 +; GFX10-NEXT: s_and_b32 s8, s5, s2 +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_cselect_b32 s13, 0, s13 +; GFX10-NEXT: s_cselect_b32 s12, 0, s12 +; GFX10-NEXT: s_and_b32 s10, s4, 4 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 2 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 8 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 3 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 16 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 4 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 32 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 5 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 64 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 6 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x80 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 7 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x100 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 8 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x200 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 9 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x400 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 10 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x800 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 11 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x1000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 12 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x2000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 13 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x4000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 14 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x8000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 15 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x10000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 16 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x20000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 17 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x40000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 18 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x80000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 19 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x100000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 20 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x200000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 21 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x400000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 22 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x800000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 23 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x1000000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 24 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x2000000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 25 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x4000000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 26 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x8000000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 27 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x10000000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 28 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 0x20000000 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 29 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s13, 0, s15 +; GFX10-NEXT: s_cselect_b32 s12, 0, s14 +; GFX10-NEXT: s_and_b32 s10, s4, 2.0 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 30 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: s_cselect_b32 s11, 0, s15 +; GFX10-NEXT: s_cselect_b32 s10, 0, s14 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 31 +; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; GFX10-NEXT: s_cmp_gt_i32 s4, -1 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_cselect_b32 s3, 0, s3 +; GFX10-NEXT: s_cselect_b32 s2, 0, s2 +; GFX10-NEXT: s_xor_b64 s[2:3], s[8:9], s[2:3] +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_clmulh_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s11, s3 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-NEXT: s_bfe_i32 s5, s4, 0x10000 +; GFX11-NEXT: s_and_b32 s10, s4, 2 +; GFX11-NEXT: s_lshl_b64 s[12:13], s[2:3], 1 +; GFX11-NEXT: s_and_b32 s8, s5, s2 +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_cselect_b32 s13, 0, s13 +; GFX11-NEXT: s_cselect_b32 s12, 0, s12 +; GFX11-NEXT: s_and_b32 s10, s4, 4 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 2 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 8 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 3 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 16 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 4 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 32 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 5 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 64 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 6 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x80 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 7 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x100 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 8 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x200 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 9 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x400 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 10 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x800 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 11 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x1000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 12 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x2000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 13 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x4000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 14 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x8000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 15 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x10000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 16 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x20000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 17 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x40000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 18 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x80000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 19 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x100000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 20 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x200000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 21 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x400000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 22 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x800000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 23 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x1000000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 24 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x2000000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 25 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x4000000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 26 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x8000000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 27 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x10000000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 28 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 0x20000000 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 29 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s13, 0, s15 +; GFX11-NEXT: s_cselect_b32 s12, 0, s14 +; GFX11-NEXT: s_and_b32 s10, s4, 2.0 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], 30 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s11, 0, s15 +; GFX11-NEXT: s_cselect_b32 s10, 0, s14 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 31 +; GFX11-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; GFX11-NEXT: s_cmp_gt_i32 s4, -1 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_cselect_b32 s3, 0, s3 +; GFX11-NEXT: s_cselect_b32 s2, 0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b64 s[2:3], s[8:9], s[2:3] +; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_clmulh_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s3 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s11, s3 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s15, s3 +; GFX12-NEXT: s_mov_b32 s17, s3 +; GFX12-NEXT: s_mov_b32 s19, s3 +; GFX12-NEXT: s_mov_b32 s21, s3 +; GFX12-NEXT: s_mov_b32 s23, s3 +; GFX12-NEXT: s_mov_b32 s25, s3 +; GFX12-NEXT: s_mov_b32 s27, s3 +; GFX12-NEXT: s_mov_b32 s29, s3 +; GFX12-NEXT: s_mov_b32 s31, s3 +; GFX12-NEXT: s_mov_b32 s35, s3 +; GFX12-NEXT: s_mov_b32 s37, s3 +; GFX12-NEXT: s_mov_b32 s39, s3 +; GFX12-NEXT: s_mov_b32 s41, s3 +; GFX12-NEXT: s_mov_b32 s43, s3 +; GFX12-NEXT: s_mov_b32 s45, s3 +; GFX12-NEXT: s_mov_b32 s47, s3 +; GFX12-NEXT: s_mov_b32 s49, s3 +; GFX12-NEXT: s_mov_b32 s51, s3 +; GFX12-NEXT: s_mov_b32 s53, s3 +; GFX12-NEXT: s_mov_b32 s55, s3 +; GFX12-NEXT: s_mov_b32 s57, s3 +; GFX12-NEXT: s_mov_b32 s59, s3 +; GFX12-NEXT: s_mov_b32 s61, s3 +; GFX12-NEXT: s_mov_b32 s63, s3 +; GFX12-NEXT: s_mov_b32 s65, s3 +; GFX12-NEXT: s_mov_b32 s67, s3 +; GFX12-NEXT: s_mov_b32 s69, s3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s33, v1 +; GFX12-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-NEXT: s_and_b32 s4, s33, 2 +; GFX12-NEXT: s_and_b32 s8, s33, 1 +; GFX12-NEXT: s_and_b32 s10, s33, 4 +; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5] +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[8:9] +; GFX12-NEXT: s_and_b32 s12, s33, 8 +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[10:11] +; GFX12-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX12-NEXT: s_and_b32 s14, s33, 16 +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[12:13] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_and_b32 s16, s33, 32 +; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[14:15] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_and_b32 s18, s33, 64 +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[16:17] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX12-NEXT: s_and_b32 s20, s33, 0x80 +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[18:19] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_and_b32 s22, s33, 0x100 +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[20:21] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_and_b32 s24, s33, 0x200 +; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[22:23] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_and_b32 s26, s33, 0x400 +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[24:25] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX12-NEXT: s_and_b32 s28, s33, 0x800 +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[26:27] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_and_b32 s30, s33, 0x1000 +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[28:29] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_and_b32 s34, s33, 0x2000 +; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[30:31] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_and_b32 s36, s33, 0x4000 +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[34:35] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX12-NEXT: s_and_b32 s38, s33, 0x8000 +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[36:37] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_and_b32 s40, s33, 0x10000 +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[38:39] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_and_b32 s42, s33, 0x20000 +; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[40:41] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_and_b32 s44, s33, 0x40000 +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[42:43] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX12-NEXT: s_and_b32 s46, s33, 0x80000 +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[44:45] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_and_b32 s48, s33, 0x100000 +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[46:47] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_and_b32 s50, s33, 0x200000 +; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[48:49] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_and_b32 s52, s33, 0x400000 +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[50:51] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX12-NEXT: s_and_b32 s54, s33, 0x800000 +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[52:53] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_and_b32 s56, s33, 0x1000000 +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[54:55] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_and_b32 s58, s33, 0x2000000 +; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[56:57] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_and_b32 s60, s33, 0x4000000 +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[58:59] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX12-NEXT: s_and_b32 s62, s33, 0x8000000 +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[60:61] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_and_b32 s64, s33, 0x10000000 +; GFX12-NEXT: s_mul_u64 s[12:13], s[2:3], s[62:63] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_and_b32 s66, s33, 0x20000000 +; GFX12-NEXT: s_mul_u64 s[14:15], s[2:3], s[64:65] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[12:13] +; GFX12-NEXT: s_and_b32 s68, s33, 2.0 +; GFX12-NEXT: s_mul_u64 s[8:9], s[2:3], s[66:67] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] +; GFX12-NEXT: s_mul_u64 s[10:11], s[2:3], s[68:69] +; GFX12-NEXT: s_and_b32 s12, s33, 0x80000000 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[12:13] +; GFX12-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: test_clmulh_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s10, s6 +; GFX1250-NEXT: s_mov_b32 s11, s7 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s8, s2 +; GFX1250-NEXT: s_mov_b32 s9, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null +; GFX1250-NEXT: s_mov_b32 s5, s3 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b64 s[8:9], 0x80000000 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-NEXT: s_and_b64 s[10:11], s[4:5], 2 +; GFX1250-NEXT: s_and_b64 s[12:13], s[4:5], 1 +; GFX1250-NEXT: s_and_b64 s[14:15], s[4:5], 4 +; GFX1250-NEXT: s_mul_u64 s[10:11], s[2:3], s[10:11] +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[12:13] +; GFX1250-NEXT: s_and_b64 s[16:17], s[4:5], 8 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[14:15] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[12:13], s[10:11] +; GFX1250-NEXT: s_and_b64 s[18:19], s[4:5], 16 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[16:17] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[20:21], s[4:5], 32 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[18:19] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[22:23], s[4:5], 64 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[20:21] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[24:25], s[4:5], 0x80 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[22:23] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[26:27], s[4:5], 0x100 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[24:25] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[28:29], s[4:5], 0x200 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[26:27] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[30:31], s[4:5], 0x400 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[28:29] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[34:35], s[4:5], 0x800 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[30:31] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[36:37], s[4:5], 0x1000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[34:35] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[38:39], s[4:5], 0x2000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[36:37] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[40:41], s[4:5], 0x4000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[38:39] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[42:43], s[4:5], 0x8000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[40:41] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[44:45], s[4:5], 0x10000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[42:43] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[46:47], s[4:5], 0x20000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[44:45] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[48:49], s[4:5], 0x40000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[46:47] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[50:51], s[4:5], 0x80000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[48:49] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[52:53], s[4:5], 0x100000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[50:51] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[54:55], s[4:5], 0x200000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[52:53] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[56:57], s[4:5], 0x400000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[54:55] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[58:59], s[4:5], 0x800000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[56:57] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[60:61], s[4:5], 0x1000000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[58:59] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[62:63], s[4:5], 0x2000000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[60:61] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[64:65], s[4:5], 0x4000000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[62:63] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[66:67], s[4:5], 0x8000000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[64:65] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[68:69], s[4:5], 0x10000000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[66:67] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_and_b64 s[70:71], s[4:5], 0x20000000 +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[68:69] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[72:73], s[4:5], 0x40000000 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[2:3], s[70:71] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[14:15] +; GFX1250-NEXT: s_mul_u64 s[14:15], s[2:3], s[72:73] +; GFX1250-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] +; GFX1250-NEXT: s_xor_b64 s[8:9], s[10:11], s[14:15] +; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], s[4:5] +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_xor_b64 s[2:3], s[8:9], s[2:3] +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX1250-NEXT: s_endpgm +; +; EG-LABEL: test_clmulh_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @8 +; EG-NEXT: ALU 98, @11, KC0[], KC1[] +; EG-NEXT: ALU 110, @110, KC0[], KC1[] +; EG-NEXT: ALU 13, @221, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 8: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: LSHR * T0.W, T0.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, PV.W, literal.x, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: LSHR * T1.W, T0.Y, literal.y, +; EG-NEXT: 65280(9.147676e-41), 8(1.121039e-44) +; EG-NEXT: AND_INT T1.X, PS, literal.x, +; EG-NEXT: LSHR T1.Y, T0.Y, literal.y, +; EG-NEXT: LSHL T1.Z, PV.W, literal.z, +; EG-NEXT: LSHL T0.W, T0.Y, literal.y, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, +; EG-NEXT: 65280(9.147676e-41), 24(3.363116e-44) +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T2.X, T0.X, literal.x, +; EG-NEXT: LSHL T0.Y, PS, literal.y, +; EG-NEXT: LSHL T2.Z, T0.X, literal.x, +; EG-NEXT: OR_INT T0.W, PV.W, PV.Z, +; EG-NEXT: OR_INT * T1.W, PV.X, PV.Y, +; EG-NEXT: 24(3.363116e-44), 8(1.121039e-44) +; EG-NEXT: OR_INT T1.Z, PV.W, PS, +; EG-NEXT: OR_INT T0.W, PV.Z, PV.Y, +; EG-NEXT: OR_INT * T1.W, T0.Z, PV.X, +; EG-NEXT: OR_INT T0.Z, PV.W, PS, +; EG-NEXT: AND_INT T0.W, PV.Z, literal.x, +; EG-NEXT: LSHR * T1.W, PV.Z, literal.y, +; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45) +; EG-NEXT: AND_INT T0.Y, PS, literal.x, +; EG-NEXT: LSHL T1.Z, PV.W, literal.y, +; EG-NEXT: AND_INT T0.W, PV.Z, literal.x, +; EG-NEXT: LSHR * T1.W, PV.Z, literal.y, +; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45) +; EG-NEXT: AND_INT T0.Z, PS, literal.x, +; EG-NEXT: LSHL T0.W, PV.W, literal.y, +; EG-NEXT: OR_INT * T1.W, PV.Y, PV.Z, +; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45) +; EG-NEXT: AND_INT T1.Z, PS, literal.x, +; EG-NEXT: LSHR T1.W, PS, literal.y, +; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45) +; EG-NEXT: AND_INT T0.Y, PS, literal.x, +; EG-NEXT: LSHR T0.Z, PS, literal.y, +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: LSHL * T1.W, PV.Z, literal.y, +; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45) +; EG-NEXT: OR_INT T1.Z, PV.W, PS, +; EG-NEXT: AND_INT T0.W, PV.Z, literal.x, +; EG-NEXT: LSHL * T1.W, PV.Y, literal.y, +; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45) +; EG-NEXT: OR_INT T0.Z, PV.W, PS, +; EG-NEXT: AND_INT T0.W, PV.Z, literal.x, +; EG-NEXT: LSHR * T1.W, PV.Z, 1, +; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Y, PS, literal.x, +; EG-NEXT: LSHL T1.Z, PV.W, 1, +; EG-NEXT: AND_INT T0.W, PV.Z, literal.x, +; EG-NEXT: LSHR * T1.W, PV.Z, 1, +; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, PS, literal.x, +; EG-NEXT: LSHL T0.W, PV.W, 1, +; EG-NEXT: OR_INT * T1.W, PV.Y, PV.Z, +; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PS, 1, +; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; EG-NEXT: AND_INT T3.W, T1.W, literal.x, +; EG-NEXT: MULLO_INT * T0.X, PS, PV.W, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, T1.W, literal.x, +; EG-NEXT: MULLO_INT * T0.Y, T0.W, PV.W, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, T0.X, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.W, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T2.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T2.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T2.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T2.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T2.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 256(3.587324e-43), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T2.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 512(7.174648e-43), 0(0.000000e+00) +; EG-NEXT: AND_INT * T1.Z, T1.W, literal.x, +; EG-NEXT: 1024(1.434930e-42), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 110: +; EG-NEXT: XOR_INT T3.W, T2.W, T0.X, +; EG-NEXT: MULLO_INT * T0.X, T0.W, T0.Z, +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, T1.Z, +; EG-NEXT: 2048(2.869859e-42), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 4096(5.739719e-42), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 8192(1.147944e-41), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 16384(2.295887e-41), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 32768(4.591775e-41), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 65536(9.183550e-41), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 131072(1.836710e-40), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 262144(3.673420e-40), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 524288(7.346840e-40), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 1048576(1.469368e-39), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 2097152(2.938736e-39), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 4194304(5.877472e-39), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 16777216(2.350989e-38), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T3.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 33554432(9.403955e-38), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T4.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 67108864(1.504633e-36), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T4.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 134217728(3.851860e-34), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T4.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 268435456(2.524355e-29), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T4.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 536870912(1.084202e-19), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T4.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: 1073741824(2.000000e+00), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.Z, T1.W, literal.x, +; EG-NEXT: XOR_INT T1.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: -2147483648(-0.000000e+00), 0(0.000000e+00) +; EG-NEXT: XOR_INT T1.W, PV.W, PS, +; EG-NEXT: MULLO_INT * T0.X, T0.W, PV.Z, +; EG-NEXT: LSHR T0.Z, T3.W, literal.x, +; EG-NEXT: XOR_INT T0.W, PV.W, PS, +; EG-NEXT: AND_INT * T1.W, T3.W, literal.y, +; EG-NEXT: 8(1.121039e-44), 65280(9.147676e-41) +; EG-NEXT: LSHL T0.Y, PS, literal.x, +; EG-NEXT: LSHL T1.Z, T2.W, literal.y, +; EG-NEXT: LSHR T0.W, PV.W, literal.y, +; EG-NEXT: AND_INT * T1.W, PV.Z, literal.z, +; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44) +; EG-NEXT: 65280(9.147676e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT T0.W, PS, PV.W, +; EG-NEXT: OR_INT * T1.W, PV.Z, PV.Y, +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: AND_INT T1.W, PV.W, literal.x, +; EG-NEXT: LSHR * T0.W, PV.W, literal.y, +; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45) +; EG-NEXT: AND_INT T0.W, PS, literal.x, +; EG-NEXT: LSHL * T1.W, PV.W, literal.y, +; EG-NEXT: 252645135(7.053345e-30), 4(5.605194e-45) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: AND_INT T1.W, PV.W, literal.x, +; EG-NEXT: LSHR * T0.W, PV.W, literal.y, +; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45) +; EG-NEXT: ALU clause starting at 221: +; EG-NEXT: AND_INT T0.W, T0.W, literal.x, +; EG-NEXT: LSHL * T1.W, T1.W, literal.y, +; EG-NEXT: 858993459(4.172325e-08), 2(2.802597e-45) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: AND_INT T1.W, PV.W, literal.x, +; EG-NEXT: LSHR * T0.W, PV.W, 1, +; EG-NEXT: 1431655765(1.466015e+13), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PS, literal.x, +; EG-NEXT: LSHL * T1.W, PV.W, 1, +; EG-NEXT: 1431655764(1.466015e+13), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: LSHR T0.X, PV.W, 1, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 + %a = load i32, ptr addrspace(1) %in + %b = load i32, ptr addrspace(1) %b_ptr + %a.ext = zext i32 %a to i64 + %b.ext = zext i32 %b to i64 + %clmul = call i64 @llvm.clmul.i64(i64 %a.ext, i64 %b.ext) + %res.ext = lshr i64 %clmul, 32 + %res = trunc i64 %res.ext to i32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone}