Teach `DAGCombiner::reduceLoadWidth` to look through freeze SDNodes when narrowing loads; the narrowed result is then wrapped in a freeze to preserve the original semantics. Previously, the freeze blocked several folds:
```
and(freeze(load), 0xff)      -> AssertZext(freeze(zextload, i8))
trunc(freeze(load i32), i8)  -> freeze(load i8)
sext_inreg(freeze(load), i8) -> AssertSext(freeze(sextload, i8))
```
and many similar patterns whenever the legalizer or upstream IR passes inserted a freeze. This generally has the positive effect of narrowing the load type.
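For illustration, a minimal IR sketch of the first pattern (hypothetical function and value names, not one of the tests below): only the low 8 bits of the load are demanded, so `reduceLoadWidth` can now narrow the i32 load to an i8 zextload despite the intervening freeze.
```llvm
; Sketch only: names are made up for illustration.
define i32 @and_freeze_load(ptr %p) {
  %ld = load i32, ptr %p, align 4
  %fr = freeze i32 %ld
  %masked = and i32 %fr, 255   ; now folds to a frozen i8 zextload
  ret i32 %masked
}
```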
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=knl | FileCheck %s -check-prefixes=CHECK,AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s -check-prefixes=CHECK,AVX,AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 -mattr=+avx512vbmi2 | FileCheck %s -check-prefixes=CHECK,AVX,AVX512,AVX512VBMI

define i256 @bext_i256(i256 %a0, i256 %idx, i256 %len) nounwind {
; SSE-LABEL: bext_i256:
; SSE: # %bb.0:
; SSE-NEXT: pushq %r15
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
; SSE-NEXT: pushq %rax
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movl %ecx, %r10d
; SSE-NEXT: shrb $3, %r10b
; SSE-NEXT: andb $24, %r10b
; SSE-NEXT: negb %r10b
; SSE-NEXT: movsbq %r10b, %rbx
; SSE-NEXT: movq -16(%rsp,%rbx), %r11
; SSE-NEXT: movq -8(%rsp,%rbx), %r10
; SSE-NEXT: shldq %cl, %r11, %r10
; SSE-NEXT: movq -24(%rsp,%rbx), %r14
; SSE-NEXT: shldq %cl, %r14, %r11
; SSE-NEXT: movq -32(%rsp,%rbx), %rbx
; SSE-NEXT: shldq %cl, %rbx, %r14
; SSE-NEXT: shlq %cl, %rbx
; SSE-NEXT: addq $-1, %rbx
; SSE-NEXT: adcq $-1, %r14
; SSE-NEXT: adcq $-1, %r11
; SSE-NEXT: adcq $-1, %r10
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movl %r9d, %eax
; SSE-NEXT: shrb $6, %al
; SSE-NEXT: movzbl %al, %esi
; SSE-NEXT: movq -112(%rsp,%rsi,8), %rdx
; SSE-NEXT: movq -120(%rsp,%rsi,8), %r8
; SSE-NEXT: movq %r8, %r15
; SSE-NEXT: movl %r9d, %ecx
; SSE-NEXT: shrdq %cl, %rdx, %r15
; SSE-NEXT: movq -104(%rsp,%rsi,8), %r12
; SSE-NEXT: shrdq %cl, %r12, %rdx
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movq -128(%rsp,%rsi,8), %rsi
; SSE-NEXT: andq %r14, %r15
; SSE-NEXT: shrdq %cl, %r8, %rsi
; SSE-NEXT: shrq %cl, %r12
; SSE-NEXT: andq %r11, %rdx
; SSE-NEXT: andq %rbx, %rsi
; SSE-NEXT: andq %r10, %r12
; SSE-NEXT: movq %rsi, (%rdi)
; SSE-NEXT: movq %r15, 8(%rdi)
; SSE-NEXT: movq %rdx, 16(%rdi)
; SSE-NEXT: movq %r12, 24(%rdi)
; SSE-NEXT: addq $8, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r12
; SSE-NEXT: popq %r14
; SSE-NEXT: popq %r15
; SSE-NEXT: retq
;
; AVX2-LABEL: bext_i256:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %r10d, %ecx
; AVX2-NEXT: shrb $3, %cl
; AVX2-NEXT: andb $24, %cl
; AVX2-NEXT: negb %cl
; AVX2-NEXT: movsbq %cl, %r14
; AVX2-NEXT: movq -16(%rsp,%r14), %rbx
; AVX2-NEXT: movq -8(%rsp,%r14), %r11
; AVX2-NEXT: movl %r10d, %ecx
; AVX2-NEXT: shldq %cl, %rbx, %r11
; AVX2-NEXT: movq -32(%rsp,%r14), %r15
; AVX2-NEXT: movq -24(%rsp,%r14), %r14
; AVX2-NEXT: shldq %cl, %r14, %rbx
; AVX2-NEXT: shldq %cl, %r15, %r14
; AVX2-NEXT: movq %r9, %rcx
; AVX2-NEXT: shlxq %r10, %r15, %r9
; AVX2-NEXT: addq $-1, %r9
; AVX2-NEXT: adcq $-1, %r14
; AVX2-NEXT: adcq $-1, %rbx
; AVX2-NEXT: adcq $-1, %r11
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: shrb $6, %al
; AVX2-NEXT: movzbl %al, %edx
; AVX2-NEXT: movq -112(%rsp,%rdx,8), %rsi
; AVX2-NEXT: movq -120(%rsp,%rdx,8), %r8
; AVX2-NEXT: movq %r8, %r10
; AVX2-NEXT: shrdq %cl, %rsi, %r10
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: movq -128(%rsp,%rdx,8), %rdi
; AVX2-NEXT: andq %r14, %r10
; AVX2-NEXT: movq -104(%rsp,%rdx,8), %rdx
; AVX2-NEXT: shrdq %cl, %rdx, %rsi
; AVX2-NEXT: shrdq %cl, %r8, %rdi
; AVX2-NEXT: andq %rbx, %rsi
; AVX2-NEXT: andq %r9, %rdi
; AVX2-NEXT: shrxq %rcx, %rdx, %rcx
; AVX2-NEXT: andq %r11, %rcx
; AVX2-NEXT: movq %rdi, (%rax)
; AVX2-NEXT: movq %r10, 8(%rax)
; AVX2-NEXT: movq %rsi, 16(%rax)
; AVX2-NEXT: movq %rcx, 24(%rax)
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: bext_i256:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %r15
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: movq %rcx, %rax
; AVX512F-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,1,0,0,0]
; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %ecx, %r10d
; AVX512F-NEXT: shrb $3, %r10b
; AVX512F-NEXT: andb $24, %r10b
; AVX512F-NEXT: negb %r10b
; AVX512F-NEXT: movsbq %r10b, %rbx
; AVX512F-NEXT: movq -16(%rsp,%rbx), %r11
; AVX512F-NEXT: movq -8(%rsp,%rbx), %r10
; AVX512F-NEXT: shldq %cl, %r11, %r10
; AVX512F-NEXT: movq -32(%rsp,%rbx), %r15
; AVX512F-NEXT: movq -24(%rsp,%rbx), %r14
; AVX512F-NEXT: shldq %cl, %r14, %r11
; AVX512F-NEXT: shldq %cl, %r15, %r14
; AVX512F-NEXT: shlxq %rcx, %r15, %rbx
; AVX512F-NEXT: addq $-1, %rbx
; AVX512F-NEXT: adcq $-1, %r14
; AVX512F-NEXT: adcq $-1, %r11
; AVX512F-NEXT: adcq $-1, %r10
; AVX512F-NEXT: movq %r9, %rcx
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %ecx, %eax
; AVX512F-NEXT: shrb $6, %al
; AVX512F-NEXT: movzbl %al, %edx
; AVX512F-NEXT: movq -112(%rsp,%rdx,8), %rsi
; AVX512F-NEXT: movq -120(%rsp,%rdx,8), %r8
; AVX512F-NEXT: movq %r8, %r9
; AVX512F-NEXT: shrdq %cl, %rsi, %r9
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: movq -128(%rsp,%rdx,8), %rdi
; AVX512F-NEXT: andq %r14, %r9
; AVX512F-NEXT: movq -104(%rsp,%rdx,8), %rdx
; AVX512F-NEXT: shrdq %cl, %rdx, %rsi
; AVX512F-NEXT: shrdq %cl, %r8, %rdi
; AVX512F-NEXT: andq %r11, %rsi
; AVX512F-NEXT: andq %rbx, %rdi
; AVX512F-NEXT: shrxq %rcx, %rdx, %rcx
; AVX512F-NEXT: andq %r10, %rcx
; AVX512F-NEXT: movq %rdi, (%rax)
; AVX512F-NEXT: movq %r9, 8(%rax)
; AVX512F-NEXT: movq %rsi, 16(%rax)
; AVX512F-NEXT: movq %rcx, 24(%rax)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
; AVX512F-NEXT: popq %r15
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bext_i256:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %r15
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: movq %rcx, %r10
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %ecx, %edi
; AVX512VL-NEXT: shrb $3, %dil
; AVX512VL-NEXT: andb $24, %dil
; AVX512VL-NEXT: negb %dil
; AVX512VL-NEXT: movsbq %dil, %rbx
; AVX512VL-NEXT: movq -16(%rsp,%rbx), %r11
; AVX512VL-NEXT: movq -8(%rsp,%rbx), %rdi
; AVX512VL-NEXT: shldq %cl, %r11, %rdi
; AVX512VL-NEXT: movq -32(%rsp,%rbx), %r15
; AVX512VL-NEXT: movq -24(%rsp,%rbx), %r14
; AVX512VL-NEXT: shldq %cl, %r14, %r11
; AVX512VL-NEXT: shldq %cl, %r15, %r14
; AVX512VL-NEXT: shlxq %rcx, %r15, %rbx
; AVX512VL-NEXT: addq $-1, %rbx
; AVX512VL-NEXT: adcq $-1, %r14
; AVX512VL-NEXT: adcq $-1, %r11
; AVX512VL-NEXT: adcq $-1, %rdi
; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %r9d, %ecx
; AVX512VL-NEXT: shrb $6, %cl
; AVX512VL-NEXT: movzbl %cl, %edx
; AVX512VL-NEXT: movq -112(%rsp,%rdx,8), %rsi
; AVX512VL-NEXT: movq -128(%rsp,%rdx,8), %r8
; AVX512VL-NEXT: movq -120(%rsp,%rdx,8), %r10
; AVX512VL-NEXT: movq %r10, %r15
; AVX512VL-NEXT: movl %r9d, %ecx
; AVX512VL-NEXT: shrdq %cl, %rsi, %r15
; AVX512VL-NEXT: andq %r14, %r15
; AVX512VL-NEXT: movq -104(%rsp,%rdx,8), %rdx
; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi
; AVX512VL-NEXT: andq %r11, %rsi
; AVX512VL-NEXT: shrdq %cl, %r10, %r8
; AVX512VL-NEXT: andq %rbx, %r8
; AVX512VL-NEXT: shrxq %r9, %rdx, %rcx
; AVX512VL-NEXT: andq %rdi, %rcx
; AVX512VL-NEXT: movq %r8, (%rax)
; AVX512VL-NEXT: movq %r15, 8(%rax)
; AVX512VL-NEXT: movq %rsi, 16(%rax)
; AVX512VL-NEXT: movq %rcx, 24(%rax)
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: popq %r15
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bext_i256:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: pushq %r15
; AVX512VBMI-NEXT: pushq %r14
; AVX512VBMI-NEXT: pushq %rbx
; AVX512VBMI-NEXT: movq %rcx, %r10
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %ecx, %edi
; AVX512VBMI-NEXT: shrb $3, %dil
; AVX512VBMI-NEXT: andb $24, %dil
; AVX512VBMI-NEXT: negb %dil
; AVX512VBMI-NEXT: movsbq %dil, %rbx
; AVX512VBMI-NEXT: movq -16(%rsp,%rbx), %r11
; AVX512VBMI-NEXT: movq -8(%rsp,%rbx), %rdi
; AVX512VBMI-NEXT: shldq %cl, %r11, %rdi
; AVX512VBMI-NEXT: movq -32(%rsp,%rbx), %r15
; AVX512VBMI-NEXT: movq -24(%rsp,%rbx), %r14
; AVX512VBMI-NEXT: shldq %cl, %r14, %r11
; AVX512VBMI-NEXT: shldq %cl, %r15, %r14
; AVX512VBMI-NEXT: shlxq %rcx, %r15, %rbx
; AVX512VBMI-NEXT: addq $-1, %rbx
; AVX512VBMI-NEXT: adcq $-1, %r14
; AVX512VBMI-NEXT: adcq $-1, %r11
; AVX512VBMI-NEXT: adcq $-1, %rdi
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %r9d, %ecx
; AVX512VBMI-NEXT: shrb $6, %cl
; AVX512VBMI-NEXT: movzbl %cl, %edx
; AVX512VBMI-NEXT: movq -112(%rsp,%rdx,8), %rsi
; AVX512VBMI-NEXT: movq -128(%rsp,%rdx,8), %r8
; AVX512VBMI-NEXT: movq -120(%rsp,%rdx,8), %r10
; AVX512VBMI-NEXT: movq %r10, %r15
; AVX512VBMI-NEXT: movl %r9d, %ecx
; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r15
; AVX512VBMI-NEXT: andq %r14, %r15
; AVX512VBMI-NEXT: movq -104(%rsp,%rdx,8), %rdx
; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi
; AVX512VBMI-NEXT: andq %r11, %rsi
; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8
; AVX512VBMI-NEXT: andq %rbx, %r8
; AVX512VBMI-NEXT: shrxq %r9, %rdx, %rcx
; AVX512VBMI-NEXT: andq %rdi, %rcx
; AVX512VBMI-NEXT: movq %r8, (%rax)
; AVX512VBMI-NEXT: movq %r15, 8(%rax)
; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
; AVX512VBMI-NEXT: movq %rcx, 24(%rax)
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: popq %r14
; AVX512VBMI-NEXT: popq %r15
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%bit = shl i256 1, %len
%msk = sub i256 %bit, 1
%shift = lshr i256 %a0, %idx
%res = and i256 %shift, %msk
ret i256 %res
}

define i256 @bext_i256_vector(<4 x i64> %v0, i256 %idx, i256 %len) nounwind {
; SSE-LABEL: bext_i256_vector:
; SSE: # %bb.0:
; SSE-NEXT: pushq %r15
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
; SSE-NEXT: pushq %rax
; SSE-NEXT: movq %r9, %rcx
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movl %ecx, %eax
; SSE-NEXT: shrb $3, %al
; SSE-NEXT: andb $24, %al
; SSE-NEXT: negb %al
; SSE-NEXT: movsbq %al, %rax
; SSE-NEXT: movq -80(%rsp,%rax), %r8
; SSE-NEXT: movq -72(%rsp,%rax), %rdx
; SSE-NEXT: shldq %cl, %r8, %rdx
; SSE-NEXT: movq -88(%rsp,%rax), %r10
; SSE-NEXT: shldq %cl, %r10, %r8
; SSE-NEXT: movq -96(%rsp,%rax), %r9
; SSE-NEXT: shldq %cl, %r9, %r10
; SSE-NEXT: # kill: def $cl killed $cl killed $rcx
; SSE-NEXT: shlq %cl, %r9
; SSE-NEXT: addq $-1, %r9
; SSE-NEXT: adcq $-1, %r10
; SSE-NEXT: adcq $-1, %r8
; SSE-NEXT: adcq $-1, %rdx
; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movl %esi, %eax
; SSE-NEXT: shrb $6, %al
; SSE-NEXT: movzbl %al, %ebx
; SSE-NEXT: movq -48(%rsp,%rbx,8), %r11
; SSE-NEXT: movq -56(%rsp,%rbx,8), %r14
; SSE-NEXT: movq %r14, %r15
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: shrdq %cl, %r11, %r15
; SSE-NEXT: movq -40(%rsp,%rbx,8), %r12
; SSE-NEXT: shrdq %cl, %r12, %r11
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movq -64(%rsp,%rbx,8), %rdi
; SSE-NEXT: andq %r10, %r15
; SSE-NEXT: shrdq %cl, %r14, %rdi
; SSE-NEXT: shrq %cl, %r12
; SSE-NEXT: andq %r8, %r11
; SSE-NEXT: andq %r9, %rdi
; SSE-NEXT: andq %rdx, %r12
; SSE-NEXT: movq %rdi, (%rax)
; SSE-NEXT: movq %r15, 8(%rax)
; SSE-NEXT: movq %r11, 16(%rax)
; SSE-NEXT: movq %r12, 24(%rax)
; SSE-NEXT: addq $8, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r12
; SSE-NEXT: popq %r14
; SSE-NEXT: popq %r15
; SSE-NEXT: retq
;
; AVX2-LABEL: bext_i256_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: movq %r9, %rcx
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovss {{.*#+}} xmm2 = [1,0,0,0]
; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: shrb $3, %al
; AVX2-NEXT: andb $24, %al
; AVX2-NEXT: negb %al
; AVX2-NEXT: movsbq %al, %rax
; AVX2-NEXT: movq -80(%rsp,%rax), %r8
; AVX2-NEXT: movq -72(%rsp,%rax), %rdx
; AVX2-NEXT: shldq %cl, %r8, %rdx
; AVX2-NEXT: movq -96(%rsp,%rax), %r9
; AVX2-NEXT: movq -88(%rsp,%rax), %r10
; AVX2-NEXT: shldq %cl, %r10, %r8
; AVX2-NEXT: shldq %cl, %r9, %r10
; AVX2-NEXT: shlxq %rcx, %r9, %r9
; AVX2-NEXT: addq $-1, %r9
; AVX2-NEXT: adcq $-1, %r10
; AVX2-NEXT: adcq $-1, %r8
; AVX2-NEXT: adcq $-1, %rdx
; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %esi, %eax
; AVX2-NEXT: shrb $6, %al
; AVX2-NEXT: movzbl %al, %r11d
; AVX2-NEXT: movq -48(%rsp,%r11,8), %rbx
; AVX2-NEXT: movq -56(%rsp,%r11,8), %r14
; AVX2-NEXT: movq %r14, %r15
; AVX2-NEXT: movl %esi, %ecx
; AVX2-NEXT: shrdq %cl, %rbx, %r15
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: movq -64(%rsp,%r11,8), %rdi
; AVX2-NEXT: andq %r10, %r15
; AVX2-NEXT: movq -40(%rsp,%r11,8), %r10
; AVX2-NEXT: shrdq %cl, %r10, %rbx
; AVX2-NEXT: shrdq %cl, %r14, %rdi
; AVX2-NEXT: andq %r8, %rbx
; AVX2-NEXT: andq %r9, %rdi
; AVX2-NEXT: shrxq %rsi, %r10, %rcx
; AVX2-NEXT: andq %rdx, %rcx
; AVX2-NEXT: movq %rdi, (%rax)
; AVX2-NEXT: movq %r15, 8(%rax)
; AVX2-NEXT: movq %rbx, 16(%rax)
; AVX2-NEXT: movq %rcx, 24(%rax)
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: bext_i256_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: pushq %rax
; AVX512F-NEXT: movq %r9, %rcx
; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,0,0,0]
; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %ecx, %eax
; AVX512F-NEXT: shrb $3, %al
; AVX512F-NEXT: andb $24, %al
; AVX512F-NEXT: negb %al
; AVX512F-NEXT: movsbq %al, %rax
; AVX512F-NEXT: movq -16(%rsp,%rax), %r8
; AVX512F-NEXT: movq -8(%rsp,%rax), %rdx
; AVX512F-NEXT: shldq %cl, %r8, %rdx
; AVX512F-NEXT: movq -32(%rsp,%rax), %r9
; AVX512F-NEXT: movq -24(%rsp,%rax), %r10
; AVX512F-NEXT: shldq %cl, %r10, %r8
; AVX512F-NEXT: shldq %cl, %r9, %r10
; AVX512F-NEXT: shlxq %rcx, %r9, %r9
; AVX512F-NEXT: addq $-1, %r9
; AVX512F-NEXT: adcq $-1, %r10
; AVX512F-NEXT: adcq $-1, %r8
; AVX512F-NEXT: movq %rsi, %rcx
; AVX512F-NEXT: adcq $-1, %rdx
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %ecx, %eax
; AVX512F-NEXT: shrb $6, %al
; AVX512F-NEXT: movzbl %al, %esi
; AVX512F-NEXT: movq -112(%rsp,%rsi,8), %r11
; AVX512F-NEXT: movq -120(%rsp,%rsi,8), %rbx
; AVX512F-NEXT: movq %rbx, %r14
; AVX512F-NEXT: shrdq %cl, %r11, %r14
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: movq -128(%rsp,%rsi,8), %rdi
; AVX512F-NEXT: andq %r10, %r14
; AVX512F-NEXT: movq -104(%rsp,%rsi,8), %rsi
; AVX512F-NEXT: shrdq %cl, %rsi, %r11
; AVX512F-NEXT: shrdq %cl, %rbx, %rdi
; AVX512F-NEXT: andq %r8, %r11
; AVX512F-NEXT: andq %r9, %rdi
; AVX512F-NEXT: shrxq %rcx, %rsi, %rcx
; AVX512F-NEXT: andq %rdx, %rcx
; AVX512F-NEXT: movq %rdi, (%rax)
; AVX512F-NEXT: movq %r14, 8(%rax)
; AVX512F-NEXT: movq %r11, 16(%rax)
; AVX512F-NEXT: movq %rcx, 24(%rax)
; AVX512F-NEXT: addq $8, %rsp
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bext_i256_vector:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %r15
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: movq %r9, %rcx
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovaps {{.*#+}} xmm2 = [1,0,0,0]
; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %ecx, %eax
; AVX512VL-NEXT: shrb $3, %al
; AVX512VL-NEXT: andb $24, %al
; AVX512VL-NEXT: negb %al
; AVX512VL-NEXT: movsbq %al, %r10
; AVX512VL-NEXT: movq -80(%rsp,%r10), %r8
; AVX512VL-NEXT: movq -72(%rsp,%r10), %rdx
; AVX512VL-NEXT: shldq %cl, %r8, %rdx
; AVX512VL-NEXT: movq -88(%rsp,%r10), %r9
; AVX512VL-NEXT: shldq %cl, %r9, %r8
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: movq -96(%rsp,%r10), %rdi
; AVX512VL-NEXT: shldq %cl, %rdi, %r9
; AVX512VL-NEXT: shlxq %rcx, %rdi, %rdi
; AVX512VL-NEXT: addq $-1, %rdi
; AVX512VL-NEXT: adcq $-1, %r9
; AVX512VL-NEXT: adcq $-1, %r8
; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: adcq $-1, %rdx
; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %esi, %ecx
; AVX512VL-NEXT: shrb $6, %cl
; AVX512VL-NEXT: movzbl %cl, %r10d
; AVX512VL-NEXT: movq -48(%rsp,%r10,8), %r11
; AVX512VL-NEXT: movq -64(%rsp,%r10,8), %rbx
; AVX512VL-NEXT: movq -56(%rsp,%r10,8), %r14
; AVX512VL-NEXT: movq %r14, %r15
; AVX512VL-NEXT: movl %esi, %ecx
; AVX512VL-NEXT: shrdq %cl, %r11, %r15
; AVX512VL-NEXT: andq %r9, %r15
; AVX512VL-NEXT: movq -40(%rsp,%r10,8), %r9
; AVX512VL-NEXT: shrdq %cl, %r9, %r11
; AVX512VL-NEXT: andq %r8, %r11
; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
; AVX512VL-NEXT: andq %rdi, %rbx
; AVX512VL-NEXT: shrxq %rsi, %r9, %rcx
; AVX512VL-NEXT: andq %rdx, %rcx
; AVX512VL-NEXT: movq %rbx, (%rax)
; AVX512VL-NEXT: movq %r15, 8(%rax)
; AVX512VL-NEXT: movq %r11, 16(%rax)
; AVX512VL-NEXT: movq %rcx, 24(%rax)
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: popq %r15
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bext_i256_vector:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: pushq %r15
; AVX512VBMI-NEXT: pushq %r14
; AVX512VBMI-NEXT: pushq %rbx
; AVX512VBMI-NEXT: movq %r9, %rcx
; AVX512VBMI-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm2 = [1,0,0,0]
; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %ecx, %eax
; AVX512VBMI-NEXT: shrb $3, %al
; AVX512VBMI-NEXT: andb $24, %al
; AVX512VBMI-NEXT: negb %al
; AVX512VBMI-NEXT: movsbq %al, %r10
; AVX512VBMI-NEXT: movq -80(%rsp,%r10), %r8
; AVX512VBMI-NEXT: movq -72(%rsp,%r10), %rdx
; AVX512VBMI-NEXT: shldq %cl, %r8, %rdx
; AVX512VBMI-NEXT: movq -88(%rsp,%r10), %r9
; AVX512VBMI-NEXT: shldq %cl, %r9, %r8
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: movq -96(%rsp,%r10), %rdi
; AVX512VBMI-NEXT: shldq %cl, %rdi, %r9
; AVX512VBMI-NEXT: shlxq %rcx, %rdi, %rdi
; AVX512VBMI-NEXT: addq $-1, %rdi
; AVX512VBMI-NEXT: adcq $-1, %r9
; AVX512VBMI-NEXT: adcq $-1, %r8
; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: adcq $-1, %rdx
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %esi, %ecx
; AVX512VBMI-NEXT: shrb $6, %cl
; AVX512VBMI-NEXT: movzbl %cl, %r10d
; AVX512VBMI-NEXT: movq -48(%rsp,%r10,8), %r11
; AVX512VBMI-NEXT: movq -64(%rsp,%r10,8), %rbx
; AVX512VBMI-NEXT: movq -56(%rsp,%r10,8), %r14
; AVX512VBMI-NEXT: movq %r14, %r15
; AVX512VBMI-NEXT: movl %esi, %ecx
; AVX512VBMI-NEXT: shrdq %cl, %r11, %r15
; AVX512VBMI-NEXT: andq %r9, %r15
; AVX512VBMI-NEXT: movq -40(%rsp,%r10,8), %r9
; AVX512VBMI-NEXT: shrdq %cl, %r9, %r11
; AVX512VBMI-NEXT: andq %r8, %r11
; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
; AVX512VBMI-NEXT: andq %rdi, %rbx
; AVX512VBMI-NEXT: shrxq %rsi, %r9, %rcx
; AVX512VBMI-NEXT: andq %rdx, %rcx
; AVX512VBMI-NEXT: movq %rbx, (%rax)
; AVX512VBMI-NEXT: movq %r15, 8(%rax)
; AVX512VBMI-NEXT: movq %r11, 16(%rax)
; AVX512VBMI-NEXT: movq %rcx, 24(%rax)
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: popq %r14
; AVX512VBMI-NEXT: popq %r15
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = bitcast <4 x i64> %v0 to i256
%bit = shl i256 1, %len
%msk = sub i256 %bit, 1
%shift = lshr i256 %a0, %idx
%res = and i256 %shift, %msk
ret i256 %res
}

define i256 @bext_i256_load(i256 %a0, i256 %idx, i256 %len) nounwind {
; SSE-LABEL: bext_i256_load:
; SSE: # %bb.0:
; SSE-NEXT: pushq %r15
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
; SSE-NEXT: pushq %rax
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movl %ecx, %r10d
; SSE-NEXT: shrb $3, %r10b
; SSE-NEXT: andb $24, %r10b
; SSE-NEXT: negb %r10b
; SSE-NEXT: movsbq %r10b, %rbx
; SSE-NEXT: movq -16(%rsp,%rbx), %r11
; SSE-NEXT: movq -8(%rsp,%rbx), %r10
; SSE-NEXT: shldq %cl, %r11, %r10
; SSE-NEXT: movq -24(%rsp,%rbx), %r14
; SSE-NEXT: shldq %cl, %r14, %r11
; SSE-NEXT: movq -32(%rsp,%rbx), %rbx
; SSE-NEXT: shldq %cl, %rbx, %r14
; SSE-NEXT: shlq %cl, %rbx
; SSE-NEXT: addq $-1, %rbx
; SSE-NEXT: adcq $-1, %r14
; SSE-NEXT: adcq $-1, %r11
; SSE-NEXT: adcq $-1, %r10
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movl %r9d, %eax
; SSE-NEXT: shrb $6, %al
; SSE-NEXT: movzbl %al, %esi
; SSE-NEXT: movq -112(%rsp,%rsi,8), %rdx
; SSE-NEXT: movq -120(%rsp,%rsi,8), %r8
; SSE-NEXT: movq %r8, %r15
; SSE-NEXT: movl %r9d, %ecx
; SSE-NEXT: shrdq %cl, %rdx, %r15
; SSE-NEXT: movq -104(%rsp,%rsi,8), %r12
; SSE-NEXT: shrdq %cl, %r12, %rdx
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movq -128(%rsp,%rsi,8), %rsi
; SSE-NEXT: andq %r14, %r15
; SSE-NEXT: shrdq %cl, %r8, %rsi
; SSE-NEXT: shrq %cl, %r12
; SSE-NEXT: andq %r11, %rdx
; SSE-NEXT: andq %rbx, %rsi
; SSE-NEXT: andq %r10, %r12
; SSE-NEXT: movq %rsi, (%rdi)
; SSE-NEXT: movq %r15, 8(%rdi)
; SSE-NEXT: movq %rdx, 16(%rdi)
; SSE-NEXT: movq %r12, 24(%rdi)
; SSE-NEXT: addq $8, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r12
; SSE-NEXT: popq %r14
; SSE-NEXT: popq %r15
; SSE-NEXT: retq
;
; AVX2-LABEL: bext_i256_load:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %r10d, %ecx
; AVX2-NEXT: shrb $3, %cl
; AVX2-NEXT: andb $24, %cl
; AVX2-NEXT: negb %cl
; AVX2-NEXT: movsbq %cl, %r14
; AVX2-NEXT: movq -16(%rsp,%r14), %rbx
; AVX2-NEXT: movq -8(%rsp,%r14), %r11
; AVX2-NEXT: movl %r10d, %ecx
; AVX2-NEXT: shldq %cl, %rbx, %r11
; AVX2-NEXT: movq -32(%rsp,%r14), %r15
; AVX2-NEXT: movq -24(%rsp,%r14), %r14
; AVX2-NEXT: shldq %cl, %r14, %rbx
; AVX2-NEXT: shldq %cl, %r15, %r14
; AVX2-NEXT: movq %r9, %rcx
; AVX2-NEXT: shlxq %r10, %r15, %r9
; AVX2-NEXT: addq $-1, %r9
; AVX2-NEXT: adcq $-1, %r14
; AVX2-NEXT: adcq $-1, %rbx
; AVX2-NEXT: adcq $-1, %r11
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: shrb $6, %al
; AVX2-NEXT: movzbl %al, %edx
; AVX2-NEXT: movq -112(%rsp,%rdx,8), %rsi
; AVX2-NEXT: movq -120(%rsp,%rdx,8), %r8
; AVX2-NEXT: movq %r8, %r10
; AVX2-NEXT: shrdq %cl, %rsi, %r10
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: movq -128(%rsp,%rdx,8), %rdi
; AVX2-NEXT: andq %r14, %r10
; AVX2-NEXT: movq -104(%rsp,%rdx,8), %rdx
; AVX2-NEXT: shrdq %cl, %rdx, %rsi
; AVX2-NEXT: shrdq %cl, %r8, %rdi
; AVX2-NEXT: andq %rbx, %rsi
; AVX2-NEXT: andq %r9, %rdi
; AVX2-NEXT: shrxq %rcx, %rdx, %rcx
; AVX2-NEXT: andq %r11, %rcx
; AVX2-NEXT: movq %rdi, (%rax)
; AVX2-NEXT: movq %r10, 8(%rax)
; AVX2-NEXT: movq %rsi, 16(%rax)
; AVX2-NEXT: movq %rcx, 24(%rax)
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: bext_i256_load:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %r15
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: movq %rcx, %rax
; AVX512F-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,1,0,0,0]
; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %ecx, %r10d
; AVX512F-NEXT: shrb $3, %r10b
; AVX512F-NEXT: andb $24, %r10b
; AVX512F-NEXT: negb %r10b
; AVX512F-NEXT: movsbq %r10b, %rbx
; AVX512F-NEXT: movq -16(%rsp,%rbx), %r11
; AVX512F-NEXT: movq -8(%rsp,%rbx), %r10
; AVX512F-NEXT: shldq %cl, %r11, %r10
; AVX512F-NEXT: movq -32(%rsp,%rbx), %r15
; AVX512F-NEXT: movq -24(%rsp,%rbx), %r14
; AVX512F-NEXT: shldq %cl, %r14, %r11
; AVX512F-NEXT: shldq %cl, %r15, %r14
; AVX512F-NEXT: shlxq %rcx, %r15, %rbx
; AVX512F-NEXT: addq $-1, %rbx
; AVX512F-NEXT: adcq $-1, %r14
; AVX512F-NEXT: adcq $-1, %r11
; AVX512F-NEXT: adcq $-1, %r10
; AVX512F-NEXT: movq %r9, %rcx
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %ecx, %eax
; AVX512F-NEXT: shrb $6, %al
; AVX512F-NEXT: movzbl %al, %edx
; AVX512F-NEXT: movq -112(%rsp,%rdx,8), %rsi
; AVX512F-NEXT: movq -120(%rsp,%rdx,8), %r8
; AVX512F-NEXT: movq %r8, %r9
; AVX512F-NEXT: shrdq %cl, %rsi, %r9
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: movq -128(%rsp,%rdx,8), %rdi
; AVX512F-NEXT: andq %r14, %r9
; AVX512F-NEXT: movq -104(%rsp,%rdx,8), %rdx
; AVX512F-NEXT: shrdq %cl, %rdx, %rsi
; AVX512F-NEXT: shrdq %cl, %r8, %rdi
; AVX512F-NEXT: andq %r11, %rsi
; AVX512F-NEXT: andq %rbx, %rdi
; AVX512F-NEXT: shrxq %rcx, %rdx, %rcx
; AVX512F-NEXT: andq %r10, %rcx
; AVX512F-NEXT: movq %rdi, (%rax)
; AVX512F-NEXT: movq %r9, 8(%rax)
; AVX512F-NEXT: movq %rsi, 16(%rax)
; AVX512F-NEXT: movq %rcx, 24(%rax)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
; AVX512F-NEXT: popq %r15
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bext_i256_load:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %r15
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: movq %rcx, %r10
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %ecx, %edi
; AVX512VL-NEXT: shrb $3, %dil
; AVX512VL-NEXT: andb $24, %dil
; AVX512VL-NEXT: negb %dil
; AVX512VL-NEXT: movsbq %dil, %rbx
; AVX512VL-NEXT: movq -16(%rsp,%rbx), %r11
; AVX512VL-NEXT: movq -8(%rsp,%rbx), %rdi
; AVX512VL-NEXT: shldq %cl, %r11, %rdi
; AVX512VL-NEXT: movq -32(%rsp,%rbx), %r15
; AVX512VL-NEXT: movq -24(%rsp,%rbx), %r14
; AVX512VL-NEXT: shldq %cl, %r14, %r11
; AVX512VL-NEXT: shldq %cl, %r15, %r14
; AVX512VL-NEXT: shlxq %rcx, %r15, %rbx
; AVX512VL-NEXT: addq $-1, %rbx
; AVX512VL-NEXT: adcq $-1, %r14
; AVX512VL-NEXT: adcq $-1, %r11
; AVX512VL-NEXT: adcq $-1, %rdi
; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %r9d, %ecx
; AVX512VL-NEXT: shrb $6, %cl
; AVX512VL-NEXT: movzbl %cl, %edx
; AVX512VL-NEXT: movq -112(%rsp,%rdx,8), %rsi
; AVX512VL-NEXT: movq -128(%rsp,%rdx,8), %r8
; AVX512VL-NEXT: movq -120(%rsp,%rdx,8), %r10
; AVX512VL-NEXT: movq %r10, %r15
; AVX512VL-NEXT: movl %r9d, %ecx
; AVX512VL-NEXT: shrdq %cl, %rsi, %r15
; AVX512VL-NEXT: andq %r14, %r15
; AVX512VL-NEXT: movq -104(%rsp,%rdx,8), %rdx
; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi
; AVX512VL-NEXT: andq %r11, %rsi
; AVX512VL-NEXT: shrdq %cl, %r10, %r8
; AVX512VL-NEXT: andq %rbx, %r8
; AVX512VL-NEXT: shrxq %r9, %rdx, %rcx
; AVX512VL-NEXT: andq %rdi, %rcx
; AVX512VL-NEXT: movq %r8, (%rax)
; AVX512VL-NEXT: movq %r15, 8(%rax)
; AVX512VL-NEXT: movq %rsi, 16(%rax)
; AVX512VL-NEXT: movq %rcx, 24(%rax)
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: popq %r15
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bext_i256_load:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: pushq %r15
; AVX512VBMI-NEXT: pushq %r14
; AVX512VBMI-NEXT: pushq %rbx
; AVX512VBMI-NEXT: movq %rcx, %r10
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %ecx, %edi
; AVX512VBMI-NEXT: shrb $3, %dil
; AVX512VBMI-NEXT: andb $24, %dil
; AVX512VBMI-NEXT: negb %dil
; AVX512VBMI-NEXT: movsbq %dil, %rbx
; AVX512VBMI-NEXT: movq -16(%rsp,%rbx), %r11
; AVX512VBMI-NEXT: movq -8(%rsp,%rbx), %rdi
; AVX512VBMI-NEXT: shldq %cl, %r11, %rdi
; AVX512VBMI-NEXT: movq -32(%rsp,%rbx), %r15
; AVX512VBMI-NEXT: movq -24(%rsp,%rbx), %r14
; AVX512VBMI-NEXT: shldq %cl, %r14, %r11
; AVX512VBMI-NEXT: shldq %cl, %r15, %r14
; AVX512VBMI-NEXT: shlxq %rcx, %r15, %rbx
; AVX512VBMI-NEXT: addq $-1, %rbx
; AVX512VBMI-NEXT: adcq $-1, %r14
; AVX512VBMI-NEXT: adcq $-1, %r11
; AVX512VBMI-NEXT: adcq $-1, %rdi
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %r9d, %ecx
; AVX512VBMI-NEXT: shrb $6, %cl
; AVX512VBMI-NEXT: movzbl %cl, %edx
; AVX512VBMI-NEXT: movq -112(%rsp,%rdx,8), %rsi
; AVX512VBMI-NEXT: movq -128(%rsp,%rdx,8), %r8
; AVX512VBMI-NEXT: movq -120(%rsp,%rdx,8), %r10
; AVX512VBMI-NEXT: movq %r10, %r15
; AVX512VBMI-NEXT: movl %r9d, %ecx
; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r15
; AVX512VBMI-NEXT: andq %r14, %r15
; AVX512VBMI-NEXT: movq -104(%rsp,%rdx,8), %rdx
; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi
; AVX512VBMI-NEXT: andq %r11, %rsi
; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8
; AVX512VBMI-NEXT: andq %rbx, %r8
; AVX512VBMI-NEXT: shrxq %r9, %rdx, %rcx
; AVX512VBMI-NEXT: andq %rdi, %rcx
; AVX512VBMI-NEXT: movq %r8, (%rax)
; AVX512VBMI-NEXT: movq %r15, 8(%rax)
; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
; AVX512VBMI-NEXT: movq %rcx, 24(%rax)
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: popq %r14
; AVX512VBMI-NEXT: popq %r15
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%bit = shl i256 1, %len
%msk = sub i256 %bit, 1
%shift = lshr i256 %a0, %idx
%res = and i256 %shift, %msk
ret i256 %res
}

define i256 @blsi_i256(i256 %a0) nounwind {
; SSE-LABEL: blsi_i256:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbx
; SSE-NEXT: xorl %r9d, %r9d
; SSE-NEXT: movq %rsi, %r10
; SSE-NEXT: negq %r10
; SSE-NEXT: movl $0, %r11d
; SSE-NEXT: sbbq %rdx, %r11
; SSE-NEXT: movl $0, %ebx
; SSE-NEXT: sbbq %rcx, %rbx
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: sbbq %r8, %r9
; SSE-NEXT: andq %r8, %r9
; SSE-NEXT: andq %rcx, %rbx
; SSE-NEXT: andq %rdx, %r11
; SSE-NEXT: andq %rsi, %r10
; SSE-NEXT: movq %r11, 8(%rdi)
; SSE-NEXT: movq %r10, (%rdi)
; SSE-NEXT: movq %rbx, 16(%rdi)
; SSE-NEXT: movq %r9, 24(%rdi)
; SSE-NEXT: popq %rbx
; SSE-NEXT: retq
;
; AVX-LABEL: blsi_i256:
; AVX: # %bb.0:
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: xorl %edi, %edi
; AVX-NEXT: movq %rsi, %r9
; AVX-NEXT: negq %r9
; AVX-NEXT: movl $0, %r10d
; AVX-NEXT: sbbq %rdx, %r10
; AVX-NEXT: movl $0, %r11d
; AVX-NEXT: sbbq %rcx, %r11
; AVX-NEXT: sbbq %r8, %rdi
; AVX-NEXT: andq %r8, %rdi
; AVX-NEXT: andq %rcx, %r11
; AVX-NEXT: andq %rdx, %r10
; AVX-NEXT: andq %rsi, %r9
; AVX-NEXT: movq %r10, 8(%rax)
; AVX-NEXT: movq %r9, (%rax)
; AVX-NEXT: movq %r11, 16(%rax)
; AVX-NEXT: movq %rdi, 24(%rax)
; AVX-NEXT: retq
%neg = sub i256 0, %a0
%res = and i256 %a0, %neg
ret i256 %res
}

define i256 @blsi_i256_vector(<4 x i64> %v0) nounwind {
; SSE2-LABEL: blsi_i256_vector:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rcx
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdx
; SSE2-NEXT: xorl %esi, %esi
; SSE2-NEXT: movq %rdx, %rdi
; SSE2-NEXT: negq %rdi
; SSE2-NEXT: movl $0, %r8d
; SSE2-NEXT: sbbq %rcx, %r8
; SSE2-NEXT: movq %xmm1, %r9
; SSE2-NEXT: movl $0, %r10d
; SSE2-NEXT: sbbq %r9, %r10
; SSE2-NEXT: movq %xmm2, %r11
; SSE2-NEXT: sbbq %r11, %rsi
; SSE2-NEXT: andq %r11, %rsi
; SSE2-NEXT: andq %r9, %r10
; SSE2-NEXT: andq %rcx, %r8
; SSE2-NEXT: andq %rdx, %rdi
; SSE2-NEXT: movq %rdi, (%rax)
; SSE2-NEXT: movq %r8, 8(%rax)
; SSE2-NEXT: movq %r10, 16(%rax)
; SSE2-NEXT: movq %rsi, 24(%rax)
; SSE2-NEXT: retq
;
; SSE42-LABEL: blsi_i256_vector:
; SSE42: # %bb.0:
; SSE42-NEXT: pushq %rbx
; SSE42-NEXT: pextrq $1, %xmm1, %rcx
; SSE42-NEXT: pextrq $1, %xmm0, %rdx
; SSE42-NEXT: movq %xmm0, %rsi
; SSE42-NEXT: xorl %r8d, %r8d
; SSE42-NEXT: movq %rsi, %r9
; SSE42-NEXT: negq %r9
; SSE42-NEXT: movl $0, %r10d
; SSE42-NEXT: sbbq %rdx, %r10
; SSE42-NEXT: movq %xmm1, %r11
; SSE42-NEXT: movl $0, %ebx
; SSE42-NEXT: sbbq %r11, %rbx
; SSE42-NEXT: movq %rdi, %rax
; SSE42-NEXT: sbbq %rcx, %r8
; SSE42-NEXT: andq %rcx, %r8
; SSE42-NEXT: andq %r11, %rbx
; SSE42-NEXT: andq %rdx, %r10
; SSE42-NEXT: andq %rsi, %r9
; SSE42-NEXT: movq %r9, (%rdi)
; SSE42-NEXT: movq %r10, 8(%rdi)
; SSE42-NEXT: movq %rbx, 16(%rdi)
; SSE42-NEXT: movq %r8, 24(%rdi)
; SSE42-NEXT: popq %rbx
; SSE42-NEXT: retq
;
; AVX2-LABEL: blsi_i256_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
; AVX2-NEXT: vmovq %xmm1, %rsi
; AVX2-NEXT: vmovq %xmm0, %rdi
; AVX2-NEXT: xorl %r8d, %r8d
; AVX2-NEXT: movq %rdi, %r9
; AVX2-NEXT: negq %r9
; AVX2-NEXT: movl $0, %r10d
; AVX2-NEXT: sbbq %rdx, %r10
; AVX2-NEXT: movl $0, %r11d
; AVX2-NEXT: sbbq %rsi, %r11
; AVX2-NEXT: sbbq %rcx, %r8
; AVX2-NEXT: andq %rcx, %r8
; AVX2-NEXT: andq %rsi, %r11
; AVX2-NEXT: andq %rdx, %r10
; AVX2-NEXT: andq %rdi, %r9
; AVX2-NEXT: movq %r9, (%rax)
; AVX2-NEXT: movq %r10, 8(%rax)
; AVX2-NEXT: movq %r11, 16(%rax)
; AVX2-NEXT: movq %r8, 24(%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: blsi_i256_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
; AVX512F-NEXT: vmovq %xmm1, %rsi
; AVX512F-NEXT: vmovq %xmm0, %rdi
; AVX512F-NEXT: xorl %r8d, %r8d
; AVX512F-NEXT: movq %rdi, %r9
; AVX512F-NEXT: negq %r9
; AVX512F-NEXT: movl $0, %r10d
; AVX512F-NEXT: sbbq %rdx, %r10
; AVX512F-NEXT: movl $0, %r11d
; AVX512F-NEXT: sbbq %rsi, %r11
; AVX512F-NEXT: sbbq %rcx, %r8
; AVX512F-NEXT: andq %rcx, %r8
; AVX512F-NEXT: andq %rsi, %r11
; AVX512F-NEXT: andq %rdx, %r10
; AVX512F-NEXT: andq %rdi, %r9
; AVX512F-NEXT: movq %r9, (%rax)
; AVX512F-NEXT: movq %r10, 8(%rax)
; AVX512F-NEXT: movq %r11, 16(%rax)
; AVX512F-NEXT: movq %r8, 24(%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: blsi_i256_vector:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rcx
; AVX512VL-NEXT: vmovq %xmm1, %rdx
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512VL-NEXT: vmovq %xmm0, %rdi
; AVX512VL-NEXT: xorl %r8d, %r8d
; AVX512VL-NEXT: movq %rdi, %r9
; AVX512VL-NEXT: negq %r9
; AVX512VL-NEXT: movl $0, %r10d
; AVX512VL-NEXT: sbbq %rsi, %r10
; AVX512VL-NEXT: movl $0, %r11d
; AVX512VL-NEXT: sbbq %rdx, %r11
; AVX512VL-NEXT: sbbq %rcx, %r8
; AVX512VL-NEXT: andq %rcx, %r8
; AVX512VL-NEXT: andq %rdx, %r11
; AVX512VL-NEXT: andq %rsi, %r10
; AVX512VL-NEXT: andq %rdi, %r9
; AVX512VL-NEXT: movq %r9, (%rax)
; AVX512VL-NEXT: movq %r10, 8(%rax)
; AVX512VL-NEXT: movq %r11, 16(%rax)
; AVX512VL-NEXT: movq %r8, 24(%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: blsi_i256_vector:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %rcx
; AVX512VBMI-NEXT: vmovq %xmm1, %rdx
; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512VBMI-NEXT: vmovq %xmm0, %rdi
; AVX512VBMI-NEXT: xorl %r8d, %r8d
; AVX512VBMI-NEXT: movq %rdi, %r9
; AVX512VBMI-NEXT: negq %r9
; AVX512VBMI-NEXT: movl $0, %r10d
; AVX512VBMI-NEXT: sbbq %rsi, %r10
; AVX512VBMI-NEXT: movl $0, %r11d
; AVX512VBMI-NEXT: sbbq %rdx, %r11
; AVX512VBMI-NEXT: sbbq %rcx, %r8
; AVX512VBMI-NEXT: andq %rcx, %r8
; AVX512VBMI-NEXT: andq %rdx, %r11
; AVX512VBMI-NEXT: andq %rsi, %r10
; AVX512VBMI-NEXT: andq %rdi, %r9
; AVX512VBMI-NEXT: movq %r9, (%rax)
; AVX512VBMI-NEXT: movq %r10, 8(%rax)
; AVX512VBMI-NEXT: movq %r11, 16(%rax)
; AVX512VBMI-NEXT: movq %r8, 24(%rax)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = bitcast <4 x i64> %v0 to i256
%neg = sub i256 0, %a0
%res = and i256 %a0, %neg
ret i256 %res
}

define i256 @blsi_i256_load(ptr %p0) nounwind {
; SSE-LABEL: blsi_i256_load:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movq (%rsi), %rcx
; SSE-NEXT: movq 8(%rsi), %rdx
; SSE-NEXT: xorl %edi, %edi
; SSE-NEXT: movq %rcx, %r8
; SSE-NEXT: negq %r8
; SSE-NEXT: movl $0, %r9d
; SSE-NEXT: sbbq %rdx, %r9
; SSE-NEXT: movq 16(%rsi), %r10
; SSE-NEXT: movl $0, %r11d
; SSE-NEXT: sbbq %r10, %r11
; SSE-NEXT: movq 24(%rsi), %rsi
; SSE-NEXT: sbbq %rsi, %rdi
; SSE-NEXT: andq %rsi, %rdi
; SSE-NEXT: andq %r10, %r11
; SSE-NEXT: andq %rdx, %r9
; SSE-NEXT: andq %rcx, %r8
; SSE-NEXT: movq %r8, (%rax)
; SSE-NEXT: movq %r9, 8(%rax)
; SSE-NEXT: movq %r11, 16(%rax)
; SSE-NEXT: movq %rdi, 24(%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: blsi_i256_load:
; AVX: # %bb.0:
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: movq 24(%rsi), %rcx
; AVX-NEXT: movq 16(%rsi), %rdx
; AVX-NEXT: movq (%rsi), %rdi
; AVX-NEXT: movq 8(%rsi), %rsi
; AVX-NEXT: xorl %r8d, %r8d
; AVX-NEXT: movq %rdi, %r9
; AVX-NEXT: negq %r9
; AVX-NEXT: movl $0, %r10d
; AVX-NEXT: sbbq %rsi, %r10
; AVX-NEXT: movl $0, %r11d
; AVX-NEXT: sbbq %rdx, %r11
; AVX-NEXT: sbbq %rcx, %r8
; AVX-NEXT: andq %rcx, %r8
; AVX-NEXT: andq %rdx, %r11
; AVX-NEXT: andq %rsi, %r10
; AVX-NEXT: andq %rdi, %r9
; AVX-NEXT: movq %r9, (%rax)
; AVX-NEXT: movq %r10, 8(%rax)
; AVX-NEXT: movq %r11, 16(%rax)
; AVX-NEXT: movq %r8, 24(%rax)
; AVX-NEXT: retq
%a0 = load i256, ptr %p0
%neg = sub i256 0, %a0
%res = and i256 %a0, %neg
ret i256 %res
}

define i256 @blsmsk_i256(i256 %a0) nounwind {
; SSE-LABEL: blsmsk_i256:
; SSE: # %bb.0:
; SSE-NEXT: movq %rsi, %r9
; SSE-NEXT: addq $-1, %r9
; SSE-NEXT: movq %rdx, %r10
; SSE-NEXT: adcq $-1, %r10
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movq %rcx, %rdi
; SSE-NEXT: adcq $-1, %rdi
; SSE-NEXT: movq %r8, %r11
; SSE-NEXT: adcq $-1, %r11
; SSE-NEXT: xorq %r8, %r11
; SSE-NEXT: xorq %rcx, %rdi
; SSE-NEXT: xorq %rsi, %r9
; SSE-NEXT: xorq %rdx, %r10
; SSE-NEXT: movq %r10, 8(%rax)
; SSE-NEXT: movq %r9, (%rax)
; SSE-NEXT: movq %rdi, 16(%rax)
; SSE-NEXT: movq %r11, 24(%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: blsmsk_i256:
; AVX: # %bb.0:
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: movq %rsi, %rdi
; AVX-NEXT: addq $-1, %rdi
; AVX-NEXT: movq %rdx, %r9
; AVX-NEXT: adcq $-1, %r9
; AVX-NEXT: movq %rcx, %r10
; AVX-NEXT: adcq $-1, %r10
; AVX-NEXT: movq %r8, %r11
; AVX-NEXT: adcq $-1, %r11
; AVX-NEXT: xorq %r8, %r11
; AVX-NEXT: xorq %rcx, %r10
; AVX-NEXT: xorq %rsi, %rdi
; AVX-NEXT: xorq %rdx, %r9
; AVX-NEXT: movq %r9, 8(%rax)
; AVX-NEXT: movq %rdi, (%rax)
; AVX-NEXT: movq %r10, 16(%rax)
; AVX-NEXT: movq %r11, 24(%rax)
; AVX-NEXT: retq
%dec = sub i256 %a0, 1
%res = xor i256 %a0, %dec
ret i256 %res
}

define i256 @blsmsk_i256_vector(<4 x i64> %v0) nounwind {
; SSE2-LABEL: blsmsk_i256_vector:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rcx
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rdx
; SSE2-NEXT: movq %xmm0, %rsi
; SSE2-NEXT: movq %rsi, %rdi
; SSE2-NEXT: addq $-1, %rdi
; SSE2-NEXT: movq %rdx, %r8
; SSE2-NEXT: adcq $-1, %r8
; SSE2-NEXT: movq %xmm1, %r9
; SSE2-NEXT: movq %r9, %r10
; SSE2-NEXT: adcq $-1, %r10
; SSE2-NEXT: movq %rcx, %r11
; SSE2-NEXT: adcq $-1, %r11
; SSE2-NEXT: xorq %rcx, %r11
; SSE2-NEXT: xorq %r9, %r10
; SSE2-NEXT: xorq %rdx, %r8
; SSE2-NEXT: xorq %rsi, %rdi
; SSE2-NEXT: movq %rdi, (%rax)
; SSE2-NEXT: movq %r8, 8(%rax)
; SSE2-NEXT: movq %r10, 16(%rax)
; SSE2-NEXT: movq %r11, 24(%rax)
; SSE2-NEXT: retq
;
; SSE42-LABEL: blsmsk_i256_vector:
; SSE42: # %bb.0:
; SSE42-NEXT: movq %rdi, %rax
; SSE42-NEXT: pextrq $1, %xmm1, %rcx
; SSE42-NEXT: pextrq $1, %xmm0, %rdx
; SSE42-NEXT: movq %xmm0, %rsi
; SSE42-NEXT: movq %rsi, %rdi
; SSE42-NEXT: addq $-1, %rdi
; SSE42-NEXT: movq %rdx, %r8
; SSE42-NEXT: adcq $-1, %r8
; SSE42-NEXT: movq %xmm1, %r9
; SSE42-NEXT: movq %r9, %r10
; SSE42-NEXT: adcq $-1, %r10
; SSE42-NEXT: movq %rcx, %r11
; SSE42-NEXT: adcq $-1, %r11
; SSE42-NEXT: xorq %rcx, %r11
; SSE42-NEXT: xorq %r9, %r10
; SSE42-NEXT: xorq %rdx, %r8
; SSE42-NEXT: xorq %rsi, %rdi
; SSE42-NEXT: movq %rdi, (%rax)
; SSE42-NEXT: movq %r8, 8(%rax)
; SSE42-NEXT: movq %r10, 16(%rax)
; SSE42-NEXT: movq %r11, 24(%rax)
; SSE42-NEXT: retq
;
; AVX2-LABEL: blsmsk_i256_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vmovq %xmm1, %rdx
; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
; AVX2-NEXT: vmovq %xmm0, %rdi
; AVX2-NEXT: movq %rdi, %r8
; AVX2-NEXT: addq $-1, %r8
; AVX2-NEXT: movq %rsi, %r9
; AVX2-NEXT: adcq $-1, %r9
; AVX2-NEXT: movq %rdx, %r10
; AVX2-NEXT: adcq $-1, %r10
; AVX2-NEXT: movq %rcx, %r11
; AVX2-NEXT: adcq $-1, %r11
; AVX2-NEXT: xorq %rcx, %r11
; AVX2-NEXT: xorq %rdx, %r10
; AVX2-NEXT: xorq %rsi, %r9
; AVX2-NEXT: xorq %rdi, %r8
; AVX2-NEXT: movq %r8, (%rax)
; AVX2-NEXT: movq %r9, 8(%rax)
; AVX2-NEXT: movq %r10, 16(%rax)
; AVX2-NEXT: movq %r11, 24(%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: blsmsk_i256_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vmovq %xmm1, %rdx
; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512F-NEXT: vmovq %xmm0, %rdi
; AVX512F-NEXT: movq %rdi, %r8
; AVX512F-NEXT: addq $-1, %r8
; AVX512F-NEXT: movq %rsi, %r9
; AVX512F-NEXT: adcq $-1, %r9
; AVX512F-NEXT: movq %rdx, %r10
; AVX512F-NEXT: adcq $-1, %r10
; AVX512F-NEXT: movq %rcx, %r11
; AVX512F-NEXT: adcq $-1, %r11
; AVX512F-NEXT: xorq %rcx, %r11
; AVX512F-NEXT: xorq %rdx, %r10
; AVX512F-NEXT: xorq %rsi, %r9
; AVX512F-NEXT: xorq %rdi, %r8
; AVX512F-NEXT: movq %r8, (%rax)
; AVX512F-NEXT: movq %r9, 8(%rax)
; AVX512F-NEXT: movq %r10, 16(%rax)
; AVX512F-NEXT: movq %r11, 24(%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: blsmsk_i256_vector:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rcx
; AVX512VL-NEXT: vmovq %xmm1, %rdx
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512VL-NEXT: vmovq %xmm0, %rdi
; AVX512VL-NEXT: movq %rdi, %r8
; AVX512VL-NEXT: addq $-1, %r8
; AVX512VL-NEXT: movq %rsi, %r9
; AVX512VL-NEXT: adcq $-1, %r9
; AVX512VL-NEXT: movq %rdx, %r10
; AVX512VL-NEXT: adcq $-1, %r10
; AVX512VL-NEXT: movq %rcx, %r11
; AVX512VL-NEXT: adcq $-1, %r11
; AVX512VL-NEXT: xorq %rcx, %r11
; AVX512VL-NEXT: xorq %rdx, %r10
; AVX512VL-NEXT: xorq %rsi, %r9
; AVX512VL-NEXT: xorq %rdi, %r8
; AVX512VL-NEXT: movq %r8, (%rax)
; AVX512VL-NEXT: movq %r9, 8(%rax)
; AVX512VL-NEXT: movq %r10, 16(%rax)
; AVX512VL-NEXT: movq %r11, 24(%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: blsmsk_i256_vector:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %rcx
; AVX512VBMI-NEXT: vmovq %xmm1, %rdx
; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512VBMI-NEXT: vmovq %xmm0, %rdi
; AVX512VBMI-NEXT: movq %rdi, %r8
; AVX512VBMI-NEXT: addq $-1, %r8
; AVX512VBMI-NEXT: movq %rsi, %r9
; AVX512VBMI-NEXT: adcq $-1, %r9
; AVX512VBMI-NEXT: movq %rdx, %r10
; AVX512VBMI-NEXT: adcq $-1, %r10
; AVX512VBMI-NEXT: movq %rcx, %r11
; AVX512VBMI-NEXT: adcq $-1, %r11
; AVX512VBMI-NEXT: xorq %rcx, %r11
; AVX512VBMI-NEXT: xorq %rdx, %r10
; AVX512VBMI-NEXT: xorq %rsi, %r9
; AVX512VBMI-NEXT: xorq %rdi, %r8
; AVX512VBMI-NEXT: movq %r8, (%rax)
; AVX512VBMI-NEXT: movq %r9, 8(%rax)
; AVX512VBMI-NEXT: movq %r10, 16(%rax)
; AVX512VBMI-NEXT: movq %r11, 24(%rax)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = bitcast <4 x i64> %v0 to i256
%dec = sub i256 %a0, 1
%res = xor i256 %a0, %dec
ret i256 %res
}

define i256 @blsmsk_i256_load(ptr %p0) nounwind {
; SSE-LABEL: blsmsk_i256_load:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movq 24(%rsi), %rcx
; SSE-NEXT: movq (%rsi), %rdx
; SSE-NEXT: movq 8(%rsi), %rdi
; SSE-NEXT: movq %rdx, %r8
; SSE-NEXT: addq $-1, %r8
; SSE-NEXT: movq %rdi, %r9
; SSE-NEXT: adcq $-1, %r9
; SSE-NEXT: movq 16(%rsi), %rsi
; SSE-NEXT: movq %rsi, %r10
; SSE-NEXT: adcq $-1, %r10
; SSE-NEXT: movq %rcx, %r11
; SSE-NEXT: adcq $-1, %r11
; SSE-NEXT: xorq %rcx, %r11
; SSE-NEXT: xorq %rsi, %r10
; SSE-NEXT: xorq %rdi, %r9
; SSE-NEXT: xorq %rdx, %r8
; SSE-NEXT: movq %r8, (%rax)
; SSE-NEXT: movq %r9, 8(%rax)
; SSE-NEXT: movq %r10, 16(%rax)
; SSE-NEXT: movq %r11, 24(%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: blsmsk_i256_load:
; AVX: # %bb.0:
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: movq 24(%rsi), %rcx
; AVX-NEXT: movq 16(%rsi), %rdx
; AVX-NEXT: movq (%rsi), %rdi
; AVX-NEXT: movq 8(%rsi), %rsi
; AVX-NEXT: movq %rdi, %r8
; AVX-NEXT: addq $-1, %r8
; AVX-NEXT: movq %rsi, %r9
; AVX-NEXT: adcq $-1, %r9
; AVX-NEXT: movq %rdx, %r10
; AVX-NEXT: adcq $-1, %r10
; AVX-NEXT: movq %rcx, %r11
; AVX-NEXT: adcq $-1, %r11
; AVX-NEXT: xorq %rcx, %r11
; AVX-NEXT: xorq %rdx, %r10
; AVX-NEXT: xorq %rsi, %r9
; AVX-NEXT: xorq %rdi, %r8
; AVX-NEXT: movq %r8, (%rax)
; AVX-NEXT: movq %r9, 8(%rax)
; AVX-NEXT: movq %r10, 16(%rax)
; AVX-NEXT: movq %r11, 24(%rax)
; AVX-NEXT: retq
%a0 = load i256, ptr %p0
%dec = sub i256 %a0, 1
%res = xor i256 %a0, %dec
ret i256 %res
}

define i256 @blsr_i256(i256 %a0) nounwind {
; SSE-LABEL: blsr_i256:
; SSE: # %bb.0:
; SSE-NEXT: movq %rsi, %r9
; SSE-NEXT: addq $-1, %r9
; SSE-NEXT: movq %rdx, %r10
; SSE-NEXT: adcq $-1, %r10
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movq %rcx, %rdi
; SSE-NEXT: adcq $-1, %rdi
; SSE-NEXT: movq %r8, %r11
; SSE-NEXT: adcq $-1, %r11
; SSE-NEXT: andq %r8, %r11
; SSE-NEXT: andq %rcx, %rdi
; SSE-NEXT: andq %rsi, %r9
; SSE-NEXT: andq %rdx, %r10
; SSE-NEXT: movq %r10, 8(%rax)
; SSE-NEXT: movq %r9, (%rax)
; SSE-NEXT: movq %rdi, 16(%rax)
; SSE-NEXT: movq %r11, 24(%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: blsr_i256:
; AVX: # %bb.0:
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: movq %rsi, %rdi
; AVX-NEXT: addq $-1, %rdi
; AVX-NEXT: movq %rdx, %r9
; AVX-NEXT: adcq $-1, %r9
; AVX-NEXT: movq %rcx, %r10
; AVX-NEXT: adcq $-1, %r10
; AVX-NEXT: movq %r8, %r11
; AVX-NEXT: adcq $-1, %r11
; AVX-NEXT: andq %r8, %r11
; AVX-NEXT: andq %rcx, %r10
; AVX-NEXT: andq %rsi, %rdi
; AVX-NEXT: andq %rdx, %r9
; AVX-NEXT: movq %r9, 8(%rax)
; AVX-NEXT: movq %rdi, (%rax)
; AVX-NEXT: movq %r10, 16(%rax)
; AVX-NEXT: movq %r11, 24(%rax)
; AVX-NEXT: retq
%dec = sub i256 %a0, 1
%res = and i256 %a0, %dec
ret i256 %res
}

define i256 @blsr_i256_vector(<4 x i64> %v0) nounwind {
; SSE2-LABEL: blsr_i256_vector:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rcx
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rdx
; SSE2-NEXT: movq %xmm0, %rsi
; SSE2-NEXT: movq %rsi, %rdi
; SSE2-NEXT: addq $-1, %rdi
; SSE2-NEXT: movq %rdx, %r8
; SSE2-NEXT: adcq $-1, %r8
; SSE2-NEXT: movq %xmm1, %r9
; SSE2-NEXT: movq %r9, %r10
; SSE2-NEXT: adcq $-1, %r10
; SSE2-NEXT: movq %rcx, %r11
; SSE2-NEXT: adcq $-1, %r11
; SSE2-NEXT: andq %rcx, %r11
; SSE2-NEXT: andq %r9, %r10
; SSE2-NEXT: andq %rdx, %r8
; SSE2-NEXT: andq %rsi, %rdi
; SSE2-NEXT: movq %rdi, (%rax)
; SSE2-NEXT: movq %r8, 8(%rax)
; SSE2-NEXT: movq %r10, 16(%rax)
; SSE2-NEXT: movq %r11, 24(%rax)
; SSE2-NEXT: retq
;
; SSE42-LABEL: blsr_i256_vector:
; SSE42: # %bb.0:
; SSE42-NEXT: movq %rdi, %rax
; SSE42-NEXT: pextrq $1, %xmm1, %rcx
; SSE42-NEXT: pextrq $1, %xmm0, %rdx
; SSE42-NEXT: movq %xmm0, %rsi
; SSE42-NEXT: movq %rsi, %rdi
; SSE42-NEXT: addq $-1, %rdi
; SSE42-NEXT: movq %rdx, %r8
; SSE42-NEXT: adcq $-1, %r8
; SSE42-NEXT: movq %xmm1, %r9
; SSE42-NEXT: movq %r9, %r10
; SSE42-NEXT: adcq $-1, %r10
; SSE42-NEXT: movq %rcx, %r11
; SSE42-NEXT: adcq $-1, %r11
; SSE42-NEXT: andq %rcx, %r11
; SSE42-NEXT: andq %r9, %r10
; SSE42-NEXT: andq %rdx, %r8
; SSE42-NEXT: andq %rsi, %rdi
; SSE42-NEXT: movq %rdi, (%rax)
; SSE42-NEXT: movq %r8, 8(%rax)
; SSE42-NEXT: movq %r10, 16(%rax)
; SSE42-NEXT: movq %r11, 24(%rax)
; SSE42-NEXT: retq
;
; AVX2-LABEL: blsr_i256_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vmovq %xmm1, %rdx
; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
; AVX2-NEXT: vmovq %xmm0, %rdi
; AVX2-NEXT: movq %rdi, %r8
; AVX2-NEXT: addq $-1, %r8
; AVX2-NEXT: movq %rsi, %r9
; AVX2-NEXT: adcq $-1, %r9
; AVX2-NEXT: movq %rdx, %r10
; AVX2-NEXT: adcq $-1, %r10
; AVX2-NEXT: movq %rcx, %r11
; AVX2-NEXT: adcq $-1, %r11
; AVX2-NEXT: andq %rcx, %r11
; AVX2-NEXT: andq %rdx, %r10
; AVX2-NEXT: andq %rsi, %r9
; AVX2-NEXT: andq %rdi, %r8
; AVX2-NEXT: movq %r8, (%rax)
; AVX2-NEXT: movq %r9, 8(%rax)
; AVX2-NEXT: movq %r10, 16(%rax)
; AVX2-NEXT: movq %r11, 24(%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: blsr_i256_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vmovq %xmm1, %rdx
; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512F-NEXT: vmovq %xmm0, %rdi
; AVX512F-NEXT: movq %rdi, %r8
; AVX512F-NEXT: addq $-1, %r8
; AVX512F-NEXT: movq %rsi, %r9
; AVX512F-NEXT: adcq $-1, %r9
; AVX512F-NEXT: movq %rdx, %r10
; AVX512F-NEXT: adcq $-1, %r10
; AVX512F-NEXT: movq %rcx, %r11
; AVX512F-NEXT: adcq $-1, %r11
; AVX512F-NEXT: andq %rcx, %r11
; AVX512F-NEXT: andq %rdx, %r10
; AVX512F-NEXT: andq %rsi, %r9
; AVX512F-NEXT: andq %rdi, %r8
; AVX512F-NEXT: movq %r8, (%rax)
; AVX512F-NEXT: movq %r9, 8(%rax)
; AVX512F-NEXT: movq %r10, 16(%rax)
; AVX512F-NEXT: movq %r11, 24(%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: blsr_i256_vector:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rcx
; AVX512VL-NEXT: vmovq %xmm1, %rdx
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512VL-NEXT: vmovq %xmm0, %rdi
; AVX512VL-NEXT: movq %rdi, %r8
; AVX512VL-NEXT: addq $-1, %r8
; AVX512VL-NEXT: movq %rsi, %r9
; AVX512VL-NEXT: adcq $-1, %r9
; AVX512VL-NEXT: movq %rdx, %r10
; AVX512VL-NEXT: adcq $-1, %r10
; AVX512VL-NEXT: movq %rcx, %r11
; AVX512VL-NEXT: adcq $-1, %r11
; AVX512VL-NEXT: andq %rcx, %r11
; AVX512VL-NEXT: andq %rdx, %r10
; AVX512VL-NEXT: andq %rsi, %r9
; AVX512VL-NEXT: andq %rdi, %r8
; AVX512VL-NEXT: movq %r8, (%rax)
; AVX512VL-NEXT: movq %r9, 8(%rax)
; AVX512VL-NEXT: movq %r10, 16(%rax)
; AVX512VL-NEXT: movq %r11, 24(%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: blsr_i256_vector:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
|
|
; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %rcx
|
|
; AVX512VBMI-NEXT: vmovq %xmm1, %rdx
|
|
; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rsi
|
|
; AVX512VBMI-NEXT: vmovq %xmm0, %rdi
|
|
; AVX512VBMI-NEXT: movq %rdi, %r8
|
|
; AVX512VBMI-NEXT: addq $-1, %r8
|
|
; AVX512VBMI-NEXT: movq %rsi, %r9
|
|
; AVX512VBMI-NEXT: adcq $-1, %r9
|
|
; AVX512VBMI-NEXT: movq %rdx, %r10
|
|
; AVX512VBMI-NEXT: adcq $-1, %r10
|
|
; AVX512VBMI-NEXT: movq %rcx, %r11
|
|
; AVX512VBMI-NEXT: adcq $-1, %r11
|
|
; AVX512VBMI-NEXT: andq %rcx, %r11
|
|
; AVX512VBMI-NEXT: andq %rdx, %r10
|
|
; AVX512VBMI-NEXT: andq %rsi, %r9
|
|
; AVX512VBMI-NEXT: andq %rdi, %r8
|
|
; AVX512VBMI-NEXT: movq %r8, (%rax)
|
|
; AVX512VBMI-NEXT: movq %r9, 8(%rax)
|
|
; AVX512VBMI-NEXT: movq %r10, 16(%rax)
|
|
; AVX512VBMI-NEXT: movq %r11, 24(%rax)
|
|
; AVX512VBMI-NEXT: vzeroupper
|
|
; AVX512VBMI-NEXT: retq
|
|
%a0 = bitcast <4 x i64> %v0 to i256
|
|
%dec = sub i256 %a0, 1
|
|
%res = and i256 %a0, %dec
|
|
ret i256 %res
|
|
}
|
|
|
|
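; Same BLSR pattern with the i256 value loaded from memory.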
define i256 @blsr_i256_load(ptr %p0) nounwind {
; SSE-LABEL: blsr_i256_load:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movq 24(%rsi), %rcx
; SSE-NEXT: movq (%rsi), %rdx
; SSE-NEXT: movq 8(%rsi), %rdi
; SSE-NEXT: movq %rdx, %r8
; SSE-NEXT: addq $-1, %r8
; SSE-NEXT: movq %rdi, %r9
; SSE-NEXT: adcq $-1, %r9
; SSE-NEXT: movq 16(%rsi), %rsi
; SSE-NEXT: movq %rsi, %r10
; SSE-NEXT: adcq $-1, %r10
; SSE-NEXT: movq %rcx, %r11
; SSE-NEXT: adcq $-1, %r11
; SSE-NEXT: andq %rcx, %r11
; SSE-NEXT: andq %rsi, %r10
; SSE-NEXT: andq %rdi, %r9
; SSE-NEXT: andq %rdx, %r8
; SSE-NEXT: movq %r8, (%rax)
; SSE-NEXT: movq %r9, 8(%rax)
; SSE-NEXT: movq %r10, 16(%rax)
; SSE-NEXT: movq %r11, 24(%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: blsr_i256_load:
; AVX: # %bb.0:
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: movq 24(%rsi), %rcx
; AVX-NEXT: movq 16(%rsi), %rdx
; AVX-NEXT: movq (%rsi), %rdi
; AVX-NEXT: movq 8(%rsi), %rsi
; AVX-NEXT: movq %rdi, %r8
; AVX-NEXT: addq $-1, %r8
; AVX-NEXT: movq %rsi, %r9
; AVX-NEXT: adcq $-1, %r9
; AVX-NEXT: movq %rdx, %r10
; AVX-NEXT: adcq $-1, %r10
; AVX-NEXT: movq %rcx, %r11
; AVX-NEXT: adcq $-1, %r11
; AVX-NEXT: andq %rcx, %r11
; AVX-NEXT: andq %rdx, %r10
; AVX-NEXT: andq %rsi, %r9
; AVX-NEXT: andq %rdi, %r8
; AVX-NEXT: movq %r8, (%rax)
; AVX-NEXT: movq %r9, 8(%rax)
; AVX-NEXT: movq %r10, 16(%rax)
; AVX-NEXT: movq %r11, 24(%rax)
; AVX-NEXT: retq
%a0 = load i256, ptr %p0
%dec = sub i256 %a0, 1
%res = and i256 %a0, %dec
ret i256 %res
}

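; Zero high bits from bit %idx upward: x & ((1 << %idx) - 1) (BZHI), expanded for i256 via a variable shift of the constant 1 on the stack.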
define i256 @bzhi_i256(i256 %a0, i256 %idx) nounwind {
; SSE-LABEL: bzhi_i256:
; SSE: # %bb.0:
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movl %r9d, %ecx
; SSE-NEXT: shrb $3, %cl
; SSE-NEXT: andb $24, %cl
; SSE-NEXT: negb %cl
; SSE-NEXT: movsbq %cl, %r14
; SSE-NEXT: movq -24(%rsp,%r14), %r10
; SSE-NEXT: movq -16(%rsp,%r14), %r11
; SSE-NEXT: movl %r9d, %ecx
; SSE-NEXT: shldq %cl, %r10, %r11
; SSE-NEXT: movq -32(%rsp,%r14), %rbx
; SSE-NEXT: shldq %cl, %rbx, %r10
; SSE-NEXT: movq -40(%rsp,%r14), %r14
; SSE-NEXT: shldq %cl, %r14, %rbx
; SSE-NEXT: shlq %cl, %r14
; SSE-NEXT: addq $-1, %r14
; SSE-NEXT: adcq $-1, %rbx
; SSE-NEXT: adcq $-1, %r10
; SSE-NEXT: adcq $-1, %r11
; SSE-NEXT: andq %r8, %r11
; SSE-NEXT: andq %rax, %r10
; SSE-NEXT: andq %rdx, %rbx
; SSE-NEXT: andq %rsi, %r14
; SSE-NEXT: movq %r14, (%rdi)
; SSE-NEXT: movq %rbx, 8(%rdi)
; SSE-NEXT: movq %r10, 16(%rdi)
; SSE-NEXT: movq %r11, 24(%rdi)
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
; SSE-NEXT: retq
;
; AVX2-LABEL: bzhi_i256:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %r9d, %ecx
; AVX2-NEXT: shrb $3, %cl
; AVX2-NEXT: andb $24, %cl
; AVX2-NEXT: negb %cl
; AVX2-NEXT: movsbq %cl, %rbx
; AVX2-NEXT: movq -24(%rsp,%rbx), %r10
; AVX2-NEXT: movq -16(%rsp,%rbx), %r11
; AVX2-NEXT: movl %r9d, %ecx
; AVX2-NEXT: shldq %cl, %r10, %r11
; AVX2-NEXT: movq -40(%rsp,%rbx), %r14
; AVX2-NEXT: movq -32(%rsp,%rbx), %rbx
; AVX2-NEXT: shldq %cl, %rbx, %r10
; AVX2-NEXT: shldq %cl, %r14, %rbx
; AVX2-NEXT: shlxq %r9, %r14, %rcx
; AVX2-NEXT: addq $-1, %rcx
; AVX2-NEXT: adcq $-1, %rbx
; AVX2-NEXT: adcq $-1, %r10
; AVX2-NEXT: adcq $-1, %r11
; AVX2-NEXT: andq %r8, %r11
; AVX2-NEXT: andq %rax, %r10
; AVX2-NEXT: andq %rdx, %rbx
; AVX2-NEXT: andq %rsi, %rcx
; AVX2-NEXT: movq %rcx, (%rdi)
; AVX2-NEXT: movq %rbx, 8(%rdi)
; AVX2-NEXT: movq %r10, 16(%rdi)
; AVX2-NEXT: movq %r11, 24(%rdi)
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: bzhi_i256:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: movq %rcx, %rax
; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,1,0,0,0]
; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %r9d, %ecx
; AVX512F-NEXT: shrb $3, %cl
; AVX512F-NEXT: andb $24, %cl
; AVX512F-NEXT: negb %cl
; AVX512F-NEXT: movsbq %cl, %rbx
; AVX512F-NEXT: movq -24(%rsp,%rbx), %r10
; AVX512F-NEXT: movq -16(%rsp,%rbx), %r11
; AVX512F-NEXT: movl %r9d, %ecx
; AVX512F-NEXT: shldq %cl, %r10, %r11
; AVX512F-NEXT: movq -40(%rsp,%rbx), %r14
; AVX512F-NEXT: movq -32(%rsp,%rbx), %rbx
; AVX512F-NEXT: shldq %cl, %rbx, %r10
; AVX512F-NEXT: shldq %cl, %r14, %rbx
; AVX512F-NEXT: shlxq %r9, %r14, %rcx
; AVX512F-NEXT: addq $-1, %rcx
; AVX512F-NEXT: adcq $-1, %rbx
; AVX512F-NEXT: adcq $-1, %r10
; AVX512F-NEXT: adcq $-1, %r11
; AVX512F-NEXT: andq %r8, %r11
; AVX512F-NEXT: andq %rax, %r10
; AVX512F-NEXT: andq %rdx, %rbx
; AVX512F-NEXT: andq %rsi, %rcx
; AVX512F-NEXT: movq %rcx, (%rdi)
; AVX512F-NEXT: movq %rbx, 8(%rdi)
; AVX512F-NEXT: movq %r10, 16(%rdi)
; AVX512F-NEXT: movq %r11, 24(%rdi)
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bzhi_i256:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: movq %rcx, %rax
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %r9d, %ecx
; AVX512VL-NEXT: shrb $3, %cl
; AVX512VL-NEXT: andb $24, %cl
; AVX512VL-NEXT: negb %cl
; AVX512VL-NEXT: movsbq %cl, %rbx
; AVX512VL-NEXT: movq -24(%rsp,%rbx), %r10
; AVX512VL-NEXT: movq -16(%rsp,%rbx), %r11
; AVX512VL-NEXT: movl %r9d, %ecx
; AVX512VL-NEXT: shldq %cl, %r10, %r11
; AVX512VL-NEXT: movq -32(%rsp,%rbx), %r14
; AVX512VL-NEXT: shldq %cl, %r14, %r10
; AVX512VL-NEXT: movq -40(%rsp,%rbx), %rbx
; AVX512VL-NEXT: shldq %cl, %rbx, %r14
; AVX512VL-NEXT: shlxq %r9, %rbx, %rcx
; AVX512VL-NEXT: addq $-1, %rcx
; AVX512VL-NEXT: adcq $-1, %r14
; AVX512VL-NEXT: adcq $-1, %r10
; AVX512VL-NEXT: adcq $-1, %r11
; AVX512VL-NEXT: andq %r8, %r11
; AVX512VL-NEXT: andq %rax, %r10
; AVX512VL-NEXT: andq %rdx, %r14
; AVX512VL-NEXT: andq %rsi, %rcx
; AVX512VL-NEXT: movq %rcx, (%rdi)
; AVX512VL-NEXT: movq %r14, 8(%rdi)
; AVX512VL-NEXT: movq %r10, 16(%rdi)
; AVX512VL-NEXT: movq %r11, 24(%rdi)
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bzhi_i256:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: pushq %r14
; AVX512VBMI-NEXT: pushq %rbx
; AVX512VBMI-NEXT: movq %rcx, %rax
; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %r9d, %ecx
; AVX512VBMI-NEXT: shrb $3, %cl
; AVX512VBMI-NEXT: andb $24, %cl
; AVX512VBMI-NEXT: negb %cl
; AVX512VBMI-NEXT: movsbq %cl, %rbx
; AVX512VBMI-NEXT: movq -24(%rsp,%rbx), %r10
; AVX512VBMI-NEXT: movq -16(%rsp,%rbx), %r11
; AVX512VBMI-NEXT: movl %r9d, %ecx
; AVX512VBMI-NEXT: shldq %cl, %r10, %r11
; AVX512VBMI-NEXT: movq -32(%rsp,%rbx), %r14
; AVX512VBMI-NEXT: shldq %cl, %r14, %r10
; AVX512VBMI-NEXT: movq -40(%rsp,%rbx), %rbx
; AVX512VBMI-NEXT: shldq %cl, %rbx, %r14
; AVX512VBMI-NEXT: shlxq %r9, %rbx, %rcx
; AVX512VBMI-NEXT: addq $-1, %rcx
; AVX512VBMI-NEXT: adcq $-1, %r14
; AVX512VBMI-NEXT: adcq $-1, %r10
; AVX512VBMI-NEXT: adcq $-1, %r11
; AVX512VBMI-NEXT: andq %r8, %r11
; AVX512VBMI-NEXT: andq %rax, %r10
; AVX512VBMI-NEXT: andq %rdx, %r14
; AVX512VBMI-NEXT: andq %rsi, %rcx
; AVX512VBMI-NEXT: movq %rcx, (%rdi)
; AVX512VBMI-NEXT: movq %r14, 8(%rdi)
; AVX512VBMI-NEXT: movq %r10, 16(%rdi)
; AVX512VBMI-NEXT: movq %r11, 24(%rdi)
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: popq %r14
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%bit = shl i256 1, %idx
%msk = sub i256 %bit, 1
%res = and i256 %a0, %msk
ret i256 %res
}

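; Same BZHI pattern with the i256 value bitcast from a <4 x i64> vector.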
define i256 @bzhi_i256_vector(<4 x i64> %v0, i256 %idx) nounwind {
; SSE2-LABEL: bzhi_i256_vector:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rsi, %rcx
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movq $1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movl %ecx, %eax
; SSE2-NEXT: shrb $3, %al
; SSE2-NEXT: andb $24, %al
; SSE2-NEXT: negb %al
; SSE2-NEXT: movsbq %al, %r8
; SSE2-NEXT: movq -24(%rsp,%r8), %rdx
; SSE2-NEXT: movq -16(%rsp,%r8), %rsi
; SSE2-NEXT: shldq %cl, %rdx, %rsi
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: movq -32(%rsp,%r8), %rdi
; SSE2-NEXT: shldq %cl, %rdi, %rdx
; SSE2-NEXT: movq %xmm0, %r9
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq -40(%rsp,%r8), %r8
; SSE2-NEXT: shldq %cl, %r8, %rdi
; SSE2-NEXT: movq %xmm0, %r10
; SSE2-NEXT: movq %xmm1, %r11
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: # kill: def $cl killed $cl killed $rcx
; SSE2-NEXT: shlq %cl, %r8
; SSE2-NEXT: addq $-1, %r8
; SSE2-NEXT: adcq $-1, %rdi
; SSE2-NEXT: adcq $-1, %rdx
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: adcq $-1, %rsi
; SSE2-NEXT: andq %rcx, %rsi
; SSE2-NEXT: andq %r11, %rdx
; SSE2-NEXT: andq %r10, %rdi
; SSE2-NEXT: andq %r9, %r8
; SSE2-NEXT: movq %r8, (%rax)
; SSE2-NEXT: movq %rdi, 8(%rax)
; SSE2-NEXT: movq %rdx, 16(%rax)
; SSE2-NEXT: movq %rsi, 24(%rax)
; SSE2-NEXT: retq
;
; SSE42-LABEL: bzhi_i256_vector:
; SSE42: # %bb.0:
; SSE42-NEXT: movq %rsi, %rcx
; SSE42-NEXT: xorps %xmm2, %xmm2
; SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: movq $1, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: movl %ecx, %eax
; SSE42-NEXT: shrb $3, %al
; SSE42-NEXT: andb $24, %al
; SSE42-NEXT: negb %al
; SSE42-NEXT: movsbq %al, %r8
; SSE42-NEXT: movq -24(%rsp,%r8), %rdx
; SSE42-NEXT: movq -16(%rsp,%r8), %rsi
; SSE42-NEXT: shldq %cl, %rdx, %rsi
; SSE42-NEXT: movq %rdi, %rax
; SSE42-NEXT: movq -32(%rsp,%r8), %rdi
; SSE42-NEXT: shldq %cl, %rdi, %rdx
; SSE42-NEXT: pextrq $1, %xmm0, %r9
; SSE42-NEXT: movq -40(%rsp,%r8), %r8
; SSE42-NEXT: shldq %cl, %r8, %rdi
; SSE42-NEXT: movq %xmm0, %r10
; SSE42-NEXT: pextrq $1, %xmm1, %r11
; SSE42-NEXT: # kill: def $cl killed $cl killed $rcx
; SSE42-NEXT: shlq %cl, %r8
; SSE42-NEXT: addq $-1, %r8
; SSE42-NEXT: adcq $-1, %rdi
; SSE42-NEXT: adcq $-1, %rdx
; SSE42-NEXT: movq %xmm1, %rcx
; SSE42-NEXT: adcq $-1, %rsi
; SSE42-NEXT: andq %r11, %rsi
; SSE42-NEXT: andq %rcx, %rdx
; SSE42-NEXT: andq %r9, %rdi
; SSE42-NEXT: andq %r10, %r8
; SSE42-NEXT: movq %r8, (%rax)
; SSE42-NEXT: movq %rdi, 8(%rax)
; SSE42-NEXT: movq %rdx, 16(%rax)
; SSE42-NEXT: movq %rsi, 24(%rax)
; SSE42-NEXT: retq
;
; AVX2-LABEL: bzhi_i256_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movq %rsi, %rcx
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vmovq %xmm0, %rdx
; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rdi
; AVX2-NEXT: movl %ecx, %r8d
; AVX2-NEXT: shrb $3, %r8b
; AVX2-NEXT: andb $24, %r8b
; AVX2-NEXT: negb %r8b
; AVX2-NEXT: movsbq %r8b, %r10
; AVX2-NEXT: movq -16(%rsp,%r10), %r8
; AVX2-NEXT: movq -8(%rsp,%r10), %r9
; AVX2-NEXT: shldq %cl, %r8, %r9
; AVX2-NEXT: vpextrq $1, %xmm0, %r11
; AVX2-NEXT: movq -32(%rsp,%r10), %rbx
; AVX2-NEXT: movq -24(%rsp,%r10), %r10
; AVX2-NEXT: shldq %cl, %r10, %r8
; AVX2-NEXT: shldq %cl, %rbx, %r10
; AVX2-NEXT: shlxq %rcx, %rbx, %rcx
; AVX2-NEXT: addq $-1, %rcx
; AVX2-NEXT: adcq $-1, %r10
; AVX2-NEXT: adcq $-1, %r8
; AVX2-NEXT: adcq $-1, %r9
; AVX2-NEXT: andq %r11, %r9
; AVX2-NEXT: andq %rdi, %r8
; AVX2-NEXT: andq %rsi, %r10
; AVX2-NEXT: andq %rdx, %rcx
; AVX2-NEXT: movq %rcx, (%rax)
; AVX2-NEXT: movq %r10, 8(%rax)
; AVX2-NEXT: movq %r8, 16(%rax)
; AVX2-NEXT: movq %r9, 24(%rax)
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: bzhi_i256_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,0,0,0]
; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rsi, %rcx
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vmovq %xmm0, %rdx
; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rdi
; AVX512F-NEXT: movl %ecx, %r8d
; AVX512F-NEXT: shrb $3, %r8b
; AVX512F-NEXT: andb $24, %r8b
; AVX512F-NEXT: negb %r8b
; AVX512F-NEXT: movsbq %r8b, %r10
; AVX512F-NEXT: movq -16(%rsp,%r10), %r8
; AVX512F-NEXT: movq -8(%rsp,%r10), %r9
; AVX512F-NEXT: shldq %cl, %r8, %r9
; AVX512F-NEXT: vpextrq $1, %xmm0, %r11
; AVX512F-NEXT: movq -32(%rsp,%r10), %rbx
; AVX512F-NEXT: movq -24(%rsp,%r10), %r10
; AVX512F-NEXT: shldq %cl, %r10, %r8
; AVX512F-NEXT: shldq %cl, %rbx, %r10
; AVX512F-NEXT: shlxq %rcx, %rbx, %rcx
; AVX512F-NEXT: addq $-1, %rcx
; AVX512F-NEXT: adcq $-1, %r10
; AVX512F-NEXT: adcq $-1, %r8
; AVX512F-NEXT: adcq $-1, %r9
; AVX512F-NEXT: andq %r11, %r9
; AVX512F-NEXT: andq %rdi, %r8
; AVX512F-NEXT: andq %rsi, %r10
; AVX512F-NEXT: andq %rdx, %rcx
; AVX512F-NEXT: movq %rcx, (%rax)
; AVX512F-NEXT: movq %r10, 8(%rax)
; AVX512F-NEXT: movq %r8, 16(%rax)
; AVX512F-NEXT: movq %r9, 24(%rax)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bzhi_i256_vector:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: movq %rsi, %rcx
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [1,0]
; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovq %xmm0, %rdx
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdi
; AVX512VL-NEXT: movl %ecx, %r8d
; AVX512VL-NEXT: shrb $3, %r8b
; AVX512VL-NEXT: andb $24, %r8b
; AVX512VL-NEXT: negb %r8b
; AVX512VL-NEXT: movsbq %r8b, %r10
; AVX512VL-NEXT: movq -16(%rsp,%r10), %r8
; AVX512VL-NEXT: movq -8(%rsp,%r10), %r9
; AVX512VL-NEXT: shldq %cl, %r8, %r9
; AVX512VL-NEXT: vmovq %xmm0, %r11
; AVX512VL-NEXT: movq -24(%rsp,%r10), %rbx
; AVX512VL-NEXT: shldq %cl, %rbx, %r8
; AVX512VL-NEXT: movq -32(%rsp,%r10), %r10
; AVX512VL-NEXT: shldq %cl, %r10, %rbx
; AVX512VL-NEXT: shlxq %rcx, %r10, %rcx
; AVX512VL-NEXT: addq $-1, %rcx
; AVX512VL-NEXT: adcq $-1, %rbx
; AVX512VL-NEXT: adcq $-1, %r8
; AVX512VL-NEXT: adcq $-1, %r9
; AVX512VL-NEXT: andq %rdi, %r9
; AVX512VL-NEXT: andq %r11, %r8
; AVX512VL-NEXT: andq %rsi, %rbx
; AVX512VL-NEXT: andq %rdx, %rcx
; AVX512VL-NEXT: movq %rcx, (%rax)
; AVX512VL-NEXT: movq %rbx, 8(%rax)
; AVX512VL-NEXT: movq %r8, 16(%rax)
; AVX512VL-NEXT: movq %r9, 24(%rax)
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bzhi_i256_vector:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: pushq %rbx
; AVX512VBMI-NEXT: movq %rsi, %rcx
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm1 = [1,0]
; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovq %xmm0, %rdx
; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rdi
; AVX512VBMI-NEXT: movl %ecx, %r8d
; AVX512VBMI-NEXT: shrb $3, %r8b
; AVX512VBMI-NEXT: andb $24, %r8b
; AVX512VBMI-NEXT: negb %r8b
; AVX512VBMI-NEXT: movsbq %r8b, %r10
; AVX512VBMI-NEXT: movq -16(%rsp,%r10), %r8
; AVX512VBMI-NEXT: movq -8(%rsp,%r10), %r9
; AVX512VBMI-NEXT: shldq %cl, %r8, %r9
; AVX512VBMI-NEXT: vmovq %xmm0, %r11
; AVX512VBMI-NEXT: movq -24(%rsp,%r10), %rbx
; AVX512VBMI-NEXT: shldq %cl, %rbx, %r8
; AVX512VBMI-NEXT: movq -32(%rsp,%r10), %r10
; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx
; AVX512VBMI-NEXT: shlxq %rcx, %r10, %rcx
; AVX512VBMI-NEXT: addq $-1, %rcx
; AVX512VBMI-NEXT: adcq $-1, %rbx
; AVX512VBMI-NEXT: adcq $-1, %r8
; AVX512VBMI-NEXT: adcq $-1, %r9
; AVX512VBMI-NEXT: andq %rdi, %r9
; AVX512VBMI-NEXT: andq %r11, %r8
; AVX512VBMI-NEXT: andq %rsi, %rbx
; AVX512VBMI-NEXT: andq %rdx, %rcx
; AVX512VBMI-NEXT: movq %rcx, (%rax)
; AVX512VBMI-NEXT: movq %rbx, 8(%rax)
; AVX512VBMI-NEXT: movq %r8, 16(%rax)
; AVX512VBMI-NEXT: movq %r9, 24(%rax)
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = bitcast <4 x i64> %v0 to i256
%bit = shl i256 1, %idx
%msk = sub i256 %bit, 1
%res = and i256 %a0, %msk
ret i256 %res
}

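; Same BZHI pattern with the i256 value loaded from memory.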
define i256 @bzhi_i256_load(ptr %p0, i256 %idx) nounwind {
; SSE-LABEL: bzhi_i256_load:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdx, %rcx
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movl %ecx, %eax
; SSE-NEXT: shrb $3, %al
; SSE-NEXT: andb $24, %al
; SSE-NEXT: negb %al
; SSE-NEXT: movsbq %al, %rax
; SSE-NEXT: movq -24(%rsp,%rax), %rdx
; SSE-NEXT: movq -16(%rsp,%rax), %r8
; SSE-NEXT: shldq %cl, %rdx, %r8
; SSE-NEXT: movq -32(%rsp,%rax), %r9
; SSE-NEXT: shldq %cl, %r9, %rdx
; SSE-NEXT: movq -40(%rsp,%rax), %r10
; SSE-NEXT: shldq %cl, %r10, %r9
; SSE-NEXT: # kill: def $cl killed $cl killed $rcx
; SSE-NEXT: shlq %cl, %r10
; SSE-NEXT: addq $-1, %r10
; SSE-NEXT: adcq $-1, %r9
; SSE-NEXT: adcq $-1, %rdx
; SSE-NEXT: adcq $-1, %r8
; SSE-NEXT: andq 24(%rsi), %r8
; SSE-NEXT: andq 16(%rsi), %rdx
; SSE-NEXT: andq 8(%rsi), %r9
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: andq (%rsi), %r10
; SSE-NEXT: movq %r10, (%rdi)
; SSE-NEXT: movq %r9, 8(%rdi)
; SSE-NEXT: movq %rdx, 16(%rdi)
; SSE-NEXT: movq %r8, 24(%rdi)
; SSE-NEXT: retq
;
; AVX2-LABEL: bzhi_i256_load:
; AVX2: # %bb.0:
; AVX2-NEXT: movq %rdx, %rcx
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: shrb $3, %al
; AVX2-NEXT: andb $24, %al
; AVX2-NEXT: negb %al
; AVX2-NEXT: movsbq %al, %r9
; AVX2-NEXT: movq -24(%rsp,%r9), %rdx
; AVX2-NEXT: movq -16(%rsp,%r9), %r8
; AVX2-NEXT: shldq %cl, %rdx, %r8
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: movq -40(%rsp,%r9), %rdi
; AVX2-NEXT: movq -32(%rsp,%r9), %r9
; AVX2-NEXT: shldq %cl, %r9, %rdx
; AVX2-NEXT: shldq %cl, %rdi, %r9
; AVX2-NEXT: shlxq %rcx, %rdi, %rcx
; AVX2-NEXT: addq $-1, %rcx
; AVX2-NEXT: adcq $-1, %r9
; AVX2-NEXT: adcq $-1, %rdx
; AVX2-NEXT: adcq $-1, %r8
; AVX2-NEXT: andq 24(%rsi), %r8
; AVX2-NEXT: andq 16(%rsi), %rdx
; AVX2-NEXT: andq 8(%rsi), %r9
; AVX2-NEXT: andq (%rsi), %rcx
; AVX2-NEXT: movq %rcx, (%rax)
; AVX2-NEXT: movq %r9, 8(%rax)
; AVX2-NEXT: movq %rdx, 16(%rax)
; AVX2-NEXT: movq %r8, 24(%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: bzhi_i256_load:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movq %rdx, %rcx
; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,1,0,0,0]
; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %ecx, %eax
; AVX512F-NEXT: shrb $3, %al
; AVX512F-NEXT: andb $24, %al
; AVX512F-NEXT: negb %al
; AVX512F-NEXT: movsbq %al, %r9
; AVX512F-NEXT: movq -24(%rsp,%r9), %rdx
; AVX512F-NEXT: movq -16(%rsp,%r9), %r8
; AVX512F-NEXT: shldq %cl, %rdx, %r8
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: movq -40(%rsp,%r9), %rdi
; AVX512F-NEXT: movq -32(%rsp,%r9), %r9
; AVX512F-NEXT: shldq %cl, %r9, %rdx
; AVX512F-NEXT: shldq %cl, %rdi, %r9
; AVX512F-NEXT: shlxq %rcx, %rdi, %rcx
; AVX512F-NEXT: addq $-1, %rcx
; AVX512F-NEXT: adcq $-1, %r9
; AVX512F-NEXT: adcq $-1, %rdx
; AVX512F-NEXT: adcq $-1, %r8
; AVX512F-NEXT: andq 24(%rsi), %r8
; AVX512F-NEXT: andq 16(%rsi), %rdx
; AVX512F-NEXT: andq 8(%rsi), %r9
; AVX512F-NEXT: andq (%rsi), %rcx
; AVX512F-NEXT: movq %rcx, (%rax)
; AVX512F-NEXT: movq %r9, 8(%rax)
; AVX512F-NEXT: movq %rdx, 16(%rax)
; AVX512F-NEXT: movq %r8, 24(%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bzhi_i256_load:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movq %rdx, %rcx
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %ecx, %eax
; AVX512VL-NEXT: shrb $3, %al
; AVX512VL-NEXT: andb $24, %al
; AVX512VL-NEXT: negb %al
; AVX512VL-NEXT: movsbq %al, %rax
; AVX512VL-NEXT: movq -24(%rsp,%rax), %rdx
; AVX512VL-NEXT: movq -16(%rsp,%rax), %r8
; AVX512VL-NEXT: shldq %cl, %rdx, %r8
; AVX512VL-NEXT: movq -32(%rsp,%rax), %r9
; AVX512VL-NEXT: shldq %cl, %r9, %rdx
; AVX512VL-NEXT: movq -40(%rsp,%rax), %r10
; AVX512VL-NEXT: shldq %cl, %r10, %r9
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: shlxq %rcx, %r10, %rcx
; AVX512VL-NEXT: addq $-1, %rcx
; AVX512VL-NEXT: adcq $-1, %r9
; AVX512VL-NEXT: adcq $-1, %rdx
; AVX512VL-NEXT: adcq $-1, %r8
; AVX512VL-NEXT: andq 24(%rsi), %r8
; AVX512VL-NEXT: andq 16(%rsi), %rdx
; AVX512VL-NEXT: andq 8(%rsi), %r9
; AVX512VL-NEXT: andq (%rsi), %rcx
; AVX512VL-NEXT: movq %rcx, (%rdi)
; AVX512VL-NEXT: movq %r9, 8(%rdi)
; AVX512VL-NEXT: movq %rdx, 16(%rdi)
; AVX512VL-NEXT: movq %r8, 24(%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bzhi_i256_load:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movq %rdx, %rcx
; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %ecx, %eax
; AVX512VBMI-NEXT: shrb $3, %al
; AVX512VBMI-NEXT: andb $24, %al
; AVX512VBMI-NEXT: negb %al
; AVX512VBMI-NEXT: movsbq %al, %rax
; AVX512VBMI-NEXT: movq -24(%rsp,%rax), %rdx
; AVX512VBMI-NEXT: movq -16(%rsp,%rax), %r8
; AVX512VBMI-NEXT: shldq %cl, %rdx, %r8
; AVX512VBMI-NEXT: movq -32(%rsp,%rax), %r9
; AVX512VBMI-NEXT: shldq %cl, %r9, %rdx
; AVX512VBMI-NEXT: movq -40(%rsp,%rax), %r10
; AVX512VBMI-NEXT: shldq %cl, %r10, %r9
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: shlxq %rcx, %r10, %rcx
; AVX512VBMI-NEXT: addq $-1, %rcx
; AVX512VBMI-NEXT: adcq $-1, %r9
; AVX512VBMI-NEXT: adcq $-1, %rdx
; AVX512VBMI-NEXT: adcq $-1, %r8
; AVX512VBMI-NEXT: andq 24(%rsi), %r8
; AVX512VBMI-NEXT: andq 16(%rsi), %rdx
; AVX512VBMI-NEXT: andq 8(%rsi), %r9
; AVX512VBMI-NEXT: andq (%rsi), %rcx
; AVX512VBMI-NEXT: movq %rcx, (%rdi)
; AVX512VBMI-NEXT: movq %r9, 8(%rdi)
; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
; AVX512VBMI-NEXT: movq %r8, 24(%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i256, ptr %p0
%bit = shl i256 1, %idx
%msk = sub i256 %bit, 1
%res = and i256 %a0, %msk
ret i256 %res
}

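; Isolate the most significant set bit: (1 << 255) >> ctlz(x), with a select returning 0 when x == 0.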
define i256 @isolate_msb_i256(i256 %a0, i256 %idx) nounwind {
; SSE-LABEL: isolate_msb_i256:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbx
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: movq %rdx, %r9
; SSE-NEXT: orq %r8, %r9
; SSE-NEXT: bsrq %rsi, %rcx
; SSE-NEXT: orq %rax, %rsi
; SSE-NEXT: bsrq %r8, %r10
; SSE-NEXT: xorq $63, %r10
; SSE-NEXT: bsrq %rax, %r11
; SSE-NEXT: xorq $63, %r11
; SSE-NEXT: orq $64, %r11
; SSE-NEXT: testq %r8, %r8
; SSE-NEXT: cmovneq %r10, %r11
; SSE-NEXT: bsrq %rdx, %r10
; SSE-NEXT: xorq $63, %r10
; SSE-NEXT: xorq $63, %rcx
; SSE-NEXT: orq $64, %rcx
; SSE-NEXT: testq %rdx, %rdx
; SSE-NEXT: cmovneq %r10, %rcx
; SSE-NEXT: orq $128, %rcx
; SSE-NEXT: orq %r8, %rax
; SSE-NEXT: cmovneq %r11, %rcx
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movl %ecx, %eax
; SSE-NEXT: shrb $6, %al
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -40(%rsp,%rax,8), %rdx
; SSE-NEXT: movq -48(%rsp,%rax,8), %r8
; SSE-NEXT: movq %r8, %r10
; SSE-NEXT: shrdq %cl, %rdx, %r10
; SSE-NEXT: movq -56(%rsp,%rax,8), %r11
; SSE-NEXT: movq %r11, %rbx
; SSE-NEXT: shrdq %cl, %r8, %rbx
; SSE-NEXT: movq -64(%rsp,%rax,8), %r8
; SSE-NEXT: shrq %cl, %rdx
; SSE-NEXT: # kill: def $cl killed $cl killed $rcx
; SSE-NEXT: shrdq %cl, %r11, %r8
; SSE-NEXT: xorl %ecx, %ecx
; SSE-NEXT: orq %r9, %rsi
; SSE-NEXT: cmoveq %rcx, %rbx
; SSE-NEXT: cmoveq %rcx, %r10
; SSE-NEXT: cmoveq %rcx, %r8
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: cmoveq %rcx, %rdx
; SSE-NEXT: movq %rdx, 24(%rdi)
; SSE-NEXT: movq %r10, 16(%rdi)
; SSE-NEXT: movq %rbx, 8(%rdi)
; SSE-NEXT: movq %r8, (%rdi)
; SSE-NEXT: popq %rbx
; SSE-NEXT: retq
;
; AVX2-LABEL: isolate_msb_i256:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: movq %rdx, %r9
; AVX2-NEXT: orq %r8, %r9
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: lzcntq %rsi, %rcx
; AVX2-NEXT: orq %rax, %rsi
; AVX2-NEXT: lzcntq %r8, %r10
; AVX2-NEXT: lzcntq %rax, %r11
; AVX2-NEXT: addq $64, %r11
; AVX2-NEXT: testq %r8, %r8
; AVX2-NEXT: cmovneq %r10, %r11
; AVX2-NEXT: xorl %r10d, %r10d
; AVX2-NEXT: lzcntq %rdx, %r10
; AVX2-NEXT: addq $64, %rcx
; AVX2-NEXT: testq %rdx, %rdx
; AVX2-NEXT: cmovneq %r10, %rcx
; AVX2-NEXT: subq $-128, %rcx
; AVX2-NEXT: orq %r8, %rax
; AVX2-NEXT: cmovneq %r11, %rcx
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: shrb $6, %al
; AVX2-NEXT: movzbl %al, %eax
; AVX2-NEXT: movq -40(%rsp,%rax,8), %rdx
; AVX2-NEXT: movq -48(%rsp,%rax,8), %r8
; AVX2-NEXT: movq %r8, %r10
; AVX2-NEXT: shrdq %cl, %rdx, %r10
; AVX2-NEXT: movq -64(%rsp,%rax,8), %r11
; AVX2-NEXT: movq -56(%rsp,%rax,8), %rax
; AVX2-NEXT: movq %rax, %rbx
; AVX2-NEXT: shrdq %cl, %r8, %rbx
; AVX2-NEXT: shrdq %cl, %rax, %r11
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: xorl %edi, %edi
; AVX2-NEXT: orq %r9, %rsi
; AVX2-NEXT: shrxq %rcx, %rdx, %rcx
; AVX2-NEXT: cmoveq %rdi, %rbx
; AVX2-NEXT: cmoveq %rdi, %r10
; AVX2-NEXT: cmoveq %rdi, %r11
; AVX2-NEXT: cmoveq %rdi, %rcx
; AVX2-NEXT: movq %rcx, 24(%rax)
; AVX2-NEXT: movq %r10, 16(%rax)
; AVX2-NEXT: movq %rbx, 8(%rax)
; AVX2-NEXT: movq %r11, (%rax)
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: isolate_msb_i256:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: movq %rcx, %rax
; AVX512F-NEXT: movq %rdx, %r9
; AVX512F-NEXT: orq %r8, %r9
; AVX512F-NEXT: lzcntq %rsi, %rcx
; AVX512F-NEXT: orq %rax, %rsi
; AVX512F-NEXT: lzcntq %r8, %r10
; AVX512F-NEXT: lzcntq %rax, %r11
; AVX512F-NEXT: addq $64, %r11
; AVX512F-NEXT: testq %r8, %r8
; AVX512F-NEXT: cmovneq %r10, %r11
; AVX512F-NEXT: lzcntq %rdx, %r10
; AVX512F-NEXT: addq $64, %rcx
; AVX512F-NEXT: testq %rdx, %rdx
; AVX512F-NEXT: cmovneq %r10, %rcx
; AVX512F-NEXT: subq $-128, %rcx
; AVX512F-NEXT: orq %r8, %rax
; AVX512F-NEXT: cmovneq %r11, %rcx
; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,9223372036854775808,0,0,0,0]
; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %ecx, %eax
; AVX512F-NEXT: shrb $6, %al
; AVX512F-NEXT: movzbl %al, %eax
; AVX512F-NEXT: movq -40(%rsp,%rax,8), %rdx
; AVX512F-NEXT: movq -48(%rsp,%rax,8), %r8
; AVX512F-NEXT: movq %r8, %r10
; AVX512F-NEXT: shrdq %cl, %rdx, %r10
; AVX512F-NEXT: movq -64(%rsp,%rax,8), %r11
; AVX512F-NEXT: movq -56(%rsp,%rax,8), %rax
; AVX512F-NEXT: movq %rax, %rbx
; AVX512F-NEXT: shrdq %cl, %r8, %rbx
; AVX512F-NEXT: shrdq %cl, %rax, %r11
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: xorl %edi, %edi
; AVX512F-NEXT: orq %r9, %rsi
; AVX512F-NEXT: shrxq %rcx, %rdx, %rcx
; AVX512F-NEXT: cmoveq %rdi, %rbx
; AVX512F-NEXT: cmoveq %rdi, %r10
; AVX512F-NEXT: cmoveq %rdi, %r11
; AVX512F-NEXT: cmoveq %rdi, %rcx
; AVX512F-NEXT: movq %rcx, 24(%rax)
; AVX512F-NEXT: movq %r10, 16(%rax)
; AVX512F-NEXT: movq %rbx, 8(%rax)
; AVX512F-NEXT: movq %r11, (%rax)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: isolate_msb_i256:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: movq %rcx, %rax
; AVX512VL-NEXT: movq %rdx, %r9
; AVX512VL-NEXT: orq %r8, %r9
; AVX512VL-NEXT: lzcntq %rsi, %rcx
; AVX512VL-NEXT: orq %rax, %rsi
; AVX512VL-NEXT: lzcntq %r8, %r10
; AVX512VL-NEXT: lzcntq %rax, %r11
; AVX512VL-NEXT: addq $64, %r11
; AVX512VL-NEXT: testq %r8, %r8
; AVX512VL-NEXT: cmovneq %r10, %r11
; AVX512VL-NEXT: lzcntq %rdx, %r10
; AVX512VL-NEXT: addq $64, %rcx
; AVX512VL-NEXT: testq %rdx, %rdx
; AVX512VL-NEXT: cmovneq %r10, %rcx
; AVX512VL-NEXT: subq $-128, %rcx
; AVX512VL-NEXT: orq %r8, %rax
; AVX512VL-NEXT: cmovneq %r11, %rcx
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %ecx, %eax
; AVX512VL-NEXT: shrb $6, %al
; AVX512VL-NEXT: movzbl %al, %eax
; AVX512VL-NEXT: movq -40(%rsp,%rax,8), %rdx
; AVX512VL-NEXT: movq -48(%rsp,%rax,8), %r8
; AVX512VL-NEXT: movq %r8, %r10
; AVX512VL-NEXT: shrdq %cl, %rdx, %r10
; AVX512VL-NEXT: movq -56(%rsp,%rax,8), %r11
; AVX512VL-NEXT: movq %r11, %rbx
; AVX512VL-NEXT: shrdq %cl, %r8, %rbx
; AVX512VL-NEXT: movq -64(%rsp,%rax,8), %r8
; AVX512VL-NEXT: shrdq %cl, %r11, %r8
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: xorl %edi, %edi
; AVX512VL-NEXT: orq %r9, %rsi
; AVX512VL-NEXT: shrxq %rcx, %rdx, %rcx
; AVX512VL-NEXT: cmoveq %rdi, %rbx
; AVX512VL-NEXT: cmoveq %rdi, %r10
; AVX512VL-NEXT: cmoveq %rdi, %r8
; AVX512VL-NEXT: cmoveq %rdi, %rcx
; AVX512VL-NEXT: movq %rcx, 24(%rax)
; AVX512VL-NEXT: movq %r10, 16(%rax)
; AVX512VL-NEXT: movq %rbx, 8(%rax)
; AVX512VL-NEXT: movq %r8, (%rax)
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: isolate_msb_i256:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: pushq %rbx
; AVX512VBMI-NEXT: movq %rcx, %rax
; AVX512VBMI-NEXT: movq %rdx, %r9
; AVX512VBMI-NEXT: orq %r8, %r9
; AVX512VBMI-NEXT: lzcntq %rsi, %rcx
; AVX512VBMI-NEXT: orq %rax, %rsi
; AVX512VBMI-NEXT: lzcntq %r8, %r10
; AVX512VBMI-NEXT: lzcntq %rax, %r11
; AVX512VBMI-NEXT: addq $64, %r11
; AVX512VBMI-NEXT: testq %r8, %r8
; AVX512VBMI-NEXT: cmovneq %r10, %r11
; AVX512VBMI-NEXT: lzcntq %rdx, %r10
; AVX512VBMI-NEXT: addq $64, %rcx
; AVX512VBMI-NEXT: testq %rdx, %rdx
; AVX512VBMI-NEXT: cmovneq %r10, %rcx
; AVX512VBMI-NEXT: subq $-128, %rcx
; AVX512VBMI-NEXT: orq %r8, %rax
; AVX512VBMI-NEXT: cmovneq %r11, %rcx
; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %ecx, %eax
; AVX512VBMI-NEXT: shrb $6, %al
; AVX512VBMI-NEXT: movzbl %al, %eax
; AVX512VBMI-NEXT: movq -40(%rsp,%rax,8), %rdx
; AVX512VBMI-NEXT: movq -48(%rsp,%rax,8), %r8
; AVX512VBMI-NEXT: movq %r8, %r10
; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r10
; AVX512VBMI-NEXT: movq -56(%rsp,%rax,8), %r11
; AVX512VBMI-NEXT: movq %r11, %rbx
; AVX512VBMI-NEXT: shrdq %cl, %r8, %rbx
; AVX512VBMI-NEXT: movq -64(%rsp,%rax,8), %r8
; AVX512VBMI-NEXT: shrdq %cl, %r11, %r8
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: xorl %edi, %edi
; AVX512VBMI-NEXT: orq %r9, %rsi
; AVX512VBMI-NEXT: shrxq %rcx, %rdx, %rcx
; AVX512VBMI-NEXT: cmoveq %rdi, %rbx
; AVX512VBMI-NEXT: cmoveq %rdi, %r10
; AVX512VBMI-NEXT: cmoveq %rdi, %r8
; AVX512VBMI-NEXT: cmoveq %rdi, %rcx
; AVX512VBMI-NEXT: movq %rcx, 24(%rax)
; AVX512VBMI-NEXT: movq %r10, 16(%rax)
; AVX512VBMI-NEXT: movq %rbx, 8(%rax)
; AVX512VBMI-NEXT: movq %r8, (%rax)
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%eqz = icmp eq i256 %a0, 0
%clz = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1)
%bit = shl i256 1, 255
%msk = lshr i256 %bit, %clz
%res = select i1 %eqz, i256 0, i256 %msk
ret i256 %res
}

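; Same MSB-isolation pattern with the i256 value bitcast from a <4 x i64> vector.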
define i256 @isolate_msb_i256_vector(<4 x i64> %v0, i256 %idx) nounwind {
; SSE2-LABEL: isolate_msb_i256_vector:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rdx
; SSE2-NEXT: movq %xmm1, %rsi
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %r8
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
; SSE2-NEXT: movmskps %xmm0, %eax
; SSE2-NEXT: xorl $15, %eax
; SSE2-NEXT: bsrq %r8, %r9
; SSE2-NEXT: xorq $63, %r9
; SSE2-NEXT: bsrq %rsi, %rsi
; SSE2-NEXT: xorq $63, %rsi
; SSE2-NEXT: orq $64, %rsi
; SSE2-NEXT: testq %r8, %r8
; SSE2-NEXT: cmovneq %r9, %rsi
; SSE2-NEXT: bsrq %rdx, %r8
; SSE2-NEXT: xorq $63, %r8
; SSE2-NEXT: bsrq %rcx, %rcx
; SSE2-NEXT: xorq $63, %rcx
; SSE2-NEXT: orq $64, %rcx
; SSE2-NEXT: testq %rdx, %rdx
; SSE2-NEXT: cmovneq %r8, %rcx
; SSE2-NEXT: orq $128, %rcx
; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
; SSE2-NEXT: movmskps %xmm1, %edx
; SSE2-NEXT: xorl $15, %edx
; SSE2-NEXT: cmovneq %rsi, %rcx
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
; SSE2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movl %ecx, %edx
; SSE2-NEXT: shrb $6, %dl
; SSE2-NEXT: movzbl %dl, %esi
; SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movq -48(%rsp,%rsi,8), %rdx
; SSE2-NEXT: movq -56(%rsp,%rsi,8), %r8
; SSE2-NEXT: movq %r8, %r9
; SSE2-NEXT: shrdq %cl, %rdx, %r9
; SSE2-NEXT: movq -64(%rsp,%rsi,8), %r10
; SSE2-NEXT: movq %r10, %r11
; SSE2-NEXT: shrdq %cl, %r8, %r11
; SSE2-NEXT: movq -72(%rsp,%rsi,8), %rsi
; SSE2-NEXT: shrq %cl, %rdx
; SSE2-NEXT: # kill: def $cl killed $cl killed $rcx
; SSE2-NEXT: shrdq %cl, %r10, %rsi
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: testl %eax, %eax
; SSE2-NEXT: cmoveq %rcx, %r11
; SSE2-NEXT: cmoveq %rcx, %r9
; SSE2-NEXT: cmoveq %rcx, %rsi
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: cmoveq %rcx, %rdx
; SSE2-NEXT: movq %rdx, 24(%rdi)
; SSE2-NEXT: movq %r9, 16(%rdi)
; SSE2-NEXT: movq %r11, 8(%rdi)
; SSE2-NEXT: movq %rsi, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: isolate_msb_i256_vector:
; SSE42: # %bb.0:
; SSE42-NEXT: pextrq $1, %xmm0, %rdx
; SSE42-NEXT: movq %xmm0, %rcx
; SSE42-NEXT: movq %xmm1, %rax
; SSE42-NEXT: pextrq $1, %xmm1, %rsi
; SSE42-NEXT: bsrq %rsi, %r8
; SSE42-NEXT: xorq $63, %r8
; SSE42-NEXT: bsrq %rax, %rax
; SSE42-NEXT: xorq $63, %rax
; SSE42-NEXT: orq $64, %rax
; SSE42-NEXT: testq %rsi, %rsi
; SSE42-NEXT: cmovneq %r8, %rax
; SSE42-NEXT: bsrq %rdx, %rsi
; SSE42-NEXT: xorq $63, %rsi
; SSE42-NEXT: bsrq %rcx, %rcx
; SSE42-NEXT: xorq $63, %rcx
; SSE42-NEXT: orq $64, %rcx
; SSE42-NEXT: testq %rdx, %rdx
; SSE42-NEXT: cmovneq %rsi, %rcx
; SSE42-NEXT: xorps %xmm2, %xmm2
; SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: orq $128, %rcx
; SSE42-NEXT: ptest %xmm1, %xmm1
; SSE42-NEXT: cmovneq %rax, %rcx
; SSE42-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: movl %ecx, %eax
; SSE42-NEXT: shrb $6, %al
; SSE42-NEXT: movzbl %al, %eax
; SSE42-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: movq -48(%rsp,%rax,8), %rdx
; SSE42-NEXT: movq -56(%rsp,%rax,8), %rsi
; SSE42-NEXT: movq %rsi, %r8
; SSE42-NEXT: shrdq %cl, %rdx, %r8
; SSE42-NEXT: movq -64(%rsp,%rax,8), %r9
; SSE42-NEXT: movq %r9, %r10
; SSE42-NEXT: shrdq %cl, %rsi, %r10
; SSE42-NEXT: movq -72(%rsp,%rax,8), %rsi
; SSE42-NEXT: shrq %cl, %rdx
; SSE42-NEXT: # kill: def $cl killed $cl killed $rcx
; SSE42-NEXT: shrdq %cl, %r9, %rsi
; SSE42-NEXT: por %xmm1, %xmm0
; SSE42-NEXT: xorl %ecx, %ecx
; SSE42-NEXT: ptest %xmm0, %xmm0
; SSE42-NEXT: cmoveq %rcx, %r10
; SSE42-NEXT: cmoveq %rcx, %r8
; SSE42-NEXT: cmoveq %rcx, %rsi
; SSE42-NEXT: movq %rdi, %rax
; SSE42-NEXT: cmoveq %rcx, %rdx
; SSE42-NEXT: movq %rdx, 24(%rdi)
; SSE42-NEXT: movq %r8, 16(%rdi)
; SSE42-NEXT: movq %r10, 8(%rdi)
; SSE42-NEXT: movq %rsi, (%rdi)
; SSE42-NEXT: retq
;
; AVX2-LABEL: isolate_msb_i256_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovq %xmm1, %rsi
; AVX2-NEXT: vpextrq $1, %xmm1, %r8
; AVX2-NEXT: lzcntq %r8, %rcx
; AVX2-NEXT: lzcntq %rsi, %r9
; AVX2-NEXT: addq $64, %r9
; AVX2-NEXT: testq %r8, %r8
; AVX2-NEXT: cmovneq %rcx, %r9
; AVX2-NEXT: lzcntq %rdx, %r10
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: lzcntq %rax, %rcx
; AVX2-NEXT: addq $64, %rcx
; AVX2-NEXT: testq %rdx, %rdx
; AVX2-NEXT: cmovneq %r10, %rcx
; AVX2-NEXT: subq $-128, %rcx
; AVX2-NEXT: orq %r8, %rsi
; AVX2-NEXT: cmovneq %r9, %rcx
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: shrb $6, %al
; AVX2-NEXT: movzbl %al, %edx
; AVX2-NEXT: movq -48(%rsp,%rdx,8), %rsi
; AVX2-NEXT: movq -56(%rsp,%rdx,8), %r8
; AVX2-NEXT: movq %r8, %r9
; AVX2-NEXT: shrdq %cl, %rsi, %r9
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: movq -72(%rsp,%rdx,8), %rdi
; AVX2-NEXT: movq -64(%rsp,%rdx,8), %rdx
; AVX2-NEXT: movq %rdx, %r10
; AVX2-NEXT: shrdq %cl, %r8, %r10
; AVX2-NEXT: shrdq %cl, %rdx, %rdi
; AVX2-NEXT: xorl %edx, %edx
; AVX2-NEXT: vptest %ymm0, %ymm0
; AVX2-NEXT: shrxq %rcx, %rsi, %rcx
; AVX2-NEXT: cmoveq %rdx, %r10
; AVX2-NEXT: cmoveq %rdx, %r9
; AVX2-NEXT: cmoveq %rdx, %rdi
; AVX2-NEXT: cmoveq %rdx, %rcx
; AVX2-NEXT: movq %rcx, 24(%rax)
; AVX2-NEXT: movq %r9, 16(%rax)
; AVX2-NEXT: movq %r10, 8(%rax)
; AVX2-NEXT: movq %rdi, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: isolate_msb_i256_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,9223372036854775808,0,0,0,0]
; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,64,128,192]
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vpcompressq %zmm1, %zmm1 {%k1}
; AVX512F-NEXT: vmovd %xmm1, %ecx
; AVX512F-NEXT: movl %ecx, %eax
; AVX512F-NEXT: shrb $6, %al
; AVX512F-NEXT: movzbl %al, %edx
; AVX512F-NEXT: movq -48(%rsp,%rdx,8), %rsi
; AVX512F-NEXT: movq -56(%rsp,%rdx,8), %r8
; AVX512F-NEXT: movq %r8, %r9
; AVX512F-NEXT: shrdq %cl, %rsi, %r9
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vmovdqa %ymm0, %ymm0
; AVX512F-NEXT: movq -72(%rsp,%rdx,8), %rdi
; AVX512F-NEXT: movq -64(%rsp,%rdx,8), %rdx
; AVX512F-NEXT: movq %rdx, %r10
; AVX512F-NEXT: shrdq %cl, %r8, %r10
; AVX512F-NEXT: shrdq %cl, %rdx, %rdi
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: xorl %edx, %edx
; AVX512F-NEXT: kortestw %k0, %k0
; AVX512F-NEXT: shrxq %rcx, %rsi, %rcx
; AVX512F-NEXT: cmoveq %rdx, %r10
; AVX512F-NEXT: cmoveq %rdx, %r9
; AVX512F-NEXT: cmoveq %rdx, %rdi
; AVX512F-NEXT: cmoveq %rdx, %rcx
; AVX512F-NEXT: movq %rcx, 24(%rax)
; AVX512F-NEXT: movq %r9, 16(%rax)
; AVX512F-NEXT: movq %r10, 8(%rax)
; AVX512F-NEXT: movq %rdi, (%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: isolate_msb_i256_vector:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
; AVX512VL-NEXT: vptestmq %ymm1, %ymm1, %k1
; AVX512VL-NEXT: vplzcntq %ymm1, %ymm1
; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,64,128,192]
; AVX512VL-NEXT: vpcompressq %ymm1, %ymm1 {%k1}
; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovd %xmm1, %ecx
; AVX512VL-NEXT: movl %ecx, %eax
; AVX512VL-NEXT: shrb $6, %al
; AVX512VL-NEXT: movzbl %al, %edx
; AVX512VL-NEXT: movq -48(%rsp,%rdx,8), %rsi
; AVX512VL-NEXT: movq -56(%rsp,%rdx,8), %rax
; AVX512VL-NEXT: movq %rax, %r8
; AVX512VL-NEXT: shrdq %cl, %rsi, %r8
; AVX512VL-NEXT: movq -64(%rsp,%rdx,8), %r9
; AVX512VL-NEXT: movq %r9, %r10
; AVX512VL-NEXT: shrdq %cl, %rax, %r10
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: movq -72(%rsp,%rdx,8), %rdx
; AVX512VL-NEXT: shrdq %cl, %r9, %rdx
; AVX512VL-NEXT: xorl %edi, %edi
; AVX512VL-NEXT: vptest %ymm0, %ymm0
; AVX512VL-NEXT: shrxq %rcx, %rsi, %rcx
; AVX512VL-NEXT: cmoveq %rdi, %r10
; AVX512VL-NEXT: cmoveq %rdi, %r8
; AVX512VL-NEXT: cmoveq %rdi, %rdx
; AVX512VL-NEXT: cmoveq %rdi, %rcx
; AVX512VL-NEXT: movq %rcx, 24(%rax)
; AVX512VL-NEXT: movq %r8, 16(%rax)
; AVX512VL-NEXT: movq %r10, 8(%rax)
; AVX512VL-NEXT: movq %rdx, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: isolate_msb_i256_vector:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
; AVX512VBMI-NEXT: vptestmq %ymm1, %ymm1, %k1
; AVX512VBMI-NEXT: vplzcntq %ymm1, %ymm1
; AVX512VBMI-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,64,128,192]
; AVX512VBMI-NEXT: vpcompressq %ymm1, %ymm1 {%k1}
; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovd %xmm1, %ecx
; AVX512VBMI-NEXT: movl %ecx, %eax
; AVX512VBMI-NEXT: shrb $6, %al
; AVX512VBMI-NEXT: movzbl %al, %edx
; AVX512VBMI-NEXT: movq -48(%rsp,%rdx,8), %rsi
; AVX512VBMI-NEXT: movq -56(%rsp,%rdx,8), %rax
; AVX512VBMI-NEXT: movq %rax, %r8
; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r8
; AVX512VBMI-NEXT: movq -64(%rsp,%rdx,8), %r9
; AVX512VBMI-NEXT: movq %r9, %r10
; AVX512VBMI-NEXT: shrdq %cl, %rax, %r10
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: movq -72(%rsp,%rdx,8), %rdx
; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx
; AVX512VBMI-NEXT: xorl %edi, %edi
; AVX512VBMI-NEXT: vptest %ymm0, %ymm0
; AVX512VBMI-NEXT: shrxq %rcx, %rsi, %rcx
; AVX512VBMI-NEXT: cmoveq %rdi, %r10
; AVX512VBMI-NEXT: cmoveq %rdi, %r8
; AVX512VBMI-NEXT: cmoveq %rdi, %rdx
; AVX512VBMI-NEXT: cmoveq %rdi, %rcx
; AVX512VBMI-NEXT: movq %rcx, 24(%rax)
; AVX512VBMI-NEXT: movq %r8, 16(%rax)
; AVX512VBMI-NEXT: movq %r10, 8(%rax)
; AVX512VBMI-NEXT: movq %rdx, (%rax)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = bitcast <4 x i64> %v0 to i256
%eqz = icmp eq i256 %a0, 0
%clz = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1)
%bit = shl i256 1, 255
%msk = lshr i256 %bit, %clz
%res = select i1 %eqz, i256 0, i256 %msk
ret i256 %res
}

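; Same MSB-isolation pattern with the i256 value loaded from memory.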
define i256 @isolate_msb_i256_load(ptr %p0, i256 %idx) nounwind {
|
|
; SSE2-LABEL: isolate_msb_i256_load:
|
|
; SSE2: # %bb.0:
|
|
; SSE2-NEXT: movq 8(%rsi), %r9
|
|
; SSE2-NEXT: movq 16(%rsi), %rdx
|
|
; SSE2-NEXT: movq 24(%rsi), %r8
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm1
|
|
; SSE2-NEXT: por 16(%rsi), %xmm1
|
|
; SSE2-NEXT: pxor %xmm0, %xmm0
|
|
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
|
|
; SSE2-NEXT: movmskps %xmm1, %eax
|
|
; SSE2-NEXT: xorl $15, %eax
|
|
; SSE2-NEXT: bsrq %r8, %rcx
|
|
; SSE2-NEXT: xorq $63, %rcx
|
|
; SSE2-NEXT: bsrq %rdx, %r10
|
|
; SSE2-NEXT: xorq $63, %r10
|
|
; SSE2-NEXT: orq $64, %r10
|
|
; SSE2-NEXT: testq %r8, %r8
|
|
; SSE2-NEXT: cmovneq %rcx, %r10
|
|
; SSE2-NEXT: bsrq %r9, %r11
|
|
; SSE2-NEXT: xorq $63, %r11
|
|
; SSE2-NEXT: bsrq (%rsi), %rcx
; SSE2-NEXT: xorq $63, %rcx
; SSE2-NEXT: orq $64, %rcx
; SSE2-NEXT: testq %r9, %r9
; SSE2-NEXT: cmovneq %r11, %rcx
; SSE2-NEXT: orq $128, %rcx
; SSE2-NEXT: orq %r8, %rdx
; SSE2-NEXT: cmovneq %r10, %rcx
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
; SSE2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movl %ecx, %edx
; SSE2-NEXT: shrb $6, %dl
; SSE2-NEXT: movzbl %dl, %esi
; SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movq -48(%rsp,%rsi,8), %rdx
; SSE2-NEXT: movq -56(%rsp,%rsi,8), %r8
; SSE2-NEXT: movq %r8, %r9
; SSE2-NEXT: shrdq %cl, %rdx, %r9
; SSE2-NEXT: movq -64(%rsp,%rsi,8), %r10
; SSE2-NEXT: movq %r10, %r11
; SSE2-NEXT: shrdq %cl, %r8, %r11
; SSE2-NEXT: movq -72(%rsp,%rsi,8), %rsi
; SSE2-NEXT: shrq %cl, %rdx
; SSE2-NEXT: # kill: def $cl killed $cl killed $rcx
; SSE2-NEXT: shrdq %cl, %r10, %rsi
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: testl %eax, %eax
; SSE2-NEXT: cmoveq %rcx, %r11
; SSE2-NEXT: cmoveq %rcx, %r9
; SSE2-NEXT: cmoveq %rcx, %rsi
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: cmoveq %rcx, %rdx
; SSE2-NEXT: movq %rdx, 24(%rdi)
; SSE2-NEXT: movq %r9, 16(%rdi)
; SSE2-NEXT: movq %r11, 8(%rdi)
; SSE2-NEXT: movq %rsi, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: isolate_msb_i256_load:
; SSE42: # %bb.0:
; SSE42-NEXT: movq 16(%rsi), %rax
; SSE42-NEXT: movq 24(%rsi), %rdx
; SSE42-NEXT: movdqa (%rsi), %xmm0
; SSE42-NEXT: por 16(%rsi), %xmm0
; SSE42-NEXT: bsrq %rdx, %rcx
; SSE42-NEXT: xorq $63, %rcx
; SSE42-NEXT: bsrq %rax, %r8
; SSE42-NEXT: xorq $63, %r8
; SSE42-NEXT: orq $64, %r8
; SSE42-NEXT: testq %rdx, %rdx
; SSE42-NEXT: cmovneq %rcx, %r8
; SSE42-NEXT: movq 8(%rsi), %r9
; SSE42-NEXT: bsrq %r9, %r10
; SSE42-NEXT: bsrq (%rsi), %rcx
; SSE42-NEXT: xorq $63, %r10
; SSE42-NEXT: xorq $63, %rcx
; SSE42-NEXT: orq $64, %rcx
; SSE42-NEXT: testq %r9, %r9
; SSE42-NEXT: cmovneq %r10, %rcx
; SSE42-NEXT: orq $128, %rcx
; SSE42-NEXT: orq %rdx, %rax
; SSE42-NEXT: cmovneq %r8, %rcx
; SSE42-NEXT: xorps %xmm1, %xmm1
; SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: movl %ecx, %eax
; SSE42-NEXT: shrb $6, %al
; SSE42-NEXT: movzbl %al, %eax
; SSE42-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SSE42-NEXT: movq -48(%rsp,%rax,8), %rdx
; SSE42-NEXT: movq -56(%rsp,%rax,8), %rsi
; SSE42-NEXT: movq %rsi, %r8
; SSE42-NEXT: shrdq %cl, %rdx, %r8
; SSE42-NEXT: movq -64(%rsp,%rax,8), %r9
; SSE42-NEXT: movq %r9, %r10
; SSE42-NEXT: shrdq %cl, %rsi, %r10
; SSE42-NEXT: movq -72(%rsp,%rax,8), %rsi
; SSE42-NEXT: shrq %cl, %rdx
; SSE42-NEXT: # kill: def $cl killed $cl killed $rcx
; SSE42-NEXT: shrdq %cl, %r9, %rsi
; SSE42-NEXT: xorl %ecx, %ecx
; SSE42-NEXT: ptest %xmm0, %xmm0
; SSE42-NEXT: cmoveq %rcx, %r10
; SSE42-NEXT: cmoveq %rcx, %r8
; SSE42-NEXT: cmoveq %rcx, %rsi
; SSE42-NEXT: movq %rdi, %rax
; SSE42-NEXT: cmoveq %rcx, %rdx
; SSE42-NEXT: movq %rdx, 24(%rdi)
; SSE42-NEXT: movq %r8, 16(%rdi)
; SSE42-NEXT: movq %r10, 8(%rdi)
; SSE42-NEXT: movq %rsi, (%rdi)
; SSE42-NEXT: retq
;
; AVX2-LABEL: isolate_msb_i256_load:
; AVX2: # %bb.0:
; AVX2-NEXT: movq 16(%rsi), %rax
; AVX2-NEXT: movq 24(%rsi), %rdx
; AVX2-NEXT: lzcntq %rdx, %rcx
; AVX2-NEXT: lzcntq %rax, %r8
; AVX2-NEXT: addq $64, %r8
; AVX2-NEXT: testq %rdx, %rdx
; AVX2-NEXT: cmovneq %rcx, %r8
; AVX2-NEXT: movq 8(%rsi), %r9
; AVX2-NEXT: lzcntq %r9, %r10
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: lzcntq (%rsi), %rcx
; AVX2-NEXT: addq $64, %rcx
; AVX2-NEXT: testq %r9, %r9
; AVX2-NEXT: cmovneq %r10, %rcx
; AVX2-NEXT: subq $-128, %rcx
; AVX2-NEXT: orq %rdx, %rax
; AVX2-NEXT: cmovneq %r8, %rcx
; AVX2-NEXT: vmovdqu (%rsi), %ymm0
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: shrb $6, %al
; AVX2-NEXT: movzbl %al, %edx
; AVX2-NEXT: movq -48(%rsp,%rdx,8), %rsi
; AVX2-NEXT: movq -56(%rsp,%rdx,8), %r8
; AVX2-NEXT: movq %r8, %r9
; AVX2-NEXT: shrdq %cl, %rsi, %r9
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: movq -72(%rsp,%rdx,8), %rdi
; AVX2-NEXT: movq -64(%rsp,%rdx,8), %rdx
; AVX2-NEXT: movq %rdx, %r10
; AVX2-NEXT: shrdq %cl, %r8, %r10
; AVX2-NEXT: shrdq %cl, %rdx, %rdi
; AVX2-NEXT: xorl %edx, %edx
; AVX2-NEXT: vptest %ymm0, %ymm0
; AVX2-NEXT: shrxq %rcx, %rsi, %rcx
; AVX2-NEXT: cmoveq %rdx, %r10
; AVX2-NEXT: cmoveq %rdx, %r9
; AVX2-NEXT: cmoveq %rdx, %rdi
; AVX2-NEXT: cmoveq %rdx, %rcx
; AVX2-NEXT: movq %rcx, 24(%rax)
; AVX2-NEXT: movq %r9, 16(%rax)
; AVX2-NEXT: movq %r10, 8(%rax)
; AVX2-NEXT: movq %rdi, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: isolate_msb_i256_load:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rsi), %ymm0
; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,9223372036854775808,0,0,0,0]
; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,64,128,192]
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vpcompressq %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovd %xmm1, %ecx
; AVX512F-NEXT: movl %ecx, %eax
; AVX512F-NEXT: shrb $6, %al
; AVX512F-NEXT: movzbl %al, %edx
; AVX512F-NEXT: movq -48(%rsp,%rdx,8), %rsi
; AVX512F-NEXT: movq -56(%rsp,%rdx,8), %r8
; AVX512F-NEXT: movq %r8, %r9
; AVX512F-NEXT: shrdq %cl, %rsi, %r9
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: movq -72(%rsp,%rdx,8), %rdi
; AVX512F-NEXT: movq -64(%rsp,%rdx,8), %rdx
; AVX512F-NEXT: movq %rdx, %r10
; AVX512F-NEXT: shrdq %cl, %r8, %r10
; AVX512F-NEXT: shrdq %cl, %rdx, %rdi
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: xorl %edx, %edx
; AVX512F-NEXT: kortestw %k0, %k0
; AVX512F-NEXT: shrxq %rcx, %rsi, %rcx
; AVX512F-NEXT: cmoveq %rdx, %r10
; AVX512F-NEXT: cmoveq %rdx, %r9
; AVX512F-NEXT: cmoveq %rdx, %rdi
; AVX512F-NEXT: cmoveq %rdx, %rcx
; AVX512F-NEXT: movq %rcx, 24(%rax)
; AVX512F-NEXT: movq %r9, 16(%rax)
; AVX512F-NEXT: movq %r10, 8(%rax)
; AVX512F-NEXT: movq %rdi, (%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: isolate_msb_i256_load:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu (%rsi), %ymm0
; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
; AVX512VL-NEXT: vptestmq %ymm1, %ymm1, %k1
; AVX512VL-NEXT: vplzcntq %ymm1, %ymm1
; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,64,128,192]
; AVX512VL-NEXT: vpcompressq %ymm1, %ymm1 {%k1} {z}
; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovd %xmm1, %ecx
; AVX512VL-NEXT: movl %ecx, %eax
; AVX512VL-NEXT: shrb $6, %al
; AVX512VL-NEXT: movzbl %al, %edx
; AVX512VL-NEXT: movq -48(%rsp,%rdx,8), %rsi
; AVX512VL-NEXT: movq -56(%rsp,%rdx,8), %rax
; AVX512VL-NEXT: movq %rax, %r8
; AVX512VL-NEXT: shrdq %cl, %rsi, %r8
; AVX512VL-NEXT: movq -64(%rsp,%rdx,8), %r9
; AVX512VL-NEXT: movq %r9, %r10
; AVX512VL-NEXT: shrdq %cl, %rax, %r10
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: movq -72(%rsp,%rdx,8), %rdx
; AVX512VL-NEXT: shrdq %cl, %r9, %rdx
; AVX512VL-NEXT: xorl %edi, %edi
; AVX512VL-NEXT: vptest %ymm0, %ymm0
; AVX512VL-NEXT: shrxq %rcx, %rsi, %rcx
; AVX512VL-NEXT: cmoveq %rdi, %r10
; AVX512VL-NEXT: cmoveq %rdi, %r8
; AVX512VL-NEXT: cmoveq %rdi, %rdx
; AVX512VL-NEXT: cmoveq %rdi, %rcx
; AVX512VL-NEXT: movq %rcx, 24(%rax)
; AVX512VL-NEXT: movq %r8, 16(%rax)
; AVX512VL-NEXT: movq %r10, 8(%rax)
; AVX512VL-NEXT: movq %rdx, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: isolate_msb_i256_load:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqu (%rsi), %ymm0
; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
; AVX512VBMI-NEXT: vptestmq %ymm1, %ymm1, %k1
; AVX512VBMI-NEXT: vplzcntq %ymm1, %ymm1
; AVX512VBMI-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,64,128,192]
; AVX512VBMI-NEXT: vpcompressq %ymm1, %ymm1 {%k1} {z}
; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovd %xmm1, %ecx
; AVX512VBMI-NEXT: movl %ecx, %eax
; AVX512VBMI-NEXT: shrb $6, %al
; AVX512VBMI-NEXT: movzbl %al, %edx
; AVX512VBMI-NEXT: movq -48(%rsp,%rdx,8), %rsi
; AVX512VBMI-NEXT: movq -56(%rsp,%rdx,8), %rax
; AVX512VBMI-NEXT: movq %rax, %r8
; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r8
; AVX512VBMI-NEXT: movq -64(%rsp,%rdx,8), %r9
; AVX512VBMI-NEXT: movq %r9, %r10
; AVX512VBMI-NEXT: shrdq %cl, %rax, %r10
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: movq -72(%rsp,%rdx,8), %rdx
; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx
; AVX512VBMI-NEXT: xorl %edi, %edi
; AVX512VBMI-NEXT: vptest %ymm0, %ymm0
; AVX512VBMI-NEXT: shrxq %rcx, %rsi, %rcx
; AVX512VBMI-NEXT: cmoveq %rdi, %r10
; AVX512VBMI-NEXT: cmoveq %rdi, %r8
; AVX512VBMI-NEXT: cmoveq %rdi, %rdx
; AVX512VBMI-NEXT: cmoveq %rdi, %rcx
; AVX512VBMI-NEXT: movq %rcx, 24(%rax)
; AVX512VBMI-NEXT: movq %r8, 16(%rax)
; AVX512VBMI-NEXT: movq %r10, 8(%rax)
; AVX512VBMI-NEXT: movq %rdx, (%rax)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i256, ptr %p0
%eqz = icmp eq i256 %a0, 0
%clz = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1)
%bit = shl i256 1, 255
%msk = lshr i256 %bit, %clz
%res = select i1 %eqz, i256 0, i256 %msk
ret i256 %res
}

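; Plain i256 bswap of register-passed arguments: four scalar bswapq on SSE targets, folded into movbe stores on AVX targets.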
define i256 @bswap_i256(i256 %a0) nounwind {
; SSE-LABEL: bswap_i256:
; SSE: # %bb.0:
; SSE-NEXT: bswapq %r8
; SSE-NEXT: bswapq %rcx
; SSE-NEXT: bswapq %rdx
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: bswapq %rsi
; SSE-NEXT: movq %rsi, 24(%rdi)
; SSE-NEXT: movq %rdx, 16(%rdi)
; SSE-NEXT: movq %rcx, 8(%rdi)
; SSE-NEXT: movq %r8, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: bswap_i256:
; AVX: # %bb.0:
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: movbeq %rsi, 24(%rdi)
; AVX-NEXT: movbeq %rdx, 16(%rdi)
; AVX-NEXT: movbeq %rcx, 8(%rdi)
; AVX-NEXT: movbeq %r8, (%rdi)
; AVX-NEXT: retq
%res = call i256 @llvm.bswap.i256(i256 %a0)
ret i256 %res
}

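; i256 bswap of a value bitcast from <4 x i64>: the 64-bit limbs are extracted from the vector before being byte-swapped.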
define i256 @bswap_i256_vector(<4 x i64> %v0) nounwind {
; SSE2-LABEL: bswap_i256_vector:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rcx
; SSE2-NEXT: movq %xmm1, %rdx
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm1, %rsi
; SSE2-NEXT: bswapq %rsi
; SSE2-NEXT: bswapq %rdx
; SSE2-NEXT: bswapq %rcx
; SSE2-NEXT: movq %xmm0, %rdi
; SSE2-NEXT: bswapq %rdi
; SSE2-NEXT: movq %rdi, 24(%rax)
; SSE2-NEXT: movq %rcx, 16(%rax)
; SSE2-NEXT: movq %rdx, 8(%rax)
; SSE2-NEXT: movq %rsi, (%rax)
; SSE2-NEXT: retq
;
; SSE42-LABEL: bswap_i256_vector:
; SSE42: # %bb.0:
; SSE42-NEXT: movq %rdi, %rax
; SSE42-NEXT: pextrq $1, %xmm0, %rcx
; SSE42-NEXT: movq %xmm1, %rdx
; SSE42-NEXT: pextrq $1, %xmm1, %rsi
; SSE42-NEXT: bswapq %rsi
; SSE42-NEXT: bswapq %rdx
; SSE42-NEXT: bswapq %rcx
; SSE42-NEXT: movq %xmm0, %rdi
; SSE42-NEXT: bswapq %rdi
; SSE42-NEXT: movq %rdi, 24(%rax)
; SSE42-NEXT: movq %rcx, 16(%rax)
; SSE42-NEXT: movq %rdx, 8(%rax)
; SSE42-NEXT: movq %rsi, (%rax)
; SSE42-NEXT: retq
;
; AVX2-LABEL: bswap_i256_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
; AVX2-NEXT: vmovq %xmm0, %rsi
; AVX2-NEXT: movbeq %rsi, 24(%rdi)
; AVX2-NEXT: movbeq %rdx, 16(%rdi)
; AVX2-NEXT: vmovq %xmm1, %rdx
; AVX2-NEXT: movbeq %rdx, 8(%rdi)
; AVX2-NEXT: movbeq %rcx, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: bswap_i256_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
; AVX512F-NEXT: vmovq %xmm0, %rsi
; AVX512F-NEXT: movbeq %rsi, 24(%rdi)
; AVX512F-NEXT: movbeq %rdx, 16(%rdi)
; AVX512F-NEXT: vmovq %xmm1, %rdx
; AVX512F-NEXT: movbeq %rdx, 8(%rdi)
; AVX512F-NEXT: movbeq %rcx, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bswap_i256_vector:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rcx
; AVX512VL-NEXT: vmovq %xmm1, %rdx
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512VL-NEXT: vmovq %xmm0, %rdi
; AVX512VL-NEXT: movbeq %rdi, 24(%rax)
; AVX512VL-NEXT: movbeq %rsi, 16(%rax)
; AVX512VL-NEXT: movbeq %rdx, 8(%rax)
; AVX512VL-NEXT: movbeq %rcx, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bswap_i256_vector:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %rcx
; AVX512VBMI-NEXT: vmovq %xmm1, %rdx
; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512VBMI-NEXT: vmovq %xmm0, %rdi
; AVX512VBMI-NEXT: movbeq %rdi, 24(%rax)
; AVX512VBMI-NEXT: movbeq %rsi, 16(%rax)
; AVX512VBMI-NEXT: movbeq %rdx, 8(%rax)
; AVX512VBMI-NEXT: movbeq %rcx, (%rax)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = bitcast <4 x i64> %v0 to i256
%res = call i256 @llvm.bswap.i256(i256 %a0)
ret i256 %res
}

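; i256 bswap of a loaded value: the AVX targets fold the byte swaps into movbe stores of the plain loads.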
define i256 @bswap_i256_load(ptr %p0) nounwind {
; SSE-LABEL: bswap_i256_load:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movq 8(%rsi), %rcx
; SSE-NEXT: movq 16(%rsi), %rdx
; SSE-NEXT: movq 24(%rsi), %rdi
; SSE-NEXT: bswapq %rdi
; SSE-NEXT: bswapq %rdx
; SSE-NEXT: bswapq %rcx
; SSE-NEXT: movq (%rsi), %rsi
; SSE-NEXT: bswapq %rsi
; SSE-NEXT: movq %rsi, 24(%rax)
; SSE-NEXT: movq %rcx, 16(%rax)
; SSE-NEXT: movq %rdx, 8(%rax)
; SSE-NEXT: movq %rdi, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: bswap_i256_load:
; AVX: # %bb.0:
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: movq 24(%rsi), %rcx
; AVX-NEXT: movq 16(%rsi), %rdx
; AVX-NEXT: movq (%rsi), %rdi
; AVX-NEXT: movq 8(%rsi), %rsi
; AVX-NEXT: movbeq %rdi, 24(%rax)
; AVX-NEXT: movbeq %rsi, 16(%rax)
; AVX-NEXT: movbeq %rdx, 8(%rax)
; AVX-NEXT: movbeq %rcx, (%rax)
; AVX-NEXT: retq
%a0 = load i256, ptr %p0
%res = call i256 @llvm.bswap.i256(i256 %a0)
ret i256 %res
}

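; Scalar i256 bitreverse: SSE2 expands each 64-bit limb via bswapq plus mask-and-shift steps (nibble, pair, bit), while SSE42+ uses pshufb nibble lookup tables.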
define i256 @bitreverse_i256(i256 %a0) nounwind {
; SSE2-LABEL: bitreverse_i256:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: bswapq %r8
; SSE2-NEXT: movq %r8, %rdi
; SSE2-NEXT: shrq $4, %rdi
; SSE2-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
; SSE2-NEXT: andq %r9, %rdi
; SSE2-NEXT: andq %r9, %r8
; SSE2-NEXT: shlq $4, %r8
; SSE2-NEXT: orq %rdi, %r8
; SSE2-NEXT: movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333
; SSE2-NEXT: movq %r8, %r10
; SSE2-NEXT: andq %rdi, %r10
; SSE2-NEXT: shrq $2, %r8
; SSE2-NEXT: andq %rdi, %r8
; SSE2-NEXT: leaq (%r8,%r10,4), %r10
; SSE2-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
; SSE2-NEXT: movq %r10, %r11
; SSE2-NEXT: andq %r8, %r11
; SSE2-NEXT: shrq %r10
; SSE2-NEXT: andq %r8, %r10
; SSE2-NEXT: leaq (%r10,%r11,2), %r10
; SSE2-NEXT: bswapq %rcx
; SSE2-NEXT: movq %rcx, %r11
; SSE2-NEXT: shrq $4, %r11
; SSE2-NEXT: andq %r9, %r11
; SSE2-NEXT: andq %r9, %rcx
; SSE2-NEXT: shlq $4, %rcx
; SSE2-NEXT: orq %r11, %rcx
; SSE2-NEXT: movq %rcx, %r11
; SSE2-NEXT: andq %rdi, %r11
; SSE2-NEXT: shrq $2, %rcx
; SSE2-NEXT: andq %rdi, %rcx
; SSE2-NEXT: leaq (%rcx,%r11,4), %rcx
; SSE2-NEXT: movq %rcx, %r11
; SSE2-NEXT: andq %r8, %r11
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andq %r8, %rcx
; SSE2-NEXT: bswapq %rdx
; SSE2-NEXT: leaq (%rcx,%r11,2), %rcx
; SSE2-NEXT: movq %rdx, %r11
; SSE2-NEXT: shrq $4, %r11
; SSE2-NEXT: andq %r9, %r11
; SSE2-NEXT: andq %r9, %rdx
; SSE2-NEXT: shlq $4, %rdx
; SSE2-NEXT: orq %r11, %rdx
; SSE2-NEXT: movq %rdx, %r11
; SSE2-NEXT: andq %rdi, %r11
; SSE2-NEXT: shrq $2, %rdx
; SSE2-NEXT: andq %rdi, %rdx
; SSE2-NEXT: leaq (%rdx,%r11,4), %rdx
; SSE2-NEXT: movq %rdx, %r11
; SSE2-NEXT: andq %r8, %r11
; SSE2-NEXT: shrq %rdx
; SSE2-NEXT: andq %r8, %rdx
; SSE2-NEXT: leaq (%rdx,%r11,2), %rdx
; SSE2-NEXT: bswapq %rsi
; SSE2-NEXT: movq %rsi, %r11
; SSE2-NEXT: shrq $4, %r11
; SSE2-NEXT: andq %r9, %r11
; SSE2-NEXT: andq %r9, %rsi
; SSE2-NEXT: shlq $4, %rsi
; SSE2-NEXT: orq %r11, %rsi
; SSE2-NEXT: movq %rsi, %r9
; SSE2-NEXT: andq %rdi, %r9
; SSE2-NEXT: shrq $2, %rsi
; SSE2-NEXT: andq %rdi, %rsi
; SSE2-NEXT: leaq (%rsi,%r9,4), %rsi
; SSE2-NEXT: movq %rsi, %rdi
; SSE2-NEXT: andq %r8, %rdi
; SSE2-NEXT: shrq %rsi
; SSE2-NEXT: andq %r8, %rsi
; SSE2-NEXT: leaq (%rsi,%rdi,2), %rsi
; SSE2-NEXT: movq %rsi, 24(%rax)
; SSE2-NEXT: movq %rdx, 16(%rax)
; SSE2-NEXT: movq %rcx, 8(%rax)
; SSE2-NEXT: movq %r10, (%rax)
; SSE2-NEXT: retq
;
; SSE42-LABEL: bitreverse_i256:
; SSE42: # %bb.0:
; SSE42-NEXT: movq %rdi, %rax
; SSE42-NEXT: movq %rcx, %xmm0
; SSE42-NEXT: movq %r8, %xmm1
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSE42-NEXT: pshufb %xmm0, %xmm1
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE42-NEXT: movdqa %xmm1, %xmm3
; SSE42-NEXT: pand %xmm2, %xmm3
; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSE42-NEXT: movdqa %xmm4, %xmm5
; SSE42-NEXT: pshufb %xmm3, %xmm5
; SSE42-NEXT: psrlw $4, %xmm1
; SSE42-NEXT: pand %xmm2, %xmm1
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSE42-NEXT: movdqa %xmm3, %xmm6
; SSE42-NEXT: pshufb %xmm1, %xmm6
; SSE42-NEXT: por %xmm5, %xmm6
; SSE42-NEXT: movq %rsi, %xmm1
; SSE42-NEXT: movq %rdx, %xmm5
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0]
; SSE42-NEXT: pshufb %xmm0, %xmm5
; SSE42-NEXT: movdqa %xmm5, %xmm0
; SSE42-NEXT: pand %xmm2, %xmm0
; SSE42-NEXT: pshufb %xmm0, %xmm4
; SSE42-NEXT: psrlw $4, %xmm5
; SSE42-NEXT: pand %xmm2, %xmm5
; SSE42-NEXT: pshufb %xmm5, %xmm3
; SSE42-NEXT: por %xmm4, %xmm3
; SSE42-NEXT: movdqa %xmm3, 16(%rdi)
; SSE42-NEXT: movdqa %xmm6, (%rdi)
; SSE42-NEXT: retq
;
; AVX2-LABEL: bitreverse_i256:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rsi, %xmm0
; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: vmovq %r8, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: bitreverse_i256:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq %rsi, %xmm0
; AVX512F-NEXT: vmovq %rdx, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: vmovq %rcx, %xmm1
; AVX512F-NEXT: vmovq %r8, %xmm2
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bitreverse_i256:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: vmovq %rsi, %xmm0
; AVX512VL-NEXT: vmovq %rdx, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: vmovq %rcx, %xmm1
; AVX512VL-NEXT: vmovq %r8, %xmm2
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bitreverse_i256:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: vmovq %rsi, %xmm0
; AVX512VBMI-NEXT: vmovq %rdx, %xmm1
; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VBMI-NEXT: vmovq %rcx, %xmm1
; AVX512VBMI-NEXT: vmovq %r8, %xmm2
; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512VBMI-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX512VBMI-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VBMI-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VBMI-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512VBMI-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VBMI-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VBMI-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VBMI-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512VBMI-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VBMI-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512VBMI-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VBMI-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%res = call i256 @llvm.bitreverse.i256(i256 %a0)
ret i256 %res
}

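; i256 bitreverse of a value bitcast from <4 x i64>: AVX2+ targets keep the whole value in a ymm register.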
define i256 @bitreverse_i256_vector(<4 x i64> %v0) nounwind {
; SSE2-LABEL: bitreverse_i256_vector:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %xmm1, %r9
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %r10
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rdi
; SSE2-NEXT: bswapq %rdi
; SSE2-NEXT: movq %rdi, %rdx
; SSE2-NEXT: shrq $4, %rdx
; SSE2-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
; SSE2-NEXT: andq %rsi, %rdx
; SSE2-NEXT: andq %rsi, %rdi
; SSE2-NEXT: shlq $4, %rdi
; SSE2-NEXT: orq %rdx, %rdi
; SSE2-NEXT: movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
; SSE2-NEXT: movq %rdi, %r8
; SSE2-NEXT: andq %rdx, %r8
; SSE2-NEXT: shrq $2, %rdi
; SSE2-NEXT: andq %rdx, %rdi
; SSE2-NEXT: leaq (%rdi,%r8,4), %r8
; SSE2-NEXT: movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
; SSE2-NEXT: movq %r8, %r11
; SSE2-NEXT: andq %rdi, %r11
; SSE2-NEXT: shrq %r8
; SSE2-NEXT: andq %rdi, %r8
; SSE2-NEXT: leaq (%r8,%r11,2), %r8
; SSE2-NEXT: bswapq %r10
; SSE2-NEXT: movq %r10, %r11
; SSE2-NEXT: shrq $4, %r11
; SSE2-NEXT: andq %rsi, %r11
; SSE2-NEXT: andq %rsi, %r10
; SSE2-NEXT: shlq $4, %r10
; SSE2-NEXT: orq %r11, %r10
; SSE2-NEXT: movq %r10, %r11
; SSE2-NEXT: andq %rdx, %r11
; SSE2-NEXT: shrq $2, %r10
; SSE2-NEXT: andq %rdx, %r10
; SSE2-NEXT: leaq (%r10,%r11,4), %r10
; SSE2-NEXT: movq %r10, %r11
; SSE2-NEXT: andq %rdi, %r11
; SSE2-NEXT: shrq %r10
; SSE2-NEXT: andq %rdi, %r10
; SSE2-NEXT: bswapq %r9
; SSE2-NEXT: leaq (%r10,%r11,2), %r10
; SSE2-NEXT: movq %r9, %r11
; SSE2-NEXT: shrq $4, %r11
; SSE2-NEXT: andq %rsi, %r11
; SSE2-NEXT: andq %rsi, %r9
; SSE2-NEXT: shlq $4, %r9
; SSE2-NEXT: orq %r11, %r9
; SSE2-NEXT: movq %r9, %r11
; SSE2-NEXT: andq %rdx, %r11
; SSE2-NEXT: shrq $2, %r9
; SSE2-NEXT: andq %rdx, %r9
; SSE2-NEXT: leaq (%r9,%r11,4), %r9
; SSE2-NEXT: movq %r9, %r11
; SSE2-NEXT: andq %rdi, %r11
; SSE2-NEXT: shrq %r9
; SSE2-NEXT: andq %rdi, %r9
; SSE2-NEXT: leaq (%r9,%r11,2), %r9
; SSE2-NEXT: bswapq %rcx
; SSE2-NEXT: movq %rcx, %r11
; SSE2-NEXT: shrq $4, %r11
; SSE2-NEXT: andq %rsi, %r11
; SSE2-NEXT: andq %rsi, %rcx
; SSE2-NEXT: shlq $4, %rcx
; SSE2-NEXT: orq %r11, %rcx
; SSE2-NEXT: movq %rcx, %rsi
; SSE2-NEXT: andq %rdx, %rsi
; SSE2-NEXT: shrq $2, %rcx
; SSE2-NEXT: andq %rdx, %rcx
; SSE2-NEXT: leaq (%rcx,%rsi,4), %rcx
; SSE2-NEXT: movq %rcx, %rdx
; SSE2-NEXT: andq %rdi, %rdx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andq %rdi, %rcx
; SSE2-NEXT: leaq (%rcx,%rdx,2), %rcx
; SSE2-NEXT: movq %rcx, 24(%rax)
; SSE2-NEXT: movq %r9, 8(%rax)
; SSE2-NEXT: movq %r10, 16(%rax)
; SSE2-NEXT: movq %r8, (%rax)
; SSE2-NEXT: retq
;
; SSE42-LABEL: bitreverse_i256_vector:
; SSE42: # %bb.0:
; SSE42-NEXT: movq %rdi, %rax
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; SSE42-NEXT: pshufb %xmm2, %xmm1
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE42-NEXT: movdqa %xmm1, %xmm4
; SSE42-NEXT: pand %xmm3, %xmm4
; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSE42-NEXT: movdqa %xmm5, %xmm6
; SSE42-NEXT: pshufb %xmm4, %xmm6
; SSE42-NEXT: psrlw $4, %xmm1
; SSE42-NEXT: pand %xmm3, %xmm1
; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSE42-NEXT: movdqa %xmm4, %xmm7
; SSE42-NEXT: pshufb %xmm1, %xmm7
; SSE42-NEXT: por %xmm6, %xmm7
; SSE42-NEXT: pshufb %xmm2, %xmm0
; SSE42-NEXT: movdqa %xmm0, %xmm1
; SSE42-NEXT: pand %xmm3, %xmm1
; SSE42-NEXT: pshufb %xmm1, %xmm5
; SSE42-NEXT: psrlw $4, %xmm0
; SSE42-NEXT: pand %xmm3, %xmm0
; SSE42-NEXT: pshufb %xmm0, %xmm4
; SSE42-NEXT: por %xmm5, %xmm4
; SSE42-NEXT: movdqa %xmm4, 16(%rdi)
; SSE42-NEXT: movdqa %xmm7, (%rdi)
; SSE42-NEXT: retq
;
; AVX2-LABEL: bitreverse_i256_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: bitreverse_i256_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bitreverse_i256_vector:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bitreverse_i256_vector:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX512VBMI-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VBMI-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VBMI-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512VBMI-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VBMI-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VBMI-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VBMI-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512VBMI-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VBMI-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512VBMI-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VBMI-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = bitcast <4 x i64> %v0 to i256
%res = call i256 @llvm.bitreverse.i256(i256 %a0)
ret i256 %res
}

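; i256 bitreverse of a loaded value: AVX2+ targets fold the load and limb reversal into a single vpermq from memory.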
define i256 @bitreverse_i256_load(ptr %p0) nounwind {
; SSE2-LABEL: bitreverse_i256_load:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: movq (%rsi), %rcx
; SSE2-NEXT: movq 8(%rsi), %r9
; SSE2-NEXT: movq 16(%rsi), %r10
; SSE2-NEXT: movq 24(%rsi), %rdi
; SSE2-NEXT: bswapq %rdi
; SSE2-NEXT: movq %rdi, %rdx
; SSE2-NEXT: shrq $4, %rdx
; SSE2-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
; SSE2-NEXT: andq %rsi, %rdx
; SSE2-NEXT: andq %rsi, %rdi
; SSE2-NEXT: shlq $4, %rdi
; SSE2-NEXT: orq %rdx, %rdi
; SSE2-NEXT: movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
; SSE2-NEXT: movq %rdi, %r8
; SSE2-NEXT: andq %rdx, %r8
; SSE2-NEXT: shrq $2, %rdi
; SSE2-NEXT: andq %rdx, %rdi
; SSE2-NEXT: leaq (%rdi,%r8,4), %r8
; SSE2-NEXT: movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
; SSE2-NEXT: movq %r8, %r11
; SSE2-NEXT: andq %rdi, %r11
; SSE2-NEXT: shrq %r8
; SSE2-NEXT: andq %rdi, %r8
; SSE2-NEXT: leaq (%r8,%r11,2), %r8
; SSE2-NEXT: bswapq %r10
; SSE2-NEXT: movq %r10, %r11
; SSE2-NEXT: shrq $4, %r11
; SSE2-NEXT: andq %rsi, %r11
; SSE2-NEXT: andq %rsi, %r10
; SSE2-NEXT: shlq $4, %r10
; SSE2-NEXT: orq %r11, %r10
; SSE2-NEXT: movq %r10, %r11
; SSE2-NEXT: andq %rdx, %r11
; SSE2-NEXT: shrq $2, %r10
; SSE2-NEXT: andq %rdx, %r10
; SSE2-NEXT: leaq (%r10,%r11,4), %r10
; SSE2-NEXT: movq %r10, %r11
; SSE2-NEXT: andq %rdi, %r11
; SSE2-NEXT: shrq %r10
; SSE2-NEXT: andq %rdi, %r10
; SSE2-NEXT: bswapq %r9
; SSE2-NEXT: leaq (%r10,%r11,2), %r10
; SSE2-NEXT: movq %r9, %r11
; SSE2-NEXT: shrq $4, %r11
; SSE2-NEXT: andq %rsi, %r11
; SSE2-NEXT: andq %rsi, %r9
; SSE2-NEXT: shlq $4, %r9
; SSE2-NEXT: orq %r11, %r9
; SSE2-NEXT: movq %r9, %r11
; SSE2-NEXT: andq %rdx, %r11
; SSE2-NEXT: shrq $2, %r9
; SSE2-NEXT: andq %rdx, %r9
; SSE2-NEXT: leaq (%r9,%r11,4), %r9
; SSE2-NEXT: movq %r9, %r11
; SSE2-NEXT: andq %rdi, %r11
; SSE2-NEXT: shrq %r9
; SSE2-NEXT: andq %rdi, %r9
; SSE2-NEXT: leaq (%r9,%r11,2), %r9
; SSE2-NEXT: bswapq %rcx
; SSE2-NEXT: movq %rcx, %r11
; SSE2-NEXT: shrq $4, %r11
; SSE2-NEXT: andq %rsi, %r11
; SSE2-NEXT: andq %rsi, %rcx
; SSE2-NEXT: shlq $4, %rcx
; SSE2-NEXT: orq %r11, %rcx
; SSE2-NEXT: movq %rcx, %rsi
; SSE2-NEXT: andq %rdx, %rsi
; SSE2-NEXT: shrq $2, %rcx
; SSE2-NEXT: andq %rdx, %rcx
; SSE2-NEXT: leaq (%rcx,%rsi,4), %rcx
; SSE2-NEXT: movq %rcx, %rdx
; SSE2-NEXT: andq %rdi, %rdx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andq %rdi, %rcx
; SSE2-NEXT: leaq (%rcx,%rdx,2), %rcx
; SSE2-NEXT: movq %rcx, 24(%rax)
; SSE2-NEXT: movq %r9, 16(%rax)
; SSE2-NEXT: movq %r10, 8(%rax)
; SSE2-NEXT: movq %r8, (%rax)
; SSE2-NEXT: retq
;
; SSE42-LABEL: bitreverse_i256_load:
; SSE42: # %bb.0:
; SSE42-NEXT: movq %rdi, %rax
; SSE42-NEXT: movdqa (%rsi), %xmm0
; SSE42-NEXT: movdqa 16(%rsi), %xmm1
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; SSE42-NEXT: pshufb %xmm2, %xmm1
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE42-NEXT: movdqa %xmm1, %xmm4
; SSE42-NEXT: pand %xmm3, %xmm4
; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSE42-NEXT: movdqa %xmm5, %xmm6
; SSE42-NEXT: pshufb %xmm4, %xmm6
; SSE42-NEXT: psrlw $4, %xmm1
; SSE42-NEXT: pand %xmm3, %xmm1
; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSE42-NEXT: movdqa %xmm4, %xmm7
; SSE42-NEXT: pshufb %xmm1, %xmm7
; SSE42-NEXT: por %xmm6, %xmm7
; SSE42-NEXT: pshufb %xmm2, %xmm0
; SSE42-NEXT: movdqa %xmm0, %xmm1
; SSE42-NEXT: pand %xmm3, %xmm1
; SSE42-NEXT: pshufb %xmm1, %xmm5
; SSE42-NEXT: psrlw $4, %xmm0
; SSE42-NEXT: pand %xmm3, %xmm0
; SSE42-NEXT: pshufb %xmm0, %xmm4
; SSE42-NEXT: por %xmm5, %xmm4
; SSE42-NEXT: movdqa %xmm4, 16(%rdi)
; SSE42-NEXT: movdqa %xmm7, (%rdi)
; SSE42-NEXT: retq
;
; AVX2-LABEL: bitreverse_i256_load:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: bitreverse_i256_load:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bitreverse_i256_load:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bitreverse_i256_load:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0]
; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX512VBMI-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VBMI-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VBMI-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512VBMI-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VBMI-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VBMI-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VBMI-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512VBMI-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VBMI-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512VBMI-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VBMI-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i256, ptr %p0
%res = call i256 @llvm.bitreverse.i256(i256 %a0)
ret i256 %res
}

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX512: {{.*}}
; CHECK: {{.*}}