You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
879 lines
42 KiB
879 lines
42 KiB
/*
|
|
|
|
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
|
|
|
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
|
Copyright (c) 2012, The University of Waikato
|
|
|
|
All rights reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are met:
|
|
* Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
* Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
* Neither the name of the organization nor the
|
|
names of its contributors may be used to endorse or promote products
|
|
derived from this software without specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
|
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
|
# Three exported labels with no instructions emitted between them in this
# file, so each one simply marks the (aligned) address of whatever follows.
# NOTE(review): the names suggest placeholders for symbols that a NEON build
# would provide -- confirm against the callers before relying on this.
.globl _neon_x4
|
|
.align 4
|
|
_neon_x4:
|
|
|
|
.globl _neon_x8
|
|
.align 4
|
|
_neon_x8:
|
|
|
|
.globl _neon_x8_t
|
|
.align 4
|
|
_neon_x8_t:
|
|
|
|
|
|
# leaf_ee_init -- set up the two registers the leaf routines below rely on:
#   r9  <- 64-bit value stored at byte offset 0xe0 of the object rdi points to
#   eax <- 0 (writing eax also clears the upper half of rax)
# There is no ret here: execution continues into the code that follows.
# NOTE(review): the layout of the object at rdi is not visible in this file;
# the 0xe0 offset must match the structure definition used by the caller.
#ifdef __APPLE__
|
|
.globl _leaf_ee_init
|
|
_leaf_ee_init:
|
|
#else
|
|
.globl leaf_ee_init
|
|
leaf_ee_init:
|
|
#endif
|
|
# Disabled alternative that would take the constants address RIP-relative.
#lea L_sse_constants(%rip), %r9
|
|
movq 0xe0(%rdi), %r9
|
|
xorl %eax, %eax
|
|
# Register roles assumed by the leaf routines that follow:
# eax is loop counter (init to 0)
|
|
# rcx is loop max count
|
|
# rsi is 'in' base pointer
|
|
# rdx is 'out' base pointer
|
|
# r8 is offsets pointer
|
|
# r9 is constants pointer
|
|
# scratch: rax r11 r12
|
|
# .align 4, 0x90
|
|
|
|
# leaf_ee -- looped SSE kernel. Each pass of LEAF_EE_1:
#   * loads eight 16-byte vectors from rsi + rax*4 + displacement,
#   * reads two sign-extended 32-bit indices from the table at r8
#     (r8 + rax*4 and 8 bytes past it) into r11 and r12,
#   * advances rax by 4,
#   * stores four vectors at rdx + r11*4 and four at rdx + r12*4,
# and repeats until rax equals rcx. rax is not initialised here.
# Every load displacement is written as 0xFECA; the byte position of each
# one is recorded in the sse_leaf_ee_offsets table later in this file.
# NOTE(review): presumably a run-time code generator copies this routine and
# overwrites the placeholders -- that consumer is not in this file; confirm.
# Do not reorder or re-encode instructions: the offset table and the
# alignment note below both depend on the exact byte layout.
# There is no ret after the loop; control continues into the next routine.
# The two movaps before LEAF_EE_1 encode to 5 + 4 = 9 bytes, which is where
# the "+ 9" in the next note comes from.
# _leaf_ee + 9 needs 16 byte alignment
|
|
#ifdef __APPLE__
|
|
.globl _leaf_ee
|
|
_leaf_ee:
|
|
#else
|
|
.globl leaf_ee
|
|
leaf_ee:
|
|
#endif
|
|
# Loop-invariant constants: xmm0 = 16 bytes at r9+32, xmm8 = 16 bytes at r9.
# xmm8 is only ever used as an xorps mask below.
movaps 32(%r9), %xmm0 #83.5
|
|
movaps (%r9), %xmm8 #83.5
|
|
LEAF_EE_1:
|
|
LEAF_EE_const_0:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm7 #83.5
|
|
LEAF_EE_const_2:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm12 #83.5
|
|
movaps %xmm7, %xmm6 #83.5
|
|
LEAF_EE_const_3:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
|
|
movaps %xmm12, %xmm11 #83.5
|
|
subps %xmm10, %xmm12 #83.5
|
|
addps %xmm10, %xmm11 #83.5
|
|
xorps %xmm8, %xmm12 #83.5
|
|
LEAF_EE_const_1:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm9 #83.5
|
|
LEAF_EE_const_4:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
|
|
addps %xmm9, %xmm6 #83.5
|
|
subps %xmm9, %xmm7 #83.5
|
|
LEAF_EE_const_5:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm13 #83.5
|
|
movaps %xmm10, %xmm9 #83.5
|
|
LEAF_EE_const_6:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm3 #83.5
|
|
movaps %xmm6, %xmm5 #83.5
|
|
LEAF_EE_const_7:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm14 #83.5
|
|
movaps %xmm3, %xmm15 #83.5
|
|
# shufps $177 with identical operands swaps the two floats inside each
# 64-bit half of the register.
shufps $177, %xmm12, %xmm12 #83.5
|
|
movaps %xmm7, %xmm4 #83.5
|
|
# r11 = first output index, read before rax is advanced.
movslq (%r8, %rax, 4), %r11 #83.44
|
|
subps %xmm13, %xmm10 #83.5
|
|
subps %xmm14, %xmm3 #83.5
|
|
addps %xmm11, %xmm5 #83.5
|
|
subps %xmm11, %xmm6 #83.5
|
|
subps %xmm12, %xmm4 #83.5
|
|
addps %xmm12, %xmm7 #83.5
|
|
addps %xmm13, %xmm9 #83.5
|
|
addps %xmm14, %xmm15 #83.5
|
|
# Two copies of the 16 bytes at r9+16; each is scaled by a data vector and
# later combined with the pair-swapped data scaled by xmm0.
movaps 16(%r9), %xmm12 #83.5
|
|
movaps %xmm9, %xmm1 #83.5
|
|
movaps 16(%r9), %xmm11 #83.5
|
|
movaps %xmm5, %xmm2 #83.5
|
|
mulps %xmm10, %xmm12 #83.5
|
|
subps %xmm15, %xmm9 #83.5
|
|
addps %xmm15, %xmm1 #83.5
|
|
mulps %xmm3, %xmm11 #83.5
|
|
addps %xmm1, %xmm2 #83.5
|
|
subps %xmm1, %xmm5 #83.5
|
|
shufps $177, %xmm10, %xmm10 #83.5
|
|
xorps %xmm8, %xmm9 #83.5
|
|
shufps $177, %xmm3, %xmm3 #83.5
|
|
movaps %xmm6, %xmm1 #83.5
|
|
mulps %xmm0, %xmm10 #83.5
|
|
movaps %xmm4, %xmm13 #83.5
|
|
mulps %xmm0, %xmm3 #83.5
|
|
subps %xmm10, %xmm12 #83.5
|
|
addps %xmm3, %xmm11 #83.5
|
|
movaps %xmm12, %xmm3 #83.5
|
|
movaps %xmm7, %xmm14 #83.5
|
|
shufps $177, %xmm9, %xmm9 #83.5
|
|
subps %xmm11, %xmm12 #83.5
|
|
addps %xmm11, %xmm3 #83.5
|
|
subps %xmm9, %xmm1 #83.5
|
|
addps %xmm9, %xmm6 #83.5
|
|
addps %xmm3, %xmm4 #83.5
|
|
subps %xmm3, %xmm13 #83.5
|
|
xorps %xmm8, %xmm12 #83.5
|
|
movaps %xmm2, %xmm3 #83.5
|
|
shufps $177, %xmm12, %xmm12 #83.5
|
|
movaps %xmm6, %xmm9 #83.5
|
|
# r12 = second output index, also read before rax is advanced.
movslq 8(%r8, %rax, 4), %r12 #83.59
|
|
# From here on results are interleaved for storing:
#   movlhps src,dst    -> dst = { low half of dst,  low half of src }
#   shufps $238 src,dst -> dst = { high half of dst, high half of src }
movlhps %xmm4, %xmm3 #83.5
|
|
# Advance the loop counter; all loads and index reads for this pass are done.
addq $4, %rax
|
|
shufps $238, %xmm4, %xmm2 #83.5
|
|
movaps %xmm1, %xmm4 #83.5
|
|
# Disabled non-temporal variant of the first store.
#movntdq %xmm3, (%rdx,%r11,4) #83.5
|
|
subps %xmm12, %xmm7 #83.5
|
|
addps %xmm12, %xmm14 #83.5
|
|
movlhps %xmm7, %xmm4 #83.5
|
|
shufps $238, %xmm7, %xmm1 #83.5
|
|
movaps %xmm5, %xmm7 #83.5
|
|
movlhps %xmm13, %xmm7 #83.5
|
|
movlhps %xmm14, %xmm9 #83.5
|
|
shufps $238, %xmm13, %xmm5 #83.5
|
|
shufps $238, %xmm14, %xmm6 #83.5
|
|
# Four "low half" vectors go to the 64 bytes at rdx + r11*4 ...
movaps %xmm3, (%rdx,%r11,4) #83.5
|
|
movaps %xmm4, 16(%rdx,%r11,4) #83.5
|
|
movaps %xmm7, 32(%rdx,%r11,4) #83.5
|
|
movaps %xmm9, 48(%rdx,%r11,4) #83.5
|
|
# ... and four "high half" vectors to the 64 bytes at rdx + r12*4.
movaps %xmm2, (%rdx,%r12,4) #83.5
|
|
movaps %xmm1, 16(%rdx,%r12,4) #83.5
|
|
movaps %xmm5, 32(%rdx,%r12,4) #83.5
|
|
movaps %xmm6, 48(%rdx,%r12,4) #83.5
|
|
# Exit test is equality only: the loop ends when rax == rcx exactly.
cmpq %rcx, %rax
|
|
jne LEAF_EE_1
|
|
|
|
|
|
|
|
# leaf_oo -- looped SSE kernel with the same outer shape as leaf_ee:
# eight 16-byte loads from rsi + rax*4 + placeholder displacement, two
# sign-extended 32-bit indices from the table at r8 into r11 / r12, rax
# advanced by 4, then four stores at rdx + r11*4 and four at rdx + r12*4.
# Repeats until rax == rcx. Unlike leaf_ee this routine uses no multiplies;
# only adds, subtracts, the xmm5 xor mask and shuffles.
# rax is not initialised here; it is used as found on entry.
# Placeholder positions are recorded in sse_leaf_oo_offsets later in this
# file, so instruction order and encodings must stay as they are.
# There is no ret after the loop; control continues into the next routine.
# The single movaps before LEAF_OO_1 encodes to 4 bytes, hence "+ 4" below.
# _leaf_oo + 4 needs to be 16 byte aligned
|
|
#ifdef __APPLE__
|
|
.globl _leaf_oo
|
|
_leaf_oo:
|
|
#else
|
|
.globl leaf_oo
|
|
leaf_oo:
|
|
#endif
|
|
# Loop-invariant xor mask: the 16 bytes at r9.
movaps (%r9), %xmm5 #92.7
|
|
LEAF_OO_1:
|
|
LEAF_OO_const_0:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm4 #93.5
|
|
movaps %xmm4, %xmm6 #93.5
|
|
LEAF_OO_const_1:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm7 #93.5
|
|
LEAF_OO_const_2:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm10 #93.5
|
|
addps %xmm7, %xmm6 #93.5
|
|
subps %xmm7, %xmm4 #93.5
|
|
LEAF_OO_const_3:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm8 #93.5
|
|
movaps %xmm10, %xmm9 #93.5
|
|
LEAF_OO_const_4:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm1 #93.5
|
|
movaps %xmm6, %xmm3 #93.5
|
|
LEAF_OO_const_5:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm11 #93.5
|
|
movaps %xmm1, %xmm2 #93.5
|
|
LEAF_OO_const_6:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm14 #93.5
|
|
movaps %xmm4, %xmm15 #93.5
|
|
LEAF_OO_const_7:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm12 #93.5
|
|
movaps %xmm14, %xmm13 #93.5
|
|
# r11 = first output index, read before rax is advanced.
movslq (%r8, %rax, 4), %r11 #83.44
|
|
# Pairwise sums and differences of the eight inputs.
subps %xmm8, %xmm10 #93.5
|
|
addps %xmm8, %xmm9 #93.5
|
|
addps %xmm11, %xmm2 #93.5
|
|
subps %xmm12, %xmm14 #93.5
|
|
subps %xmm11, %xmm1 #93.5
|
|
addps %xmm12, %xmm13 #93.5
|
|
addps %xmm9, %xmm3 #93.5
|
|
subps %xmm9, %xmm6 #93.5
|
|
# Two of the differences get the xmm5 mask applied and then have the floats
# in each 64-bit half swapped (shufps $177).
xorps %xmm5, %xmm10 #93.5
|
|
xorps %xmm5, %xmm14 #93.5
|
|
shufps $177, %xmm10, %xmm10 #93.5
|
|
movaps %xmm2, %xmm9 #93.5
|
|
shufps $177, %xmm14, %xmm14 #93.5
|
|
movaps %xmm6, %xmm7 #93.5
|
|
# r12 = second output index; rax is advanced immediately afterwards.
movslq 8(%r8, %rax, 4), %r12 #83.59
|
|
addq $4, %rax #92.18
|
|
addps %xmm10, %xmm4 #93.5
|
|
addps %xmm13, %xmm9 #93.5
|
|
subps %xmm13, %xmm2 #93.5
|
|
subps %xmm10, %xmm15 #93.5
|
|
movaps %xmm1, %xmm13 #93.5
|
|
movaps %xmm2, %xmm8 #93.5
|
|
# Interleave for storing:
#   movlhps src,dst    -> dst = { low half of dst,  low half of src }
#   shufps $238 src,dst -> dst = { high half of dst, high half of src }
movlhps %xmm4, %xmm7 #93.5
|
|
subps %xmm14, %xmm13 #93.5
|
|
addps %xmm14, %xmm1 #93.5
|
|
shufps $238, %xmm4, %xmm6 #93.5
|
|
movaps %xmm3, %xmm14 #93.5
|
|
movaps %xmm9, %xmm4 #93.5
|
|
movlhps %xmm15, %xmm14 #93.5
|
|
movlhps %xmm13, %xmm4 #93.5
|
|
movlhps %xmm1, %xmm8 #93.5
|
|
shufps $238, %xmm15, %xmm3 #93.5
|
|
shufps $238, %xmm13, %xmm9 #93.5
|
|
shufps $238, %xmm1, %xmm2 #93.5
|
|
# Four "low half" vectors to rdx + r11*4 ...
movaps %xmm14, (%rdx,%r11,4) #93.5
|
|
movaps %xmm7, 16(%rdx,%r11,4) #93.5
|
|
movaps %xmm4, 32(%rdx,%r11,4) #93.5
|
|
movaps %xmm8, 48(%rdx,%r11,4) #93.5
|
|
# ... and four "high half" vectors to rdx + r12*4.
movaps %xmm3, (%rdx,%r12,4) #93.5
|
|
movaps %xmm6, 16(%rdx,%r12,4) #93.5
|
|
movaps %xmm9, 32(%rdx,%r12,4) #93.5
|
|
movaps %xmm2, 48(%rdx,%r12,4) #93.5
|
|
# Exit test is equality only: the loop ends when rax == rcx exactly.
cmpq %rcx, %rax
|
|
jne LEAF_OO_1 # Prob 95% #92.14
|
|
|
|
# leaf_eo -- single-pass SSE kernel (no loop). It loads eight 16-byte
# vectors from rsi + rax*4 + placeholder displacement, reads two
# sign-extended 32-bit indices from the table at r8 (r12 from 8 bytes past
# r8 + rax*4, r11 from r8 + rax*4), advances rax by 4, and stores four
# vectors at rdx + r11*4 and four at rdx + r12*4.
# Constants come from r9: offset 0 (xor mask, kept in xmm3), 0x30 and 0x40.
# rax is not initialised here; it is used as found on entry.
# Placeholder positions are recorded in sse_leaf_eo_offsets later in this
# file, so instruction order and encodings must stay as they are.
# There is no ret; control continues into the next routine.
#ifdef __APPLE__
|
|
.globl _leaf_eo
|
|
_leaf_eo:
|
|
#else
|
|
.globl leaf_eo
|
|
leaf_eo:
|
|
#endif
|
|
# First group: inputs 0-3.
LEAF_EO_const_0:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm9 #88.5
|
|
LEAF_EO_const_2:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm7 #88.5
|
|
movaps %xmm9, %xmm11 #88.5
|
|
LEAF_EO_const_3:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm5 #88.5
|
|
movaps %xmm7, %xmm6 #88.5
|
|
LEAF_EO_const_1:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
|
|
subps %xmm5, %xmm7 #88.5
|
|
addps %xmm4, %xmm11 #88.5
|
|
subps %xmm4, %xmm9 #88.5
|
|
addps %xmm5, %xmm6 #88.5
|
|
# xmm3 = xor mask (16 bytes at r9); it stays live until the final xorps.
movaps (%r9), %xmm3 #88.5
|
|
movaps %xmm11, %xmm10 #88.5
|
|
xorps %xmm3, %xmm7 #88.5
|
|
movaps %xmm9, %xmm8 #88.5
|
|
# shufps $177 with identical operands swaps the floats in each 64-bit half.
shufps $177, %xmm7, %xmm7 #88.5
|
|
addps %xmm6, %xmm10 #88.5
|
|
subps %xmm6, %xmm11 #88.5
|
|
subps %xmm7, %xmm8 #88.5
|
|
addps %xmm7, %xmm9 #88.5
|
|
# Output indices, both read before rax is advanced further down.
movslq 8(%r8, %rax, 4), %r12 #83.59
|
|
movaps %xmm10, %xmm2 #88.5
|
|
movslq (%r8, %rax, 4), %r11 #83.44
|
|
movaps %xmm11, %xmm1 #88.5
|
|
# shufps $238 src,dst -> dst = { high half of dst, high half of src }.
shufps $238, %xmm8, %xmm10 #88.5
|
|
shufps $238, %xmm9, %xmm11 #88.5
|
|
# The first two vectors of the r12 output group are stored early.
movaps %xmm10, (%rdx,%r12,4) #88.5
|
|
movaps %xmm11, 16(%rdx,%r12,4) #88.5
|
|
# Second group: inputs 4-7.
LEAF_EO_const_4:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm15 #88.5
|
|
LEAF_EO_const_5:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm12 #88.5
|
|
movaps %xmm15, %xmm14 #88.5
|
|
LEAF_EO_const_6:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
|
|
addps %xmm12, %xmm14 #88.5
|
|
subps %xmm12, %xmm15 #88.5
|
|
LEAF_EO_const_7:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm13 #88.5
|
|
movaps %xmm4, %xmm5 #88.5
|
|
movaps %xmm14, %xmm7 #88.5
|
|
addps %xmm13, %xmm5 #88.5
|
|
subps %xmm13, %xmm4 #88.5
|
|
# movlhps src,dst -> dst = { low half of dst, low half of src }.
movlhps %xmm8, %xmm2 #88.5
|
|
movaps %xmm5, %xmm8 #88.5
|
|
movlhps %xmm15, %xmm7 #88.5
|
|
xorps %xmm3, %xmm15 #88.5
|
|
movaps %xmm5, %xmm6 #88.5
|
|
subps %xmm14, %xmm5 #88.5
|
|
addps %xmm14, %xmm6 #88.5
|
|
movlhps %xmm9, %xmm1 #88.5
|
|
movaps %xmm4, %xmm14 #88.5
|
|
movlhps %xmm4, %xmm8 #88.5
|
|
movaps %xmm1, %xmm12 #88.5
|
|
shufps $177, %xmm15, %xmm15 #88.5
|
|
# The 16 bytes at r9+0x30 and r9+0x40 are used as multiplier pairs: one
# scales a vector directly, the other scales its pair-swapped copy.
movaps 0x30(%r9), %xmm11 #88.5
|
|
# All eight loads and both index reads are done; advance the counter.
addq $4, %rax #90.5
|
|
subps %xmm15, %xmm14 #88.5
|
|
mulps %xmm7, %xmm11 #88.5
|
|
addps %xmm15, %xmm4 #88.5
|
|
movaps 0x30(%r9), %xmm9 #88.5
|
|
movaps 0x40(%r9), %xmm15 #88.5
|
|
shufps $177, %xmm7, %xmm7 #88.5
|
|
mulps %xmm8, %xmm9 #88.5
|
|
mulps %xmm15, %xmm7 #88.5
|
|
shufps $177, %xmm8, %xmm8 #88.5
|
|
subps %xmm7, %xmm11 #88.5
|
|
mulps %xmm15, %xmm8 #88.5
|
|
movaps %xmm11, %xmm10 #88.5
|
|
addps %xmm8, %xmm9 #88.5
|
|
shufps $238, %xmm14, %xmm6 #88.5
|
|
subps %xmm9, %xmm11 #88.5
|
|
addps %xmm9, %xmm10 #88.5
|
|
xorps %xmm3, %xmm11 #88.5
|
|
movaps %xmm2, %xmm3 #88.5
|
|
shufps $177, %xmm11, %xmm11 #88.5
|
|
subps %xmm10, %xmm3 #88.5
|
|
addps %xmm10, %xmm2 #88.5
|
|
addps %xmm11, %xmm12 #88.5
|
|
subps %xmm11, %xmm1 #88.5
|
|
shufps $238, %xmm4, %xmm5 #88.5
|
|
# Remaining two vectors of the r12 group, then the four of the r11 group.
movaps %xmm5, 48(%rdx,%r12,4) #88.5
|
|
movaps %xmm6, 32(%rdx,%r12,4) #88.5
|
|
movaps %xmm2, (%rdx,%r11,4) #88.5
|
|
movaps %xmm1, 16(%rdx,%r11,4) #88.5
|
|
movaps %xmm3, 32(%rdx,%r11,4) #88.5
|
|
movaps %xmm12, 48(%rdx,%r11,4) #88.5
|
|
|
|
|
|
# leaf_oe -- single-pass SSE kernel (no loop). It loads eight 16-byte
# vectors from rsi + rax*4 + placeholder displacement, reads two
# sign-extended 32-bit indices from the table at r8 into r11 and r12,
# advances rax by 4, and stores four vectors at rdx + r11*4 and four at
# rdx + r12*4.
# Constants come from r9: offset 0 (xor mask, kept in xmm0), 0x30 and 0x40.
# rax is not initialised here; it is used as found on entry.
# Placeholder positions are recorded in sse_leaf_oe_offsets later in this
# file, so instruction order and encodings must stay as they are.
# There is no ret; the leaf_end label follows directly.
#ifdef __APPLE__
|
|
.globl _leaf_oe
|
|
_leaf_oe:
|
|
#else
|
|
.globl leaf_oe
|
|
leaf_oe:
|
|
#endif
|
|
# xmm0 = xor mask (16 bytes at r9), live for the whole routine.
movaps (%r9), %xmm0 #59.5
|
|
#movaps 0x20(%r9), %xmm1 #59.5
|
|
LEAF_OE_const_2:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm6 #70.5
|
|
LEAF_OE_const_3:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm8 #70.5
|
|
movaps %xmm6, %xmm10 #70.5
|
|
# shufps $228 src,dst -> dst = { low half of dst, high half of src }, so the
# next three instructions recombine the halves of inputs 2 and 3.
shufps $228, %xmm8, %xmm10 #70.5
|
|
movaps %xmm10, %xmm9 #70.5
|
|
shufps $228, %xmm6, %xmm8 #70.5
|
|
LEAF_OE_const_0:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm12 #70.5
|
|
LEAF_OE_const_1:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
|
|
movaps %xmm12, %xmm14 #70.5
|
|
# r11 = first output index, read before rax is advanced.
movslq (%r8, %rax, 4), %r11 #83.44
|
|
addps %xmm8, %xmm9 #70.5
|
|
subps %xmm8, %xmm10 #70.5
|
|
addps %xmm7, %xmm14 #70.5
|
|
subps %xmm7, %xmm12 #70.5
|
|
movaps %xmm9, %xmm4 #70.5
|
|
movaps %xmm14, %xmm13 #70.5
|
|
# shufps $238 src,dst -> dst = { high half of dst, high half of src }.
shufps $238, %xmm10, %xmm4 #70.5
|
|
xorps %xmm0, %xmm10 #70.5
|
|
# shufps $177 with identical operands swaps the floats in each 64-bit half.
shufps $177, %xmm10, %xmm10 #70.5
|
|
movaps %xmm12, %xmm11 #70.5
|
|
movaps %xmm14, %xmm5 #70.5
|
|
addps %xmm9, %xmm13 #70.5
|
|
subps %xmm10, %xmm11 #70.5
|
|
subps %xmm9, %xmm14 #70.5
|
|
shufps $238, %xmm12, %xmm5 #70.5
|
|
addps %xmm10, %xmm12 #70.5
|
|
# r12 = second output index, also read before rax is advanced.
movslq 8(%r8, %rax, 4), %r12 #83.59
|
|
# movlhps src,dst -> dst = { low half of dst, low half of src }.
movlhps %xmm11, %xmm13 #70.5
|
|
# First vector of the r11 output group is stored early.
movaps %xmm13, (%rdx,%r11,4) #70.5
|
|
# The 16 bytes at r9+0x30 and r9+0x40 are used as multiplier pairs: one
# scales a vector directly, the other scales its pair-swapped copy.
movaps 0x30(%r9), %xmm13 #70.5
|
|
movlhps %xmm12, %xmm14 #70.5
|
|
movaps 0x40(%r9), %xmm12 #70.5
|
|
mulps %xmm5, %xmm13 #70.5
|
|
shufps $177, %xmm5, %xmm5 #70.5
|
|
mulps %xmm12, %xmm5 #70.5
|
|
movaps %xmm14, 16(%rdx,%r11,4) #70.5
|
|
subps %xmm5, %xmm13 #70.5
|
|
movaps 0x30(%r9), %xmm5 #70.5
|
|
mulps %xmm4, %xmm5 #70.5
|
|
shufps $177, %xmm4, %xmm4 #70.5
|
|
mulps %xmm12, %xmm4 #70.5
|
|
# Second group: inputs 4-7.
LEAF_OE_const_4:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm9 #70.5
|
|
addps %xmm4, %xmm5 #70.5
|
|
LEAF_OE_const_6:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
|
|
movaps %xmm9, %xmm3 #70.5
|
|
LEAF_OE_const_7:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm2 #70.5
|
|
movaps %xmm7, %xmm6 #70.5
|
|
LEAF_OE_const_5:
|
|
movaps 0xFECA(%rsi,%rax,4), %xmm15 #70.5
|
|
movaps %xmm13, %xmm4 #70.5
|
|
subps %xmm2, %xmm7 #70.5
|
|
addps %xmm15, %xmm3 #70.5
|
|
subps %xmm15, %xmm9 #70.5
|
|
addps %xmm2, %xmm6 #70.5
|
|
subps %xmm5, %xmm13 #70.5
|
|
addps %xmm5, %xmm4 #70.5
|
|
xorps %xmm0, %xmm7 #70.5
|
|
# All eight loads and both index reads are done; advance the counter.
addq $4, %rax #72.5
|
|
movaps %xmm3, %xmm2 #70.5
|
|
shufps $177, %xmm7, %xmm7 #70.5
|
|
movaps %xmm9, %xmm8 #70.5
|
|
xorps %xmm0, %xmm13 #70.5
|
|
addps %xmm6, %xmm2 #70.5
|
|
subps %xmm7, %xmm8 #70.5
|
|
subps %xmm6, %xmm3 #70.5
|
|
addps %xmm7, %xmm9 #70.5
|
|
movaps %xmm2, %xmm10 #70.5
|
|
movaps %xmm3, %xmm11 #70.5
|
|
shufps $238, %xmm8, %xmm2 #70.5
|
|
shufps $238, %xmm9, %xmm3 #70.5
|
|
movaps %xmm2, %xmm14 #70.5
|
|
shufps $177, %xmm13, %xmm13 #70.5
|
|
subps %xmm4, %xmm14 #70.5
|
|
addps %xmm4, %xmm2 #70.5
|
|
movaps %xmm3, %xmm4 #70.5
|
|
subps %xmm13, %xmm3 #70.5
|
|
addps %xmm13, %xmm4 #70.5
|
|
movlhps %xmm8, %xmm10 #70.5
|
|
movlhps %xmm9, %xmm11 #70.5
|
|
# Remaining two vectors of the r11 group, then the four of the r12 group.
movaps %xmm10, 32(%rdx,%r11,4) #70.5
|
|
movaps %xmm11, 48(%rdx,%r11,4) #70.5
|
|
movaps %xmm2, (%rdx,%r12,4) #70.5
|
|
movaps %xmm3, 16(%rdx,%r12,4) #70.5
|
|
movaps %xmm14, 32(%rdx,%r12,4) #70.5
|
|
movaps %xmm4, 48(%rdx,%r12,4) #70.5
|
|
|
|
|
|
# leaf_end -- exported label with no instructions of its own; it marks the
# address immediately after the last store of leaf_oe.
# NOTE(review): presumably lets external code compute the byte length of the
# preceding leaf routine by subtracting labels -- confirm against the users.
#ifdef __APPLE__
|
|
.globl _leaf_end
|
|
_leaf_end:
|
|
#else
|
|
.globl leaf_end
|
|
leaf_end:
|
|
#endif
|
|
|
|
# x_init -- register setup for the x4 / x8 routines below:
#   xmm3 <- 16 bytes at r9 (used below only as an xorps mask)
#   r8   <- 64-bit value stored at byte offset 0x20 of the object at rdi
# r9 is used as found on entry. There is no ret: execution continues into x4.
# NOTE(review): the layout of the object at rdi is not visible in this file;
# the 0x20 offset must match the structure definition used by the caller.
#ifdef __APPLE__
|
|
.globl _x_init
|
|
_x_init:
|
|
#else
|
|
.globl x_init
|
|
x_init:
|
|
#endif
|
|
# Disabled alternative that would load the mask RIP-relative.
#movaps L_sse_constants(%rip), %xmm3 #34.3
|
|
movaps (%r9), %xmm3 #34.3
|
|
movq 0x20(%rdi),%r8
|
|
# x4 -- in-place update of the 128 bytes at rdx (eight 16-byte vectors,
# called d0, d16 ... d112 below after their byte offsets).
# Inputs (as used by the code):
#   rdx  = data pointer; all accesses are movaps, so it must be 16-byte aligned
#   r8   = pointer to four 16-byte multiplier vectors (offsets 0, 16, 32, 48)
#   xmm3 = xor mask, expected to be set by the caller (x_init loads it)
# Clobbers xmm0-xmm2 and xmm4-xmm15; ends with ret.
# With A = [r8], B = [r8+16] and swap() = shufps $177 (exchange the floats in
# each 64-bit half), the first half computes
#   p    = A*d64 - B*swap(d64)
#   q    = A*d96 + B*swap(d96)
#   sum  = p + q
#   rot  = swap((p - q) xor xmm3)
#   d0  <- d0 + sum      d64 <- d0 - sum
#   d32 <- d32 - rot     d96 <- d32 + rot
# The second half repeats this on d16/d48/d80/d112 with [r8+32] and [r8+48].
# NOTE(review): this has the shape of a complex multiply by a twiddle factor
# followed by a radix-4 style butterfly on interleaved re/im data -- the data
# layout is not stated in this file; confirm before relying on that reading.
#ifdef __APPLE__
|
|
.globl _x4
|
|
_x4:
|
|
#else
|
|
.globl x4
|
|
x4:
|
|
#endif
|
|
# ---- first half: d0, d32, d64, d96 ----
movaps 64(%rdx), %xmm0 #34.3
|
|
movaps 96(%rdx), %xmm1 #34.3
|
|
movaps (%rdx), %xmm7 #34.3
|
|
movaps (%r8), %xmm4 #const
|
|
movaps %xmm7, %xmm9 #34.3
|
|
movaps %xmm4, %xmm6 #34.3
|
|
movaps 16(%r8), %xmm2 #const
|
|
# xmm6 = A*d64, xmm4 = A*d96
mulps %xmm0, %xmm6 #34.3
|
|
mulps %xmm1, %xmm4 #34.3
|
|
shufps $177, %xmm0, %xmm0 #34.3
|
|
shufps $177, %xmm1, %xmm1 #34.3
|
|
# xmm0 = B*swap(d64), xmm2 = B*swap(d96)
mulps %xmm2, %xmm0 #34.3
|
|
mulps %xmm1, %xmm2 #34.3
|
|
# xmm6 = p, xmm4 = q
subps %xmm0, %xmm6 #34.3
|
|
addps %xmm2, %xmm4 #34.3
|
|
movaps %xmm6, %xmm5 #34.3
|
|
# xmm6 = p - q, xmm5 = sum
subps %xmm4, %xmm6 #34.3
|
|
addps %xmm4, %xmm5 #34.3
|
|
movaps 32(%rdx), %xmm8 #34.3
|
|
# xmm6 = rot
xorps %xmm3, %xmm6 #34.3
|
|
shufps $177, %xmm6, %xmm6 #34.3
|
|
movaps %xmm8, %xmm10 #34.3
|
|
# d112 is loaded here, ahead of its use in the second half.
movaps 112(%rdx), %xmm12 #34.3
|
|
subps %xmm5, %xmm9 #34.3
|
|
addps %xmm5, %xmm7 #34.3
|
|
addps %xmm6, %xmm10 #34.3
|
|
subps %xmm6, %xmm8 #34.3
|
|
movaps %xmm7, (%rdx) #34.3
|
|
movaps %xmm8, 32(%rdx) #34.3
|
|
movaps %xmm9, 64(%rdx) #34.3
|
|
movaps %xmm10, 96(%rdx) #34.3
|
|
# ---- second half: d16, d48, d80, d112 with [r8+32] and [r8+48] ----
movaps 32(%r8), %xmm14 #const #34.3
|
|
movaps 80(%rdx), %xmm11 #34.3
|
|
movaps %xmm14, %xmm0 #34.3
|
|
movaps 48(%r8), %xmm13 #const #34.3
|
|
mulps %xmm11, %xmm0 #34.3
|
|
mulps %xmm12, %xmm14 #34.3
|
|
shufps $177, %xmm11, %xmm11 #34.3
|
|
shufps $177, %xmm12, %xmm12 #34.3
|
|
mulps %xmm13, %xmm11 #34.3
|
|
mulps %xmm12, %xmm13 #34.3
|
|
subps %xmm11, %xmm0 #34.3
|
|
addps %xmm13, %xmm14 #34.3
|
|
movaps %xmm0, %xmm15 #34.3
|
|
# xmm0 = p - q, xmm15 = sum
subps %xmm14, %xmm0 #34.3
|
|
addps %xmm14, %xmm15 #34.3
|
|
xorps %xmm3, %xmm0 #34.3
|
|
movaps 16(%rdx), %xmm1 #34.3
|
|
movaps 48(%rdx), %xmm2 #34.3
|
|
movaps %xmm1, %xmm4 #34.3
|
|
# xmm0 = rot
shufps $177, %xmm0, %xmm0 #34.3
|
|
movaps %xmm2, %xmm5 #34.3
|
|
addps %xmm15, %xmm1 #34.3
|
|
subps %xmm0, %xmm2 #34.3
|
|
subps %xmm15, %xmm4 #34.3
|
|
addps %xmm0, %xmm5 #34.3
|
|
movaps %xmm1, 16(%rdx) #34.3
|
|
movaps %xmm2, 48(%rdx) #34.3
|
|
movaps %xmm4, 80(%rdx) #34.3
|
|
movaps %xmm5, 112(%rdx) #34.3
|
|
ret
|
|
|
|
# x8_soft -- in-place pass over eight equally spaced 16-byte-wide streams.
# Inputs (as used by the code):
#   rdx  = base of stream 0
#   rcx  = loop bound for rax; stream k starts at rdx + k*rcx*4 bytes
#   r8   = pointer to multiplier vectors, 96 bytes (six vectors) per pass
#   xmm3 = xor mask, expected to be set by the caller (x_init loads it)
# Setup: rax = 0, rbx = rdx, rsi = r8, r9..r15 = bases of streams 1..7.
# Each pass loads one vector from every stream at index rax*4, combines them
# with the six multiplier vectors, writes eight vectors back to the same
# addresses, then advances rax by 4 and rsi by 96. Ends with ret once
# rax == rcx (equality test only).
# Overwrites rax, rbx, rsi, r9-r15 and xmm0-xmm2, xmm4-xmm15.
# NOTE(review): rbx and r12-r15 are callee-saved under the System V AMD64
# ABI and are not saved or restored here; presumably the caller takes care
# of that -- verify.
# NOTE(review): by instruction sizes X8_soft_loop sits 37 bytes past the
# entry in the code below, not 5, so the alignment note that follows may
# predate the current prologue -- confirm whether it still applies.
# _x8_soft + 5 needs to be 16 byte aligned
|
|
#ifdef __APPLE__
|
|
.globl _x8_soft
|
|
_x8_soft:
|
|
#else
|
|
.globl x8_soft
|
|
x8_soft:
|
|
#endif
|
|
xorl %eax, %eax
|
|
movq %rdx, %rbx
|
|
movq %r8, %rsi
|
|
# Stream bases: each is the previous one plus rcx*4 bytes.
leaq (%rdx,%rcx,4), %r9
|
|
leaq (%r9,%rcx,4), %r10
|
|
leaq (%r10,%rcx,4), %r11
|
|
leaq (%r11,%rcx,4), %r12
|
|
leaq (%r12,%rcx,4), %r13
|
|
leaq (%r13,%rcx,4), %r14
|
|
leaq (%r14,%rcx,4), %r15
|
|
X8_soft_loop:
|
|
# Streams 2 and 3 (r10, r11) with multipliers [rsi] and [rsi+16].
# shufps $177 with identical operands swaps the floats in each 64-bit half.
movaps (%rsi), %xmm9
|
|
movaps (%r10,%rax,4), %xmm6
|
|
movaps %xmm9, %xmm11
|
|
movaps (%r11,%rax,4), %xmm7
|
|
movaps 16(%rsi), %xmm8
|
|
mulps %xmm6, %xmm11
|
|
mulps %xmm7, %xmm9
|
|
shufps $177, %xmm6, %xmm6
|
|
mulps %xmm8, %xmm6
|
|
shufps $177, %xmm7, %xmm7
|
|
subps %xmm6, %xmm11
|
|
mulps %xmm7, %xmm8
|
|
movaps %xmm11, %xmm10
|
|
addps %xmm8, %xmm9
|
|
movaps 32(%rsi), %xmm15
|
|
addps %xmm9, %xmm10
|
|
subps %xmm9, %xmm11
|
|
# Stream 0 (rbx), then streams 4 and 6 (r12, r14) with [rsi+32], [rsi+48].
movaps (%rbx,%rax,4), %xmm5
|
|
movaps %xmm15, %xmm6
|
|
movaps (%r12,%rax,4), %xmm12
|
|
movaps %xmm5, %xmm2
|
|
movaps (%r14,%rax,4), %xmm13
|
|
xorps %xmm3, %xmm11 #const
|
|
movaps 48(%rsi), %xmm14
|
|
subps %xmm10, %xmm2
|
|
mulps %xmm12, %xmm6
|
|
addps %xmm10, %xmm5
|
|
mulps %xmm13, %xmm15
|
|
movaps 64(%rsi), %xmm10
|
|
movaps %xmm5, %xmm0
|
|
shufps $177, %xmm12, %xmm12
|
|
shufps $177, %xmm13, %xmm13
|
|
mulps %xmm14, %xmm12
|
|
mulps %xmm13, %xmm14
|
|
subps %xmm12, %xmm6
|
|
addps %xmm14, %xmm15
|
|
# Streams 5 and 7 (r13, r15) with [rsi+64], [rsi+80].
movaps (%r13,%rax,4), %xmm7
|
|
movaps %xmm10, %xmm13
|
|
movaps (%r15,%rax,4), %xmm8
|
|
movaps %xmm6, %xmm12
|
|
movaps 80(%rsi), %xmm9
|
|
# All six multiplier vectors for this pass are loaded; step to the next set.
addq $96, %rsi
|
|
mulps %xmm7, %xmm13
|
|
subps %xmm15, %xmm6
|
|
addps %xmm15, %xmm12
|
|
mulps %xmm8, %xmm10
|
|
subps %xmm12, %xmm0
|
|
addps %xmm12, %xmm5
|
|
shufps $177, %xmm7, %xmm7
|
|
xorps %xmm3, %xmm6 #const
|
|
shufps $177, %xmm8, %xmm8
|
|
movaps %xmm2, %xmm12
|
|
mulps %xmm9, %xmm7
|
|
mulps %xmm8, %xmm9
|
|
subps %xmm7, %xmm13
|
|
addps %xmm9, %xmm10
|
|
# Stream 1 (r9).
movaps (%r9,%rax,4), %xmm4
|
|
shufps $177, %xmm11, %xmm11
|
|
movaps %xmm4, %xmm1
|
|
shufps $177, %xmm6, %xmm6
|
|
addps %xmm11, %xmm1
|
|
subps %xmm11, %xmm4
|
|
addps %xmm6, %xmm12
|
|
subps %xmm6, %xmm2
|
|
movaps %xmm13, %xmm11
|
|
movaps %xmm4, %xmm14
|
|
movaps %xmm1, %xmm6
|
|
subps %xmm10, %xmm13
|
|
addps %xmm10, %xmm11
|
|
xorps %xmm3, %xmm13 #const
|
|
addps %xmm11, %xmm4
|
|
subps %xmm11, %xmm14
|
|
shufps $177, %xmm13, %xmm13
|
|
# Write-back: one vector per stream, at the addresses that were read.
movaps %xmm5, (%rbx,%rax,4)
|
|
movaps %xmm4, (%r9,%rax,4)
|
|
movaps %xmm2, (%r10,%rax,4)
|
|
subps %xmm13, %xmm1
|
|
addps %xmm13, %xmm6
|
|
movaps %xmm1, (%r11,%rax,4)
|
|
movaps %xmm0, (%r12,%rax,4)
|
|
movaps %xmm14, (%r13,%rax,4)
|
|
movaps %xmm12, (%r14,%rax,4)
|
|
movaps %xmm6, (%r15,%rax,4)
|
|
addq $4, %rax
|
|
cmpq %rcx, %rax
|
|
jne X8_soft_loop
|
|
ret
|
|
|
|
# x8_hard -- follows the same instruction pattern as X8_soft_loop, with
# these visible differences:
#   * every data load and store addresses rdx + rax*4 plus a 0xFECA
#     placeholder displacement (labels X8_const_0..7 for loads,
#     X8_const1_0..7 for stores) instead of using eight base registers;
#   * the multiplier vectors are read through r8, which is advanced by 96
#     bytes per pass;
#   * the xor mask is loaded here from (r9) into xmm5, and xmm3 holds the
#     stream-0 data (the roles of xmm3 and xmm5 are exchanged).
# rax is not initialised here; the loop runs until rax == rcx.
# No ret follows the jne in this file.
# NOTE(review): no offset table for the X8_const labels appears in this
# file; presumably the code that patches the placeholders derives the
# positions elsewhere -- confirm before changing any instruction here.
#ifdef __APPLE__
|
|
.globl _x8_hard
|
|
_x8_hard:
|
|
#else
|
|
.globl x8_hard
|
|
x8_hard:
|
|
#endif
|
|
# Loop-invariant xor mask: the 16 bytes at r9.
movaps (%r9), %xmm5
|
|
X8_loop:
|
|
movaps (%r8), %xmm9
|
|
X8_const_2:
|
|
movaps 0xFECA(%rdx,%rax,4), %xmm6
|
|
movaps %xmm9, %xmm11
|
|
X8_const_3:
|
|
movaps 0xFECA(%rdx,%rax,4), %xmm7
|
|
movaps 16(%r8), %xmm8
|
|
mulps %xmm6, %xmm11
|
|
mulps %xmm7, %xmm9
|
|
# shufps $177 with identical operands swaps the floats in each 64-bit half.
shufps $177, %xmm6, %xmm6
|
|
mulps %xmm8, %xmm6
|
|
shufps $177, %xmm7, %xmm7
|
|
subps %xmm6, %xmm11
|
|
mulps %xmm7, %xmm8
|
|
movaps %xmm11, %xmm10
|
|
addps %xmm8, %xmm9
|
|
movaps 32(%r8), %xmm15
|
|
addps %xmm9, %xmm10
|
|
subps %xmm9, %xmm11
|
|
X8_const_0:
|
|
movaps 0xFECA(%rdx,%rax,4), %xmm3
|
|
movaps %xmm15, %xmm6
|
|
X8_const_4:
|
|
movaps 0xFECA(%rdx,%rax,4), %xmm12
|
|
movaps %xmm3, %xmm2
|
|
X8_const_6:
|
|
movaps 0xFECA(%rdx,%rax,4), %xmm13
|
|
xorps %xmm5, %xmm11
|
|
movaps 48(%r8), %xmm14
|
|
subps %xmm10, %xmm2
|
|
mulps %xmm12, %xmm6
|
|
addps %xmm10, %xmm3
|
|
mulps %xmm13, %xmm15
|
|
movaps 64(%r8), %xmm10
|
|
movaps %xmm3, %xmm0
|
|
shufps $177, %xmm12, %xmm12
|
|
shufps $177, %xmm13, %xmm13
|
|
mulps %xmm14, %xmm12
|
|
mulps %xmm13, %xmm14
|
|
subps %xmm12, %xmm6
|
|
addps %xmm14, %xmm15
|
|
X8_const_5:
|
|
movaps 0xFECA(%rdx,%rax,4), %xmm7
|
|
movaps %xmm10, %xmm13
|
|
X8_const_7:
|
|
movaps 0xFECA(%rdx,%rax,4), %xmm8
|
|
movaps %xmm6, %xmm12
|
|
movaps 80(%r8), %xmm9
|
|
# All six multiplier vectors for this pass are loaded; step to the next set.
addq $96, %r8
|
|
mulps %xmm7, %xmm13
|
|
subps %xmm15, %xmm6
|
|
addps %xmm15, %xmm12
|
|
mulps %xmm8, %xmm10
|
|
subps %xmm12, %xmm0
|
|
addps %xmm12, %xmm3
|
|
shufps $177, %xmm7, %xmm7
|
|
xorps %xmm5, %xmm6
|
|
shufps $177, %xmm8, %xmm8
|
|
movaps %xmm2, %xmm12
|
|
mulps %xmm9, %xmm7
|
|
mulps %xmm8, %xmm9
|
|
subps %xmm7, %xmm13
|
|
addps %xmm9, %xmm10
|
|
X8_const_1:
|
|
movaps 0xFECA(%rdx,%rax,4), %xmm4
|
|
shufps $177, %xmm11, %xmm11
|
|
movaps %xmm4, %xmm1
|
|
shufps $177, %xmm6, %xmm6
|
|
addps %xmm11, %xmm1
|
|
subps %xmm11, %xmm4
|
|
addps %xmm6, %xmm12
|
|
subps %xmm6, %xmm2
|
|
movaps %xmm13, %xmm11
|
|
movaps %xmm4, %xmm14
|
|
movaps %xmm1, %xmm6
|
|
subps %xmm10, %xmm13
|
|
addps %xmm10, %xmm11
|
|
xorps %xmm5, %xmm13
|
|
addps %xmm11, %xmm4
|
|
subps %xmm11, %xmm14
|
|
shufps $177, %xmm13, %xmm13
|
|
# Write-back through eight placeholder-addressed stores.
X8_const1_0:
|
|
movaps %xmm3, 0xFECA(%rdx,%rax,4)
|
|
X8_const1_1:
|
|
movaps %xmm4, 0xFECA(%rdx,%rax,4)
|
|
X8_const1_2:
|
|
movaps %xmm2, 0xFECA(%rdx,%rax,4)
|
|
subps %xmm13, %xmm1
|
|
addps %xmm13, %xmm6
|
|
X8_const1_3:
|
|
movaps %xmm1, 0xFECA(%rdx,%rax,4)
|
|
X8_const1_4:
|
|
movaps %xmm0, 0xFECA(%rdx,%rax,4)
|
|
X8_const1_5:
|
|
movaps %xmm14, 0xFECA(%rdx,%rax,4)
|
|
X8_const1_6:
|
|
movaps %xmm12, 0xFECA(%rdx,%rax,4)
|
|
X8_const1_7:
|
|
movaps %xmm6, 0xFECA(%rdx,%rax,4)
|
|
addq $4, %rax
|
|
# Exit test is equality only: the loop ends when rax == rcx exactly.
cmpq %rcx, %rax
|
|
jne X8_loop
|
|
|
|
# Patch-offset tables: one table per leaf routine, eight 32-bit entries each.
# Entry k is (LEAF_xx_const_k - start of routine) + 4 or + 5, which is the
# distance from the routine entry to the 32-bit displacement field of the
# movaps that carries the 0xFECA placeholder for input k. The addend is 4
# when that movaps targets xmm0-xmm7 and 5 when it targets xmm8-xmm15,
# because the latter needs one extra (REX) prefix byte.
# Any change to the destination register or addressing form of a
# placeholder load must be mirrored in the matching addend here.
# NOTE(review): presumably read by a run-time code generator that overwrites
# each placeholder -- that consumer is not in this file; confirm.
# No section directive precedes the tables, so they are emitted into the
# same section as the code above. The two preprocessor arms differ only in
# the leading underscore on symbol names.
#ifdef __APPLE__
|
|
.globl _sse_leaf_ee_offsets
|
|
.globl _sse_leaf_oo_offsets
|
|
.globl _sse_leaf_eo_offsets
|
|
.globl _sse_leaf_oe_offsets
|
|
.align 4
|
|
_sse_leaf_ee_offsets:
|
|
.long LEAF_EE_const_0-_leaf_ee+0x4
|
|
.long LEAF_EE_const_1-_leaf_ee+0x5
|
|
.long LEAF_EE_const_2-_leaf_ee+0x5
|
|
.long LEAF_EE_const_3-_leaf_ee+0x5
|
|
.long LEAF_EE_const_4-_leaf_ee+0x5
|
|
.long LEAF_EE_const_5-_leaf_ee+0x5
|
|
.long LEAF_EE_const_6-_leaf_ee+0x4
|
|
.long LEAF_EE_const_7-_leaf_ee+0x5
|
|
_sse_leaf_oo_offsets:
|
|
.long LEAF_OO_const_0-_leaf_oo+0x4
|
|
.long LEAF_OO_const_1-_leaf_oo+0x4
|
|
.long LEAF_OO_const_2-_leaf_oo+0x5
|
|
.long LEAF_OO_const_3-_leaf_oo+0x5
|
|
.long LEAF_OO_const_4-_leaf_oo+0x4
|
|
.long LEAF_OO_const_5-_leaf_oo+0x5
|
|
.long LEAF_OO_const_6-_leaf_oo+0x5
|
|
.long LEAF_OO_const_7-_leaf_oo+0x5
|
|
_sse_leaf_eo_offsets:
|
|
.long LEAF_EO_const_0-_leaf_eo+0x5
|
|
.long LEAF_EO_const_1-_leaf_eo+0x4
|
|
.long LEAF_EO_const_2-_leaf_eo+0x4
|
|
.long LEAF_EO_const_3-_leaf_eo+0x4
|
|
.long LEAF_EO_const_4-_leaf_eo+0x5
|
|
.long LEAF_EO_const_5-_leaf_eo+0x5
|
|
.long LEAF_EO_const_6-_leaf_eo+0x4
|
|
.long LEAF_EO_const_7-_leaf_eo+0x5
|
|
_sse_leaf_oe_offsets:
|
|
.long LEAF_OE_const_0-_leaf_oe+0x5
|
|
.long LEAF_OE_const_1-_leaf_oe+0x4
|
|
.long LEAF_OE_const_2-_leaf_oe+0x4
|
|
.long LEAF_OE_const_3-_leaf_oe+0x5
|
|
.long LEAF_OE_const_4-_leaf_oe+0x5
|
|
.long LEAF_OE_const_5-_leaf_oe+0x5
|
|
.long LEAF_OE_const_6-_leaf_oe+0x4
|
|
.long LEAF_OE_const_7-_leaf_oe+0x4
|
|
#else
|
|
.globl sse_leaf_ee_offsets
|
|
.globl sse_leaf_oo_offsets
|
|
.globl sse_leaf_eo_offsets
|
|
.globl sse_leaf_oe_offsets
|
|
.align 4
|
|
sse_leaf_ee_offsets:
|
|
.long LEAF_EE_const_0-leaf_ee+0x4
|
|
.long LEAF_EE_const_1-leaf_ee+0x5
|
|
.long LEAF_EE_const_2-leaf_ee+0x5
|
|
.long LEAF_EE_const_3-leaf_ee+0x5
|
|
.long LEAF_EE_const_4-leaf_ee+0x5
|
|
.long LEAF_EE_const_5-leaf_ee+0x5
|
|
.long LEAF_EE_const_6-leaf_ee+0x4
|
|
.long LEAF_EE_const_7-leaf_ee+0x5
|
|
sse_leaf_oo_offsets:
|
|
.long LEAF_OO_const_0-leaf_oo+0x4
|
|
.long LEAF_OO_const_1-leaf_oo+0x4
|
|
.long LEAF_OO_const_2-leaf_oo+0x5
|
|
.long LEAF_OO_const_3-leaf_oo+0x5
|
|
.long LEAF_OO_const_4-leaf_oo+0x4
|
|
.long LEAF_OO_const_5-leaf_oo+0x5
|
|
.long LEAF_OO_const_6-leaf_oo+0x5
|
|
.long LEAF_OO_const_7-leaf_oo+0x5
|
|
sse_leaf_eo_offsets:
|
|
.long LEAF_EO_const_0-leaf_eo+0x5
|
|
.long LEAF_EO_const_1-leaf_eo+0x4
|
|
.long LEAF_EO_const_2-leaf_eo+0x4
|
|
.long LEAF_EO_const_3-leaf_eo+0x4
|
|
.long LEAF_EO_const_4-leaf_eo+0x5
|
|
.long LEAF_EO_const_5-leaf_eo+0x5
|
|
.long LEAF_EO_const_6-leaf_eo+0x4
|
|
.long LEAF_EO_const_7-leaf_eo+0x5
|
|
sse_leaf_oe_offsets:
|
|
.long LEAF_OE_const_0-leaf_oe+0x5
|
|
.long LEAF_OE_const_1-leaf_oe+0x4
|
|
.long LEAF_OE_const_2-leaf_oe+0x4
|
|
.long LEAF_OE_const_3-leaf_oe+0x5
|
|
.long LEAF_OE_const_4-leaf_oe+0x5
|
|
.long LEAF_OE_const_5-leaf_oe+0x5
|
|
.long LEAF_OE_const_6-leaf_oe+0x4
|
|
.long LEAF_OE_const_7-leaf_oe+0x4
|
|
#endif
|
|
|
|
# Constant tables: two 80-byte tables of five 16-byte rows each, 16-byte
# aligned (.p2align 4) so the rows can be read with movaps. The row offsets
# 0, 16, 32, 0x30 and 0x40 match the displacements the routines above apply
# to r9.
# NOTE(review): presumably r9 is made to point at one of these two tables by
# the caller -- the code that selects a table is not in this file; confirm.
# Bit patterns used: 0x80000000 = sign bit only, 0x3f800000 = 1.0f,
# 0x3f3504f3 = 0.70710677f (about 1/sqrt(2)), 0xbf3504f3 = -0.70710677f.
# sse_constants_inv is sse_constants with the sign-mask lanes exchanged in
# row 0 and the signs flipped in rows 2 and 4; rows 1 and 3 are identical.
#ifdef __APPLE__
|
|
.data
|
|
#else
|
|
.section .data
|
|
#endif
|
|
.p2align 4
|
|
#ifdef __APPLE__
|
|
.globl _sse_constants
|
|
_sse_constants:
|
|
#else
|
|
.globl sse_constants
|
|
sse_constants:
|
|
#endif
|
|
# Row 0 (offset 0x00): sign-bit mask on lanes 1 and 3.
.long 0x00000000,0x80000000,0x00000000,0x80000000
|
|
# Row 1 (offset 0x10): 0.70710677 in every lane.
.long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
|
|
# Row 2 (offset 0x20): -0.70710677, +0.70710677, repeated.
.long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
|
|
# Row 3 (offset 0x30): 1.0, 1.0, 0.70710677, 0.70710677.
.long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
|
|
# Row 4 (offset 0x40): 0.0, 0.0, -0.70710677, +0.70710677.
.long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
|
|
#ifdef __APPLE__
|
|
.globl _sse_constants_inv
|
|
_sse_constants_inv:
|
|
#else
|
|
.globl sse_constants_inv
|
|
sse_constants_inv:
|
|
#endif
|
|
# Row 0 (offset 0x00): sign-bit mask on lanes 0 and 2.
.long 0x80000000,0x00000000,0x80000000,0x00000000
|
|
# Row 1 (offset 0x10): 0.70710677 in every lane.
.long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
|
|
# Row 2 (offset 0x20): +0.70710677, -0.70710677, repeated.
.long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3
|
|
# Row 3 (offset 0x30): 1.0, 1.0, 0.70710677, 0.70710677.
.long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
|
|
# Row 4 (offset 0x40): 0.0, 0.0, +0.70710677, -0.70710677.
.long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3
|