/*
 * img_x86_common.h - common x86/x86-64 assembly macros
 * Written by Andrew Church <achurch@achurch.org>
 *
 * This file is part of transcode, a video stream processing tool.
 * transcode is free software, distributable under the terms of the GNU
 * General Public License (version 2 or later). See the file COPYING
 * for details.
 */

#ifndef ACLIB_IMG_X86_COMMON_H
#define ACLIB_IMG_X86_COMMON_H

/*************************************************************************/

/* Register names for pointers */
#ifdef ARCH_X86_64
# define EAX "%%rax"
# define EBX "%%rbx"
# define ECX "%%rcx"
# define EDX "%%rdx"
# define ESP "%%rsp"
# define EBP "%%rbp"
# define ESI "%%rsi"
# define EDI "%%rdi"
#else
# define EAX "%%eax"
# define EBX "%%ebx"
# define ECX "%%ecx"
# define EDX "%%edx"
# define ESP "%%esp"
# define EBP "%%ebp"
# define ESI "%%esi"
# define EDI "%%edi"
#endif

/* Macros to push and pop one or two registers within an assembly block.
 * On x86-64 we cannot simply push to the stack: the ABI allows leaf
 * functions to use the 128 bytes BELOW (yes, below) the stack pointer (the
 * "red zone"), so the compiler may already have data there that a push
 * would clobber. Instead we fake the push by saving into a scratch
 * register. Argh. */
#ifdef ARCH_X86_64
# define FAKE_PUSH_REG "r12"
# define FAKE_PUSH_REG_2 "r13"
# define COMMA_FAKE_PUSH_REG ,FAKE_PUSH_REG
# define PUSH(reg) "mov " reg ", %%" FAKE_PUSH_REG
# define POP(reg) "mov %%" FAKE_PUSH_REG ", " reg
# define PUSH2(reg1,reg2) PUSH(reg1) "; mov " reg2 ", %%" FAKE_PUSH_REG_2
# define POP2(reg2,reg1) "mov %%" FAKE_PUSH_REG_2 ", " reg2 "; " POP(reg1)
#else
# define COMMA_FAKE_PUSH_REG /*nothing*/
# define PUSH(reg) "push " reg
# define POP(reg) "pop " reg
# define PUSH2(reg1,reg2) "push " reg1 "; push " reg2
# define POP2(reg2,reg1) "pop " reg2 "; pop " reg1
#endif
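
/* Illustrative sketch: saving EBX across an inline-asm block with the
 * macros above.  The cpuid use and variable name are hypothetical; the
 * point is that PUSH/POP expand to a real push/pop on x86 but to moves
 * through r12 on x86-64, and that the scratch register is then added to
 * the clobber list via COMMA_FAKE_PUSH_REG:
 *
 *     int level;
 *     asm(PUSH(EBX) "; cpuid; " POP(EBX)
 *         : "=a" (level)
 *         : "a" (0)
 *         : "ecx", "edx" COMMA_FAKE_PUSH_REG);
 */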

/* Data for isolating particular bytes. Used by the SWAP32 macros; if you
 * use them, make sure to define DEFINE_MASK_DATA before including this
 * file! */
#ifdef DEFINE_MASK_DATA
static const struct { uint32_t n[64]; } __attribute__((aligned(16))) mask_data = {{
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,
    0x0000FF00, 0x0000FF00, 0x0000FF00, 0x0000FF00,
    0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,
    0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000,
    0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,
    0x00FFFF00, 0x00FFFF00, 0x00FFFF00, 0x00FFFF00,
    0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF,
    0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
    0xFF0000FF, 0xFF0000FF, 0xFF0000FF, 0xFF0000FF,
    0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00,
    0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF,
    0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000,
    0xFFFF00FF, 0xFFFF00FF, 0xFFFF00FF, 0xFFFF00FF,
    0xFFFFFF00, 0xFFFFFF00, 0xFFFFFF00, 0xFFFFFF00,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
}};
#endif
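
/* Layout note: entry k of the table above (k = 0..15, at byte offset 16*k)
 * repeats four times the 32-bit mask whose byte n is 0xFF exactly when bit
 * n of k is set.  For example, with EDX pointing at mask_data,
 *     pand 32("EDX"), "MM1"       keeps byte 1 of each dword   (k = 2)
 *     pand 160("EDX"), "MM0"      keeps bytes 1 and 3          (k = 10)
 * which is how the SWAP32_02/13 and REV32 SIMD loops below pick the bytes
 * of each pixel apart. */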

/*************************************************************************/

/* Basic assembly macros, used for odd-count loops */

/* Swap bytes in pairs of 16-bit values */
#define X86_SWAP16_2 \
    "movl -4("ESI","ECX",4), %%eax \n\
    movl %%eax, %%edx \n\
    shll $8, %%eax \n\
    andl $0xFF00FF00, %%eax \n\
    shrl $8, %%edx \n\
    andl $0x00FF00FF, %%edx \n\
    orl %%edx, %%eax \n\
    movl %%eax, -4("EDI","ECX",4)"

/* Swap words in a 32-bit value */
#define X86_SWAP32 \
    "movl -4("ESI","ECX",4), %%eax \n\
    roll $16, %%eax \n\
    movl %%eax, -4("EDI","ECX",4)"

/* Swap bytes 0 and 2 of a 32-bit value */
#define X86_SWAP32_02 \
    "movw -4("ESI","ECX",4), %%ax \n\
    movw -2("ESI","ECX",4), %%dx \n\
    xchg %%dl, %%al \n\
    movw %%ax, -4("EDI","ECX",4) \n\
    movw %%dx, -2("EDI","ECX",4)"

/* Swap bytes 1 and 3 of a 32-bit value */
#define X86_SWAP32_13 \
    "movw -4("ESI","ECX",4), %%ax \n\
    movw -2("ESI","ECX",4), %%dx \n\
    xchg %%dh, %%ah \n\
    movw %%ax, -4("EDI","ECX",4) \n\
    movw %%dx, -2("EDI","ECX",4)"

/* Reverse the order of bytes in a 32-bit value */
#define X86_REV32 \
    "movl -4("ESI","ECX",4), %%eax \n\
    xchg %%ah, %%al \n\
    roll $16, %%eax \n\
    xchg %%ah, %%al \n\
    movl %%eax, -4("EDI","ECX",4)"

/* The same, using the BSWAP instruction */
#define X86_REV32_BSWAP \
    "movl -4("ESI","ECX",4), %%eax \n\
    bswap %%eax \n\
    movl %%eax, -4("EDI","ECX",4)"

/* Rotate a 32-bit value left 8 bits */
#define X86_ROL32 \
    "movl -4("ESI","ECX",4), %%eax \n\
    roll $8, %%eax \n\
    movl %%eax, -4("EDI","ECX",4)"

/* Rotate a 32-bit value right 8 bits */
#define X86_ROR32 \
    "movl -4("ESI","ECX",4), %%eax \n\
    rorl $8, %%eax \n\
    movl %%eax, -4("EDI","ECX",4)"
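
/* All of the macros above operate on the 32-bit element at index ECX-1,
 * reading from -4(ESI,ECX,4) and writing to -4(EDI,ECX,4); the loops below
 * count ECX down towards zero, so buffers are processed back to front. */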

/*************************************************************************/

/* Basic assembly routines. Sizes are all given in 32-bit units. */

#define ASM_SWAP16_2_X86(size) \
    asm("0: "X86_SWAP16_2" \n\
        subl $1, %%ecx \n\
        jnz 0b" \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size) \
        : "eax", "edx")

#define ASM_SWAP32_X86(size) \
    asm("0: "X86_SWAP32" \n\
        subl $1, %%ecx \n\
        jnz 0b" \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size) \
        : "eax", "edx")

#define ASM_SWAP32_02_X86(size) \
    asm("0: "X86_SWAP32_02" \n\
        subl $1, %%ecx \n\
        jnz 0b" \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size) \
        : "eax", "edx")

#define ASM_SWAP32_13_X86(size) \
    asm("0: "X86_SWAP32_13" \n\
        subl $1, %%ecx \n\
        jnz 0b" \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size) \
        : "eax", "edx")

#define ASM_REV32_X86(size) \
    asm("0: "X86_REV32" \n\
        subl $1, %%ecx \n\
        jnz 0b" \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size) \
        : "eax")

#define ASM_ROL32_X86(size) \
    asm("0: "X86_ROL32" \n\
        subl $1, %%ecx \n\
        jnz 0b" \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size) \
        : "eax")

#define ASM_ROR32_X86(size) \
    asm("0: "X86_ROR32" \n\
        subl $1, %%ecx \n\
        jnz 0b" \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size) \
        : "eax")

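/* Usage sketch (hypothetical function and variable names): these macros
 * expect `src' and `dest' arrays of plane pointers and an element count in
 * scope, in the style of the aclib image-conversion routines, e.g.:
 *
 *     static int bgra2rgba(uint8_t **src, uint8_t **dest, int width, int height)
 *     {
 *         ASM_SWAP32_02_X86(width*height);  // swap bytes 0 and 2 of each pixel
 *         return 1;
 *     }
 *
 * Note that the count is decremented before being tested, so it must not
 * be zero on entry. */
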
/*************************************************************************/
/*************************************************************************/

/* Wrapper for SIMD loops. This generates the body of an asm() construct
 * (the string only, not the input/output/clobber lists) given the data
 * block size (number of data units processed per SIMD loop iteration),
 * instructions to save and restore registers that must be preserved across
 * the small loop (such as EBX), and the bodies of the odd-count and main
 * loops. The data count is assumed to be preloaded in ECX. Parameters are:
 *     blocksize:  number of units of data processed per SIMD loop (must be
 *                 a power of 2); can be a constant or a numerical
 *                 expression containing only constants
 *     push_regs:  string constant containing instructions to push registers
 *                 that must be saved over the small loop
 *     pop_regs:   string constant containing instructions to pop registers
 *                 saved by `push_regs' (restored before the main loop)
 *     small_loop: loop for handling data elements one at a time (when the
 *                 count is not a multiple of `blocksize')
 *     main_loop:  main SIMD loop for processing data
 *     emms:       EMMS/SFENCE instructions to end the main loop with, as needed
 */

#define SIMD_LOOP_WRAPPER(blocksize,push_regs,pop_regs,small_loop,main_loop,emms) \
    /* Check whether the count is a multiple of the blocksize (this \
     * can cause branch mispredicts but seems to be faster overall) */ \
    "testl $(("#blocksize")-1), %%ecx; " \
    "jz 1f; " \
    /* It's not--run the small loop to align the count */ \
    push_regs"; " \
    "0: " \
    small_loop"; " \
    "subl $1, %%ecx; " \
    "testl $(("#blocksize")-1), %%ecx; " \
    "jnz 0b; " \
    pop_regs"; " \
    /* Make sure there's some data left */ \
    "testl %%ecx, %%ecx; " \
    "jz 2f; " \
    /* Now run the main SIMD loop */ \
    "1: " \
    main_loop"; " \
    "subl $("#blocksize"), %%ecx; " \
    "jnz 1b; " \
    /* Clear MMX state and/or SFENCE, as needed */ \
    emms"; " \
    /* Done */ \
    "2: "

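/* Minimal illustration (hypothetical; the COPY32_* names are made up for
 * this example and not used elsewhere): a plain copy loop built with the
 * wrapper, moving two 32-bit units per MMX iteration.  `src', `dest' and a
 * nonzero `count' are assumed to be in scope:
 *
 *     #define COPY32_X86 \
 *         "movl -4("ESI","ECX",4), %%eax; movl %%eax, -4("EDI","ECX",4)"
 *     #define COPY32_MMX \
 *         "movq -8("ESI","ECX",4), %%mm0; movq %%mm0, -8("EDI","ECX",4)"
 *
 *     asm(SIMD_LOOP_WRAPPER(2, "", "", COPY32_X86, COPY32_MMX, "emms")
 *         : // no outputs
 *         : "S" (src), "D" (dest), "c" (count)
 *         : "eax");
 */
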
/*************************************************************************/

/* MMX- and SSE2-optimized swap/rotate routines. These routines are
 * identical save for data size, so we use common macros to implement them,
 * with register names and data offsets replaced by parameters to the
 * macros. */

#define ASM_SIMD_MMX(name,size) \
    name((size), 64, \
         "movq", "movq", "movq", "", \
         "%%mm0", "%%mm1", "%%mm2", "%%mm3", \
         "%%mm4", "%%mm5", "%%mm6", "%%mm7")
#define ASM_SIMD_SSE2(name,size) \
    name((size), 128, \
         "movdqu", "movdqa", "movdqu", "", \
         "%%xmm0", "%%xmm1", "%%xmm2", "%%xmm3", \
         "%%xmm4", "%%xmm5", "%%xmm6", "%%xmm7")
#define ASM_SIMD_SSE2_ALIGNED(name,size) \
    name((size), 128, \
         "movdqa", "movdqa", "movntdq", "sfence", \
         "%%xmm0", "%%xmm1", "%%xmm2", "%%xmm3", \
         "%%xmm4", "%%xmm5", "%%xmm6", "%%xmm7")

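/* Parameter order (matching the ASM_*_SIMD definitions below): the element
 * count, the SIMD register width in bits, the load / register-to-register /
 * store mnemonics, the fence instruction to issue after the loop ("" if
 * none), and the eight SIMD register names.  So ASM_SWAP32_MMX(n) below
 * ends up invoking ASM_SWAP32_SIMD with 64, "movq"/"movq"/"movq", "" and
 * %%mm0..%%mm7. */
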
#define ASM_SWAP16_2_MMX(size) ASM_SIMD_MMX(ASM_SWAP16_2_SIMD,(size))
#define ASM_SWAP16_2_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP16_2_SIMD,(size))
#define ASM_SWAP16_2_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP16_2_SIMD,(size))
#define ASM_SWAP32_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_SIMD,(size))
#define ASM_SWAP32_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_SIMD,(size))
#define ASM_SWAP32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_SIMD,(size))
#define ASM_SWAP32_02_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_02_SIMD,(size))
#define ASM_SWAP32_02_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_02_SIMD,(size))
#define ASM_SWAP32_02_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_02_SIMD,(size))
#define ASM_SWAP32_13_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_13_SIMD,(size))
#define ASM_SWAP32_13_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_13_SIMD,(size))
#define ASM_SWAP32_13_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_13_SIMD,(size))
#define ASM_REV32_MMX(size) ASM_SIMD_MMX(ASM_REV32_SIMD,(size))
#define ASM_REV32_SSE2(size) ASM_SIMD_SSE2(ASM_REV32_SIMD,(size))
#define ASM_REV32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_REV32_SIMD,(size))
#define ASM_ROL32_MMX(size) ASM_SIMD_MMX(ASM_ROL32_SIMD,(size))
#define ASM_ROL32_SSE2(size) ASM_SIMD_SSE2(ASM_ROL32_SIMD,(size))
#define ASM_ROL32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_ROL32_SIMD,(size))
#define ASM_ROR32_MMX(size) ASM_SIMD_MMX(ASM_ROR32_SIMD,(size))
#define ASM_ROR32_SSE2(size) ASM_SIMD_SSE2(ASM_ROR32_SIMD,(size))
#define ASM_ROR32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_ROR32_SIMD,(size))

/*************************************************************************/

/* Actual implementations. Note that unrolling the SIMD loops doesn't seem
 * to be a win (only 2-3% improvement at most), and in fact can lose by a
 * bit in short loops. */

#define ASM_SWAP16_2_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
    asm(SIMD_LOOP_WRAPPER( \
        /* blocksize */ (regsize)/32, \
        /* push_regs */ "", \
        /* pop_regs */ "", \
        /* small_loop */ X86_SWAP16_2, \
        /* main_loop */ \
        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
            # MM0: 7 6 5 4 3 2 1 0 \n\
        "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
        psrlw $8, "MM0" # MM0: - 7 - 5 - 3 - 1 \n\
        psllw $8, "MM1" # MM1: 6 - 4 - 2 - 0 - \n\
        por "MM1", "MM0" # MM0: 6 7 4 5 2 3 0 1 \n\
        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
        /* emms */ "emms; "sfence) \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size) \
        : "eax", "edx")

#define ASM_SWAP32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
    asm(SIMD_LOOP_WRAPPER( \
        /* blocksize */ (regsize)/32, \
        /* push_regs */ "", \
        /* pop_regs */ "", \
        /* small_loop */ X86_SWAP32, \
        /* main_loop */ \
        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
            # MM0: 7 6 5 4 3 2 1 0 \n\
        "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
        psrld $16, "MM0" # MM0: - - 7 6 - - 3 2 \n\
        pslld $16, "MM1" # MM1: 5 4 - - 1 0 - - \n\
        por "MM1", "MM0" # MM0: 5 4 7 6 1 0 3 2 \n\
        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
        /* emms */ "emms; "sfence) \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size) \
        : "eax")

#define ASM_SWAP32_02_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
    asm(SIMD_LOOP_WRAPPER( \
        /* blocksize */ (regsize)/32, \
        /* push_regs */ "push "EDX, \
        /* pop_regs */ "pop "EDX, \
        /* small_loop */ X86_SWAP32_02, \
        /* main_loop */ \
        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
            # MM0: 7 6 5 4 3 2 1 0 \n\
        "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
        "movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\
        pand 16("EDX"), "MM1" # MM1: - - - 4 - - - 0 \n\
        pslld $16, "MM1" # MM1: - 4 - - - 0 - - \n\
        pand 64("EDX"), "MM2" # MM2: - 6 - - - 2 - - \n\
        psrld $16, "MM2" # MM2: - - - 6 - - - 2 \n\
        pand 160("EDX"), "MM0" # MM0: 7 - 5 - 3 - 1 - \n\
        por "MM1", "MM0" # MM0: 7 4 5 - 3 0 1 - \n\
        por "MM2", "MM0" # MM0: 7 4 5 6 3 0 1 2 \n\
        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
        /* emms */ "emms; "sfence) \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \
          "m" (mask_data) \
        : "eax")

#define ASM_SWAP32_13_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
    asm(SIMD_LOOP_WRAPPER( \
        /* blocksize */ (regsize)/32, \
        /* push_regs */ "push "EDX, \
        /* pop_regs */ "pop "EDX, \
        /* small_loop */ X86_SWAP32_13, \
        /* main_loop */ \
        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
            # MM0: 7 6 5 4 3 2 1 0 \n\
        "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
        "movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\
        pand 32("EDX"), "MM1" # MM1: - - 5 - - - 1 - \n\
        pslld $16, "MM1" # MM1: 5 - - - 1 - - - \n\
        pand 128("EDX"), "MM2" # MM2: 7 - - - 3 - - - \n\
        psrld $16, "MM2" # MM2: - - 7 - - - 3 - \n\
        pand 80("EDX"), "MM0" # MM0: - 6 - 4 - 2 - 0 \n\
        por "MM1", "MM0" # MM0: 5 6 - 4 1 2 - 0 \n\
        por "MM2", "MM0" # MM0: 5 6 7 4 1 2 3 0 \n\
        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
        /* emms */ "emms; "sfence) \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \
          "m" (mask_data) \
        : "eax")

#define ASM_REV32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
    asm(SIMD_LOOP_WRAPPER( \
        /* blocksize */ (regsize)/32, \
        /* push_regs */ "", \
        /* pop_regs */ "", \
        /* small_loop */ X86_REV32_BSWAP, \
        /* main_loop */ \
        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
            # MM0: 7 6 5 4 3 2 1 0 \n\
        "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
        "movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\
        "movq" "MM0", "MM3" # MM3: 7 6 5 4 3 2 1 0 \n\
        psrld $24, "MM0" # MM0: - - - 7 - - - 3 \n\
        pand 32("EDX"), "MM2" # MM2: - - 5 - - - 1 - \n\
        psrld $8, "MM1" # MM1: - 7 6 5 - 3 2 1 \n\
        pand 32("EDX"), "MM1" # MM1: - - 6 - - - 2 - \n\
        pslld $8, "MM2" # MM2: - 5 - - - 1 - - \n\
        pslld $24, "MM3" # MM3: 4 - - - 0 - - - \n\
        por "MM1", "MM0" # MM0: - - 6 7 - - 2 3 \n\
        por "MM2", "MM0" # MM0: - 5 6 7 - 1 2 3 \n\
        por "MM3", "MM0" # MM0: 4 5 6 7 0 1 2 3 \n\
        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
        /* emms */ "emms; "sfence) \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \
          "m" (mask_data) \
        : "eax")

#define ASM_ROL32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
    asm(SIMD_LOOP_WRAPPER( \
        /* blocksize */ (regsize)/32, \
        /* push_regs */ "", \
        /* pop_regs */ "", \
        /* small_loop */ X86_ROL32, \
        /* main_loop */ \
        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
            # MM0: 7 6 5 4 3 2 1 0 \n\
        "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
        pslld $8, "MM0" # MM0: 6 5 4 - 2 1 0 - \n\
        psrld $24, "MM1" # MM1: - - - 7 - - - 3 \n\
        por "MM1", "MM0" # MM0: 6 5 4 7 2 1 0 3 \n\
        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
        /* emms */ "emms; "sfence) \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size) \
        : "eax")

#define ASM_ROR32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
    asm(SIMD_LOOP_WRAPPER( \
        /* blocksize */ (regsize)/32, \
        /* push_regs */ "", \
        /* pop_regs */ "", \
        /* small_loop */ X86_ROR32, \
        /* main_loop */ \
        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
            # MM0: 7 6 5 4 3 2 1 0 \n\
        "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
        psrld $8, "MM0" # MM0: - 7 6 5 - 3 2 1 \n\
        pslld $24, "MM1" # MM1: 4 - - - 0 - - - \n\
        por "MM1", "MM0" # MM0: 4 7 6 5 0 3 2 1 \n\
        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
        /* emms */ "emms; "sfence) \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size) \
        : "eax")

/*************************************************************************/

/* SSE2 macros to load 8 24- or 32-bit RGB pixels into XMM0/1/2 (R/G/B) as
 * 16-bit values, used for RGB->YUV and RGB->grayscale conversions.
 * ZERO is the number of the XMM register containing all zeroes. */

#define SSE2_LOAD_RGB24(ZERO) \
    "movl -21("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm0 # XMM0: ----- ----- ----- xBGR1 \n\
    pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR1 ----- ----- ----- \n\
    movl -18("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm2 \n\
    por %%xmm2, %%xmm0 # XMM0: xBGR1 ----- ----- xBGR2 \n\
    pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR2 xBGR1 ----- ----- \n\
    movl -15("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm2 \n\
    por %%xmm2, %%xmm0 # XMM0: xBGR2 xBGR1 ----- xBGR3 \n\
    pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR3 xBGR2 xBGR1 ----- \n\
    movl -24("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm2 \n\
    por %%xmm2, %%xmm0 # XMM0: xBGR3 xBGR2 xBGR1 xBGR0 \n\
    movl -9("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm1 # XMM1: ----- ----- ----- xBGR5 \n\
    pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR5 ----- ----- ----- \n\
    movl -6("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm2 \n\
    por %%xmm2, %%xmm1 # XMM1: xBGR5 ----- ----- xBGR6 \n\
    pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR6 xBGR5 ----- ----- \n\
    movl -3("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm2 \n\
    por %%xmm2, %%xmm1 # XMM1: xBGR6 xBGR5 ----- xBGR7 \n\
    pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR7 xBGR6 xBGR5 ----- \n\
    movl -12("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm2 \n\
    por %%xmm2, %%xmm1 # XMM1: xBGR7 xBGR6 xBGR5 xBGR4 \n"\
    SSE2_MASSAGE_RGBA32(ZERO)

#define SSE2_LOAD_BGR24(ZERO) \
    "movl -21("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm0 # XMM0: ----- ----- ----- xRGB1 \n\
    pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB1 ----- ----- ----- \n\
    movl -18("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm2 \n\
    por %%xmm2, %%xmm0 # XMM0: xRGB1 ----- ----- xRGB2 \n\
    pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB2 xRGB1 ----- ----- \n\
    movl -15("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm2 \n\
    por %%xmm2, %%xmm0 # XMM0: xRGB2 xRGB1 ----- xRGB3 \n\
    pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB3 xRGB2 xRGB1 ----- \n\
    movl -24("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm2 \n\
    por %%xmm2, %%xmm0 # XMM0: xRGB3 xRGB2 xRGB1 xRGB0 \n\
    movl -9("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm1 # XMM1: ----- ----- ----- xRGB5 \n\
    pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB5 ----- ----- ----- \n\
    movl -6("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm2 \n\
    por %%xmm2, %%xmm1 # XMM1: xRGB5 ----- ----- xRGB6 \n\
    pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB6 xRGB5 ----- ----- \n\
    movl -3("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm2 \n\
    por %%xmm2, %%xmm1 # XMM1: xRGB6 xRGB5 ----- xRGB7 \n\
    pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB7 xRGB6 xRGB5 ----- \n\
    movl -12("ESI","EBX"), %%eax \n\
    movd %%eax, %%xmm2 \n\
    por %%xmm2, %%xmm1 # XMM1: xRGB7 xRGB6 xRGB5 xRGB4 \n"\
    SSE2_MASSAGE_BGRA32(ZERO)

#define SSE2_LOAD_RGBA32(ZERO) "\
    movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: ABGR3 ABGR2 ABGR1 ABGR0 \n\
    movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: ABGR7 ABGR6 ABGR5 ABGR4 \n"\
    SSE2_MASSAGE_RGBA32(ZERO)
#define SSE2_MASSAGE_RGBA32(ZERO) "\
    movdqa %%xmm0, %%xmm2 # XMM2: ABGR3 ABGR2 ABGR1 ABGR0 \n\
    punpcklbw %%xmm1, %%xmm0 # X0.l: A4 A0 B4 B0 G4 G0 R4 R0 \n\
    punpckhbw %%xmm1, %%xmm2 # X2.l: A6 A2 B6 B2 G6 G2 R6 R2 \n\
    movdqa %%xmm0, %%xmm1 # X1.l: A4 A0 B4 B0 G4 G0 R4 R0 \n\
    punpcklbw %%xmm2, %%xmm0 # X0.l: G6 G4 G2 G0 R6 R4 R2 R0 \n\
    punpckhbw %%xmm2, %%xmm1 # X1.l: G7 G5 G3 G1 R7 R5 R3 R1 \n\
    movdqa %%xmm0, %%xmm2 # X2.l: G6 G4 G2 G0 R6 R4 R2 R0 \n\
    punpcklbw %%xmm1, %%xmm0 # XMM0: G7.......G0 R7.......R0 \n\
    punpckhbw %%xmm1, %%xmm2 # XMM2: A7.......A0 B7.......B0 \n\
    movdqa %%xmm0, %%xmm1 # XMM1: G7.......G0 R7.......R0 \n\
    punpcklbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
    punpckhbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
    punpcklbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"

#define SSE2_LOAD_BGRA32(ZERO) "\
    movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: ARGB3 ARGB2 ARGB1 ARGB0 \n\
    movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: ARGB7 ARGB6 ARGB5 ARGB4 \n"\
    SSE2_MASSAGE_BGRA32(ZERO)
#define SSE2_MASSAGE_BGRA32(ZERO) "\
    movdqa %%xmm0, %%xmm2 # XMM2: ARGB3 ARGB2 ARGB1 ARGB0 \n\
    punpcklbw %%xmm1, %%xmm2 # X2.l: A4 A0 R4 R0 G4 G0 B4 B0 \n\
    punpckhbw %%xmm1, %%xmm0 # X0.l: A6 A2 R6 R2 G6 G2 B6 B2 \n\
    movdqa %%xmm2, %%xmm1 # X1.l: A4 A0 R4 R0 G4 G0 B4 B0 \n\
    punpcklbw %%xmm0, %%xmm2 # X2.l: G6 G4 G2 G0 B6 B4 B2 B0 \n\
    punpckhbw %%xmm0, %%xmm1 # X1.l: G7 G5 G3 G1 B7 B5 B3 B1 \n\
    movdqa %%xmm2, %%xmm0 # X0.l: G6 G4 G2 G0 B6 B4 B2 B0 \n\
    punpcklbw %%xmm1, %%xmm2 # XMM2: G7.......G0 B7.......B0 \n\
    punpckhbw %%xmm1, %%xmm0 # XMM0: A7.......A0 R7.......R0 \n\
    movdqa %%xmm2, %%xmm1 # XMM1: G7.......G0 B7.......B0 \n\
    punpcklbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
    punpckhbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
    punpcklbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"

#define SSE2_LOAD_ARGB32(ZERO) "\
    movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: BGRA3 BGRA2 BGRA1 BGRA0 \n\
    movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: BGRA7 BGRA6 BGRA5 BGRA4 \n"\
    SSE2_MASSAGE_ARGB32(ZERO)
#define SSE2_MASSAGE_ARGB32(ZERO) "\
    movdqa %%xmm0, %%xmm2 # XMM2: BGRA3 BGRA2 BGRA1 BGRA0 \n\
    punpcklbw %%xmm1, %%xmm0 # X0.l: B4 B0 G4 G0 R4 R0 A4 A0 \n\
    punpckhbw %%xmm1, %%xmm2 # X2.l: B6 B2 G6 G2 R6 R2 A6 A2 \n\
    movdqa %%xmm0, %%xmm1 # X1.l: B4 B0 G4 G0 R4 R0 A4 A0 \n\
    punpcklbw %%xmm2, %%xmm0 # X0.l: R6 R4 R2 R0 A6 A4 A2 A0 \n\
    punpckhbw %%xmm2, %%xmm1 # X1.l: R7 R5 R3 R1 A7 A5 A3 A1 \n\
    movdqa %%xmm0, %%xmm2 # X2.l: R6 R4 R2 R0 A6 A4 A2 A0 \n\
    punpcklbw %%xmm1, %%xmm0 # XMM0: R7.......R0 A7.......A0 \n\
    punpckhbw %%xmm1, %%xmm2 # XMM2: B7.......B0 G7.......G0 \n\
    movdqa %%xmm2, %%xmm1 # XMM1: B7.......B0 G7.......G0 \n\
    punpckhbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
    punpcklbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
    punpckhbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"

#define SSE2_LOAD_ABGR32(ZERO) "\
    movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: RGBA3 RGBA2 RGBA1 RGBA0 \n\
    movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: RGBA7 RGBA6 RGBA5 RGBA4 \n"\
    SSE2_MASSAGE_ABGR32(ZERO)
#define SSE2_MASSAGE_ABGR32(ZERO) "\
    movdqa %%xmm0, %%xmm2 # XMM2: RGBA3 RGBA2 RGBA1 RGBA0 \n\
    punpcklbw %%xmm1, %%xmm2 # X2.l: R4 R0 G4 G0 B4 B0 A4 A0 \n\
    punpckhbw %%xmm1, %%xmm0 # X0.l: R6 R2 G6 G2 B6 B2 A6 A2 \n\
    movdqa %%xmm2, %%xmm1 # X1.l: R4 R0 G4 G0 B4 B0 A4 A0 \n\
    punpcklbw %%xmm0, %%xmm2 # X2.l: B6 B4 B2 B0 A6 A4 A2 A0 \n\
    punpckhbw %%xmm0, %%xmm1 # X1.l: B7 B5 B3 B1 A7 A5 A3 A1 \n\
    movdqa %%xmm2, %%xmm0 # X0.l: B6 B4 B2 B0 A6 A4 A2 A0 \n\
    punpcklbw %%xmm1, %%xmm2 # XMM2: B7.......B0 A7.......A0 \n\
    punpckhbw %%xmm1, %%xmm0 # XMM0: R7.......R0 G7.......G0 \n\
    movdqa %%xmm0, %%xmm1 # XMM1: R7.......R0 G7.......G0 \n\
    punpckhbw %%xmm4, %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
    punpcklbw %%xmm4, %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
    punpckhbw %%xmm4, %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"
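
/* Usage sketch (hypothetical; the real consumers are the RGB->YUV and
 * RGB->grayscale converters elsewhere in aclib): the massage code above
 * uses %%xmm4 as the all-zero register, so clear it before entering the
 * loop.  The 32-bit loaders index with ECX in pixels, the 24-bit loaders
 * with EBX in bytes (3 per pixel), and all of them consume 8 pixels per
 * iteration, so the count is assumed to be a nonzero multiple of 8:
 *
 *     "pxor %%xmm4, %%xmm4 \n"
 *     "0: "
 *     SSE2_LOAD_RGBA32(4)
 *     "...process XMM0/1/2 (R/G/B as 16-bit words) here... \n"
 *     "subl $8, %%ecx \n"
 *     "jnz 0b \n"
 */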

/*************************************************************************/

#endif /* ACLIB_IMG_X86_COMMON_H */

/*
 * Local variables:
 *   c-file-style: "stroustrup"
 *   c-file-offsets: ((case-label . *) (statement-case-intro . *))
 *   indent-tabs-mode: nil
 * End:
 *
 * vim: expandtab shiftwidth=4:
 */