/*
* img_x86_common.h - common x86/x86-64 assembly macros
* Written by Andrew Church <achurch@achurch.org>
*
* This file is part of transcode, a video stream processing tool.
* transcode is free software, distributable under the terms of the GNU
* General Public License (version 2 or later). See the file COPYING
* for details.
*/
#ifndef ACLIB_IMG_X86_COMMON_H
#define ACLIB_IMG_X86_COMMON_H
/*************************************************************************/
/* Register names for pointers */
#ifdef ARCH_X86_64
# define EAX "%%rax"
# define EBX "%%rbx"
# define ECX "%%rcx"
# define EDX "%%rdx"
# define ESP "%%rsp"
# define EBP "%%rbp"
# define ESI "%%rsi"
# define EDI "%%rdi"
#else
# define EAX "%%eax"
# define EBX "%%ebx"
# define ECX "%%ecx"
# define EDX "%%edx"
# define ESP "%%esp"
# define EBP "%%ebp"
# define ESI "%%esi"
# define EDI "%%edi"
#endif
/* Macros to push and pop one or two registers within an assembly block.
 * The x86-64 ABI defines a 128-byte "red zone" BELOW (yes, below) the
 * stack pointer which leaf functions may freely use for storage, so the
 * compiler may have live data there and we can't just push our own
 * stuff onto the stack. Argh. On x86-64 we therefore fake the push by
 * stashing the value in a spare register; see the usage sketch after
 * these definitions. */
#ifdef ARCH_X86_64
# define FAKE_PUSH_REG "r12"
# define FAKE_PUSH_REG_2 "r13"
# define COMMA_FAKE_PUSH_REG ,FAKE_PUSH_REG
# define PUSH(reg) "mov " reg ", %%" FAKE_PUSH_REG
# define POP(reg) "mov %%" FAKE_PUSH_REG ", " reg
# define PUSH2(reg1,reg2) PUSH(reg1) "; mov " reg2 ", %%" FAKE_PUSH_REG_2
# define POP2(reg2,reg1) "mov %%" FAKE_PUSH_REG_2 ", " reg2 "; " POP(reg1)
#else
# define COMMA_FAKE_PUSH_REG /*nothing*/
# define PUSH(reg) "push " reg
# define POP(reg) "pop " reg
# define PUSH2(reg1,reg2) "push " reg1 "; push " reg2
# define POP2(reg2,reg1) "pop " reg2 "; pop " reg1
#endif
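/* A minimal usage sketch (illustrative only, hence the "#if 0"): saving
 * EBX across an asm block with the macros above.  On x86-64 PUSH/POP
 * expand to moves through R12, which must therefore appear in the
 * clobber list; COMMA_FAKE_PUSH_REG adds it only when needed. */
#if 0
asm(PUSH(EBX) "; "
    /* ...code that is free to clobber EBX goes here... */
    POP(EBX)
    : /* no outputs */
    : /* no inputs */
    : "eax" COMMA_FAKE_PUSH_REG);
#endif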
/* Data for isolating particular bytes. Used by the SWAP32_02, SWAP32_13
 * and REV32 SIMD macros; if you use any of those, make sure to define
 * DEFINE_MASK_DATA before including this file! */
#ifdef DEFINE_MASK_DATA
static const struct { uint32_t n[64]; } __attribute__((aligned(16))) mask_data = {{
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,
0x0000FF00, 0x0000FF00, 0x0000FF00, 0x0000FF00,
0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,
0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000,
0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,
0x00FFFF00, 0x00FFFF00, 0x00FFFF00, 0x00FFFF00,
0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF,
0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
0xFF0000FF, 0xFF0000FF, 0xFF0000FF, 0xFF0000FF,
0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00,
0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF,
0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000,
0xFFFF00FF, 0xFFFF00FF, 0xFFFF00FF, 0xFFFF00FF,
0xFFFFFF00, 0xFFFFFF00, 0xFFFFFF00, 0xFFFFFF00,
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
}};
#endif
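/* Layout note: the table is 16 entries of four identical 32-bit masks
 * (16 bytes per entry, wide enough for one SSE2 PAND operand; the MMX
 * code reads the low 8 bytes).  Entry k lives at byte offset 16*k and
 * has byte lane i set to 0xFF exactly when bit i of k is set; e.g.
 * "pand 16(EDX)" isolates byte 0 of each 32-bit unit (mask 0x000000FF)
 * and "pand 160(EDX)" isolates bytes 1 and 3 (mask 0xFF00FF00). */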
/*************************************************************************/
/* Basic assembly macros, used for odd-count loops */
/* Swap bytes in pairs of 16-bit values */
#define X86_SWAP16_2 \
"movl -4("ESI","ECX",4), %%eax \n\
movl %%eax, %%edx \n\
shll $8, %%eax \n\
andl $0xFF00FF00, %%eax \n\
shrl $8, %%edx \n\
andl $0x00FF00FF, %%edx \n\
orl %%edx, %%eax \n\
movl %%eax, -4("EDI","ECX",4)"
/* Swap words in a 32-bit value */
#define X86_SWAP32 \
"movl -4("ESI","ECX",4), %%eax \n\
roll $16, %%eax \n\
movl %%eax, -4("EDI","ECX",4)"
/* Swap bytes 0 and 2 of a 32-bit value */
#define X86_SWAP32_02 \
"movw -4("ESI","ECX",4), %%ax \n\
movw -2("ESI","ECX",4), %%dx \n\
xchg %%dl, %%al \n\
movw %%ax, -4("EDI","ECX",4) \n\
movw %%dx, -2("EDI","ECX",4)"
/* Swap bytes 1 and 3 of a 32-bit value */
#define X86_SWAP32_13 \
"movw -4("ESI","ECX",4), %%ax \n\
movw -2("ESI","ECX",4), %%dx \n\
xchg %%dh, %%ah \n\
movw %%ax, -4("EDI","ECX",4) \n\
movw %%dx, -2("EDI","ECX",4)"
/* Reverse the order of bytes in a 32-bit value */
#define X86_REV32 \
"movl -4("ESI","ECX",4), %%eax \n\
xchg %%ah, %%al \n\
roll $16, %%eax \n\
xchg %%ah, %%al \n\
movl %%eax, -4("EDI","ECX",4)"
/* The same, using the BSWAP instruction */
#define X86_REV32_BSWAP \
"movl -4("ESI","ECX",4), %%eax \n\
bswap %%eax \n\
movl %%eax, -4("EDI","ECX",4)"
/* Rotate a 32-bit value left 8 bits */
#define X86_ROL32 \
"movl -4("ESI","ECX",4), %%eax \n\
roll $8, %%eax \n\
movl %%eax, -4("EDI","ECX",4)"
/* Rotate a 32-bit value right 8 bits */
#define X86_ROR32 \
"movl -4("ESI","ECX",4), %%eax \n\
rorl $8, %%eax \n\
movl %%eax, -4("EDI","ECX",4)"
/*************************************************************************/
/* Basic assembly routines. Sizes are all given in 32-bit units and must
 * be nonzero (the loops only test the count at the bottom). */
#define ASM_SWAP16_2_X86(size) \
asm("0: "X86_SWAP16_2" \n\
subl $1, %%ecx \n\
jnz 0b" \
: /* no outputs */ \
: "S" (src[0]), "D" (dest[0]), "c" (size) \
: "eax", "edx")
#define ASM_SWAP32_X86(size) \
asm("0: "X86_SWAP32" \n\
subl $1, %%ecx \n\
jnz 0b" \
: /* no outputs */ \
: "S" (src[0]), "D" (dest[0]), "c" (size) \
: "eax", "edx")
#define ASM_SWAP32_02_X86(size) \
asm("0: "X86_SWAP32_02" \n\
subl $1, %%ecx \n\
jnz 0b" \
: /* no outputs */ \
: "S" (src[0]), "D" (dest[0]), "c" (size) \
: "eax", "edx")
#define ASM_SWAP32_13_X86(size) \
asm("0: "X86_SWAP32_13" \n\
subl $1, %%ecx \n\
jnz 0b" \
: /* no outputs */ \
: "S" (src[0]), "D" (dest[0]), "c" (size) \
: "eax", "edx")
#define ASM_REV32_X86(size) \
asm("0: "X86_REV32" \n\
subl $1, %%ecx \n\
jnz 0b" \
: /* no outputs */ \
: "S" (src[0]), "D" (dest[0]), "c" (size) \
: "eax")
#define ASM_ROL32_X86(size) \
asm("0: "X86_ROL32" \n\
subl $1, %%ecx \n\
jnz 0b" \
: /* no outputs */ \
: "S" (src[0]), "D" (dest[0]), "c" (size) \
: "eax")
#define ASM_ROR32_X86(size) \
asm("0: "X86_ROR32" \n\
subl $1, %%ecx \n\
jnz 0b" \
: /* no outputs */ \
: "S" (src[0]), "D" (dest[0]), "c" (size) \
: "eax")
/*************************************************************************/
/*************************************************************************/
/* Wrapper for SIMD loops. This generates the body of an asm() construct
* (the string only, not the input/output/clobber lists) given the data
* block size (number of data units processed per SIMD loop iteration),
* instructions to save and restore unclobberable registers (such as EBX),
 * and the bodies of the odd-count and main loops. The data count is
 * assumed to be preloaded in ECX and must be nonzero. Parameters are:
* blocksize: number of units of data processed per SIMD loop (must be
* a power of 2); can be a constant or a numerical
* expression containing only constants
* push_regs: string constant containing instructions to push registers
* that must be saved over the small loop
* pop_regs: string constant containing instructions to pop registers
* saved by `push_regs' (restored before the main loop)
 * small_loop: loop for handling data elements one at a time (when the
 * count is not a multiple of `blocksize')
* main_loop: main SIMD loop for processing data
* emms: EMMS/SFENCE instructions to end main loop with, as needed
*/
#define SIMD_LOOP_WRAPPER(blocksize,push_regs,pop_regs,small_loop,main_loop,emms) \
/* Check whether the count is a multiple of the blocksize (this \
* can cause branch mispredicts but seems to be faster overall) */ \
"testl $(("#blocksize")-1), %%ecx; " \
"jz 1f; " \
/* It's not--run the small loop to align the count */ \
push_regs"; " \
"0: " \
small_loop"; " \
"subl $1, %%ecx; " \
"testl $(("#blocksize")-1), %%ecx; " \
"jnz 0b; " \
pop_regs"; " \
/* Make sure there's some data left */ \
"testl %%ecx, %%ecx; " \
"jz 2f; " \
/* Now run the main SIMD loop */ \
"1: " \
main_loop"; " \
"subl $("#blocksize"), %%ecx; " \
"jnz 1b; " \
/* Clear MMX state and/or SFENCE, as needed */ \
emms"; " \
/* Done */ \
"2: "
/*************************************************************************/
/* MMX- and SSE2-optimized swap/rotate routines. These routines are
* identical save for data size, so we use common macros to implement them,
* with register names and data offsets replaced by parameters to the
* macros. */
#define ASM_SIMD_MMX(name,size) \
name((size), 64, \
"movq", "movq", "movq", "", \
"%%mm0", "%%mm1", "%%mm2", "%%mm3", \
"%%mm4", "%%mm5", "%%mm6", "%%mm7")
#define ASM_SIMD_SSE2(name,size) \
name((size), 128, \
"movdqu", "movdqa", "movdqu", "", \
"%%xmm0", "%%xmm1", "%%xmm2", "%%xmm3",\
"%%xmm4", "%%xmm5", "%%xmm6", "%%xmm7")
#define ASM_SIMD_SSE2_ALIGNED(name,size) \
name((size), 128, \
"movdqa", "movdqa", "movntdq", "sfence",\
"%%xmm0", "%%xmm1", "%%xmm2", "%%xmm3",\
"%%xmm4", "%%xmm5", "%%xmm6", "%%xmm7")
#define ASM_SWAP16_2_MMX(size) ASM_SIMD_MMX(ASM_SWAP16_2_SIMD,(size))
#define ASM_SWAP16_2_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP16_2_SIMD,(size))
#define ASM_SWAP16_2_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP16_2_SIMD,(size))
#define ASM_SWAP32_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_SIMD,(size))
#define ASM_SWAP32_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_SIMD,(size))
#define ASM_SWAP32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_SIMD,(size))
#define ASM_SWAP32_02_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_02_SIMD,(size))
#define ASM_SWAP32_02_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_02_SIMD,(size))
#define ASM_SWAP32_02_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_02_SIMD,(size))
#define ASM_SWAP32_13_MMX(size) ASM_SIMD_MMX(ASM_SWAP32_13_SIMD,(size))
#define ASM_SWAP32_13_SSE2(size) ASM_SIMD_SSE2(ASM_SWAP32_13_SIMD,(size))
#define ASM_SWAP32_13_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_13_SIMD,(size))
#define ASM_REV32_MMX(size) ASM_SIMD_MMX(ASM_REV32_SIMD,(size))
#define ASM_REV32_SSE2(size) ASM_SIMD_SSE2(ASM_REV32_SIMD,(size))
#define ASM_REV32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_REV32_SIMD,(size))
#define ASM_ROL32_MMX(size) ASM_SIMD_MMX(ASM_ROL32_SIMD,(size))
#define ASM_ROL32_SSE2(size) ASM_SIMD_SSE2(ASM_ROL32_SIMD,(size))
#define ASM_ROL32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_ROL32_SIMD,(size))
#define ASM_ROR32_MMX(size) ASM_SIMD_MMX(ASM_ROR32_SIMD,(size))
#define ASM_ROR32_SSE2(size) ASM_SIMD_SSE2(ASM_ROR32_SIMD,(size))
#define ASM_ROR32_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_ROR32_SIMD,(size))
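/* Dispatch sketch (illustrative only; the function name is hypothetical,
 * and the AC_MMX/AC_SSE2 flags are assumed to come from aclib's CPU
 * detection in ac.h): pick the widest variant the CPU supports, falling
 * back to plain x86.  The aligned SSE2 variants use movdqa/movntdq and
 * so require both buffers to be 16-byte aligned. */
#if 0
static void swap32(uint8_t **src, uint8_t **dest, int count, int accel)
{
    if (accel & AC_SSE2) {
        if ((((uintptr_t)src[0] | (uintptr_t)dest[0]) & 15) == 0)
            ASM_SWAP32_SSE2A(count);    /* aligned: nontemporal stores */
        else
            ASM_SWAP32_SSE2(count);     /* unaligned loads/stores */
    } else if (accel & AC_MMX) {
        ASM_SWAP32_MMX(count);
    } else {
        ASM_SWAP32_X86(count);
    }
}
#endif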
/*************************************************************************/
/* Actual implementations. Note that unrolling the SIMD loops doesn't seem
* to be a win (only 2-3% improvement at most), and in fact can lose by a
* bit in short loops. */
#define ASM_SWAP16_2_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
asm(SIMD_LOOP_WRAPPER( \
/* blocksize */ (regsize)/32, \
/* push_regs */ "", \
/* pop_regs */ "", \
/* small_loop */ X86_SWAP16_2, \
/* main_loop */ \
ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
# MM0: 7 6 5 4 3 2 1 0 \n\
"movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
psrlw $8, "MM0" # MM0: - 7 - 5 - 3 - 1 \n\
psllw $8, "MM1" # MM1: 6 - 4 - 2 - 0 - \n\
por "MM1", "MM0" # MM0: 6 7 4 5 2 3 0 1 \n\
"stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
/* emms */ "emms; "sfence) \
: /* no outputs */ \
: "S" (src[0]), "D" (dest[0]), "c" (size) \
: "eax", "edx")
#define ASM_SWAP32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
asm(SIMD_LOOP_WRAPPER( \
/* blocksize */ (regsize)/32, \
/* push_regs */ "", \
/* pop_regs */ "", \
/* small_loop */ X86_SWAP32, \
/* main_loop */ \
ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
# MM0: 7 6 5 4 3 2 1 0 \n\
"movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
psrld $16, "MM0" # MM0: - - 7 6 - - 3 2 \n\
pslld $16, "MM1" # MM1: 5 4 - - 1 0 - - \n\
por "MM1", "MM0" # MM0: 5 4 7 6 1 0 3 2 \n\
"stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
/* emms */ "emms; "sfence) \
: /* no outputs */ \
: "S" (src[0]), "D" (dest[0]), "c" (size) \
: "eax")
#define ASM_SWAP32_02_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
asm(SIMD_LOOP_WRAPPER( \
/* blocksize */ (regsize)/32, \
/* push_regs */ "push "EDX, \
/* pop_regs */ "pop "EDX, \
/* small_loop */ X86_SWAP32_02, \
/* main_loop */ \
ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
# MM0: 7 6 5 4 3 2 1 0 \n\
"movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
"movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\
pand 16("EDX"), "MM1" # MM1: - - - 4 - - - 0 \n\
pslld $16, "MM1" # MM1: - 4 - - - 0 - - \n\
pand 64("EDX"), "MM2" # MM2: - 6 - - - 2 - - \n\
psrld $16, "MM2" # MM2: - - - 6 - - - 2 \n\
pand 160("EDX"), "MM0" # MM0: 7 - 5 - 3 - 1 - \n\
por "MM1", "MM0" # MM0: 7 4 5 - 3 0 1 - \n\
por "MM2", "MM0" # MM0: 7 4 5 6 3 0 1 2 \n\
"stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
/* emms */ "emms; "sfence) \
: /* no outputs */ \
: "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \
"m" (mask_data) \
: "eax")
#define ASM_SWAP32_13_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
asm(SIMD_LOOP_WRAPPER( \
/* blocksize */ (regsize)/32, \
/* push_regs */ "push "EDX, \
/* pop_regs */ "pop "EDX, \
/* small_loop */ X86_SWAP32_13, \
/* main_loop */ \
ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
# MM0: 7 6 5 4 3 2 1 0 \n\
"movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
"movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\
pand 32("EDX"), "MM1" # MM1: - - 5 - - - 1 - \n\
pslld $16, "MM1" # MM1: 5 - - - 1 - - - \n\
pand 128("EDX"), "MM2" # MM2: 7 - - - 3 - - - \n\
psrld $16, "MM2" # MM2: - - 7 - - - 3 - \n\
pand 80("EDX"), "MM0" # MM0: - 6 - 4 - 2 - 0 \n\
por "MM1", "MM0" # MM0: 5 6 - 4 1 2 - 0 \n\
por "MM2", "MM0" # MM0: 5 6 7 4 1 2 3 0 \n\
"stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
/* emms */ "emms; "sfence) \
: /* no outputs */ \
: "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \
"m" (mask_data) \
: "eax");
#define ASM_REV32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
asm(SIMD_LOOP_WRAPPER( \
/* blocksize */ (regsize)/32, \
/* push_regs */ "", \
/* pop_regs */ "", \
/* small_loop */ X86_REV32_BSWAP, \
/* main_loop */ \
ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
# MM0: 7 6 5 4 3 2 1 0 \n\
"movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
"movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\
"movq" "MM0", "MM3" # MM3: 7 6 5 4 3 2 1 0 \n\
psrld $24, "MM0" # MM0: - - - 7 - - - 3 \n\
pand 32("EDX"), "MM2" # MM2: - - 5 - - - 1 - \n\
psrld $8, "MM1" # MM1: - 7 6 5 - 3 2 1 \n\
pand 32("EDX"), "MM1" # MM1: - - 6 - - - 2 - \n\
pslld $8, "MM2" # MM2: - 5 - - - 1 - - \n\
pslld $24, "MM3" # MM3: 4 - - - 0 - - - \n\
por "MM1", "MM0" # MM0: - - 6 7 - - 2 3 \n\
por "MM2", "MM0" # MM0: - 5 6 7 - 1 2 3 \n\
por "MM3", "MM0" # MM0: 4 5 6 7 0 1 2 3 \n\
"stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
/* emms */ "emms; "sfence) \
: /* no outputs */ \
: "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \
"m" (mask_data) \
: "eax")
#define ASM_ROL32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
asm(SIMD_LOOP_WRAPPER( \
/* blocksize */ (regsize)/32, \
/* push_regs */ "", \
/* pop_regs */ "", \
/* small_loop */ X86_ROL32, \
/* main_loop */ \
ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
# MM0: 7 6 5 4 3 2 1 0 \n\
"movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
pslld $8, "MM0" # MM0: 6 5 4 - 2 1 0 - \n\
psrld $24, "MM1" # MM1: - - - 7 - - - 3 \n\
por "MM1", "MM0" # MM0: 6 5 4 7 2 1 0 3 \n\
"stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
/* emms */ "emms; "sfence) \
: /* no outputs */ \
: "S" (src[0]), "D" (dest[0]), "c" (size) \
: "eax")
#define ASM_ROR32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
asm(SIMD_LOOP_WRAPPER( \
/* blocksize */ (regsize)/32, \
/* push_regs */ "", \
/* pop_regs */ "", \
/* small_loop */ X86_ROR32, \
/* main_loop */ \
ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\
# MM0: 7 6 5 4 3 2 1 0 \n\
"movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\
psrld $8, "MM0" # MM0: - 7 6 5 - 3 2 1 \n\
pslld $24, "MM1" # MM1: 4 - - - 0 - - - \n\
por "MM1", "MM0" # MM0: 4 7 6 5 0 3 2 1 \n\
"stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
/* emms */ "emms; "sfence) \
: /* no outputs */ \
: "S" (src[0]), "D" (dest[0]), "c" (size) \
: "eax")
/*************************************************************************/
/* SSE2 macros to load 8 24- or 32-bit RGB pixels into XMM0/1/2 (R/G/B) as
 * 16-bit values, used for RGB->YUV and RGB->grayscale conversions.
 * ZERO is the number of the XMM register containing all zeroes; it must
 * not be 0 through 2, which are overwritten. See the usage sketch at the
 * end of this section. */
#define SSE2_LOAD_RGB24(ZERO) \
"movl -21("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm0 # XMM0: ----- ----- ----- xBGR1 \n\
pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR1 ----- ----- ----- \n\
movl -18("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm2 \n\
por %%xmm2, %%xmm0 # XMM0: xBGR1 ----- ----- xBGR2 \n\
pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR2 xBGR1 ----- ----- \n\
movl -15("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm2 \n\
por %%xmm2, %%xmm0 # XMM0: xBGR2 xBGR1 ----- xBGR3 \n\
pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR3 xBGR2 xBGR1 ----- \n\
movl -24("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm2 \n\
por %%xmm2, %%xmm0 # XMM0: xBGR3 xBGR2 xBGR1 xBGR0 \n\
movl -9("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm1 # XMM1: ----- ----- ----- xBGR5 \n\
pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR5 ----- ----- ----- \n\
movl -6("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm2 \n\
por %%xmm2, %%xmm1 # XMM1: xBGR5 ----- ----- xBGR6 \n\
pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR6 xBGR5 ----- ----- \n\
movl -3("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm2 \n\
por %%xmm2, %%xmm1 # XMM1: xBGR6 xBGR5 ----- xBGR7 \n\
pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR7 xBGR6 xBGR5 ----- \n\
movl -12("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm2 \n\
por %%xmm2, %%xmm1 # XMM1: xBGR7 xBGR6 xBGR5 xBGR4 \n"\
SSE2_MASSAGE_RGBA32(ZERO)
#define SSE2_LOAD_BGR24(ZERO) \
"movl -21("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm0 # XMM0: ----- ----- ----- xRGB1 \n\
pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB1 ----- ----- ----- \n\
movl -18("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm2 \n\
por %%xmm2, %%xmm0 # XMM0: xRGB1 ----- ----- xRGB2 \n\
pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB2 xRGB1 ----- ----- \n\
movl -15("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm2 \n\
por %%xmm2, %%xmm0 # XMM0: xRGB2 xRGB1 ----- xRGB3 \n\
pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB3 xRGB2 xRGB1 ----- \n\
movl -24("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm2 \n\
por %%xmm2, %%xmm0 # XMM0: xRGB3 xRGB2 xRGB1 xRGB0 \n\
movl -9("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm1 # XMM1: ----- ----- ----- xRGB5 \n\
pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB5 ----- ----- ----- \n\
movl -6("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm2 \n\
por %%xmm2, %%xmm1 # XMM1: xRGB5 ----- ----- xRGB6 \n\
pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB6 xRGB5 ----- ----- \n\
movl -3("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm2 \n\
por %%xmm2, %%xmm1 # XMM1: xRGB6 xRGB5 ----- xRGB7 \n\
pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB7 xRGB6 xRGB5 ----- \n\
movl -12("ESI","EBX"), %%eax \n\
movd %%eax, %%xmm2 \n\
por %%xmm2, %%xmm1 # XMM1: xRGB7 xRGB6 xRGB5 xRGB4 \n"\
SSE2_MASSAGE_BGRA32(ZERO)
#define SSE2_LOAD_RGBA32(ZERO) "\
movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: ABGR3 ABGR2 ABGR1 ABGR0 \n\
movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: ABGR7 ABGR6 ABGR5 ABGR4 \n"\
SSE2_MASSAGE_RGBA32(ZERO)
#define SSE2_MASSAGE_RGBA32(ZERO) "\
movdqa %%xmm0, %%xmm2 # XMM2: ABGR3 ABGR2 ABGR1 ABGR0 \n\
punpcklbw %%xmm1, %%xmm0 # X0.l: A4 A0 B4 B0 G4 G0 R4 R0 \n\
punpckhbw %%xmm1, %%xmm2 # X2.l: A6 A2 B6 B2 G6 G2 R6 R2 \n\
movdqa %%xmm0, %%xmm1 # X1.l: A4 A0 B4 B0 G4 G0 R4 R0 \n\
punpcklbw %%xmm2, %%xmm0 # X0.l: G6 G4 G2 G0 R6 R4 R2 R0 \n\
punpckhbw %%xmm2, %%xmm1 # X1.l: G7 G5 G3 G1 R7 R5 R3 R1 \n\
movdqa %%xmm0, %%xmm2 # X2.l: G6 G4 G2 G0 R6 R4 R2 R0 \n\
punpcklbw %%xmm1, %%xmm0 # XMM0: G7.......G0 R7.......R0 \n\
punpckhbw %%xmm1, %%xmm2 # XMM2: A7.......A0 B7.......B0 \n\
movdqa %%xmm0, %%xmm1 # XMM1: G7.......G0 R7.......R0 \n\
punpcklbw %%xmm"#ZERO", %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpckhbw %%xmm"#ZERO", %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%xmm"#ZERO", %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"
#define SSE2_LOAD_BGRA32(ZERO) "\
movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: ARGB3 ARGB2 ARGB1 ARGB0 \n\
movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: ARGB7 ARGB6 ARGB5 ARGB4 \n"\
SSE2_MASSAGE_BGRA32(ZERO)
#define SSE2_MASSAGE_BGRA32(ZERO) "\
movdqa %%xmm0, %%xmm2 # XMM2: ARGB3 ARGB2 ARGB1 ARGB0 \n\
punpcklbw %%xmm1, %%xmm2 # X2.l: A4 A0 R4 R0 G4 G0 B4 B0 \n\
punpckhbw %%xmm1, %%xmm0 # X0.l: A6 A2 R6 R2 G6 G2 B6 B2 \n\
movdqa %%xmm2, %%xmm1 # X1.l: A4 A0 R4 R0 G4 G0 B4 B0 \n\
punpcklbw %%xmm0, %%xmm2 # X2.l: G6 G4 G2 G0 B6 B4 B2 B0 \n\
punpckhbw %%xmm0, %%xmm1 # X1.l: G7 G5 G3 G1 B7 B5 B3 B1 \n\
movdqa %%xmm2, %%xmm0 # X0.l: G6 G4 G2 G0 B6 B4 B2 B0 \n\
punpcklbw %%xmm1, %%xmm2 # XMM2: G7.......G0 B7.......B0 \n\
punpckhbw %%xmm1, %%xmm0 # XMM0: A7.......A0 R7.......R0 \n\
movdqa %%xmm2, %%xmm1 # XMM1: G7.......G0 B7.......B0 \n\
punpcklbw %%xmm"#ZERO", %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpckhbw %%xmm"#ZERO", %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpcklbw %%xmm"#ZERO", %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"
#define SSE2_LOAD_ARGB32(ZERO) "\
movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: BGRA3 BGRA2 BGRA1 BGRA0 \n\
movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: BGRA7 BGRA6 BGRA5 BGRA4 \n"\
SSE2_MASSAGE_ARGB32(ZERO)
#define SSE2_MASSAGE_ARGB32(ZERO) "\
movdqa %%xmm0, %%xmm2 # XMM2: BGRA3 BGRA2 BGRA1 BGRA0 \n\
punpcklbw %%xmm1, %%xmm0 # X0.l: B4 B0 G4 G0 R4 R0 A4 A0 \n\
punpckhbw %%xmm1, %%xmm2 # X2.l: B6 B2 G6 G2 R6 R2 A6 A2 \n\
movdqa %%xmm0, %%xmm1 # X1.l: B4 B0 G4 G0 R4 R0 A4 A0 \n\
punpcklbw %%xmm2, %%xmm0 # X0.l: R6 R4 R2 R0 A6 A4 A2 A0 \n\
punpckhbw %%xmm2, %%xmm1 # X1.l: R7 R5 R3 R1 A7 A5 A3 A1 \n\
movdqa %%xmm0, %%xmm2 # X2.l: R6 R4 R2 R0 A6 A4 A2 A0 \n\
punpcklbw %%xmm1, %%xmm0 # XMM0: R7.......R0 A7.......A0 \n\
punpckhbw %%xmm1, %%xmm2 # XMM2: B7.......B0 G7.......G0 \n\
movdqa %%xmm2, %%xmm1 # XMM1: B7.......B0 G7.......G0 \n\
punpckhbw %%xmm"#ZERO", %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%xmm"#ZERO", %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpckhbw %%xmm"#ZERO", %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"
#define SSE2_LOAD_ABGR32(ZERO) "\
movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: RGBA3 RGBA2 RGBA1 RGBA0 \n\
movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: RGBA7 RGBA6 RGBA5 RGBA4 \n"\
SSE2_MASSAGE_ABGR32(ZERO)
#define SSE2_MASSAGE_ABGR32(ZERO) "\
movdqa %%xmm0, %%xmm2 # XMM2: RGBA3 RGBA2 RGBA1 RGBA0 \n\
punpcklbw %%xmm1, %%xmm2 # X2.l: R4 R0 G4 G0 B4 B0 A4 A0 \n\
punpckhbw %%xmm1, %%xmm0 # X0.l: R6 R2 G6 G2 B6 B2 A6 A2 \n\
movdqa %%xmm2, %%xmm1 # X1.l: R4 R0 G4 G0 B4 B0 A4 A0 \n\
punpcklbw %%xmm0, %%xmm2 # X2.l: B6 B4 B2 B0 A6 A4 A2 A0 \n\
punpckhbw %%xmm0, %%xmm1 # X1.l: B7 B5 B3 B1 A7 A5 A3 A1 \n\
movdqa %%xmm2, %%xmm0 # X0.l: B6 B4 B2 B0 A6 A4 A2 A0 \n\
punpcklbw %%xmm1, %%xmm2 # XMM2: B7.......B0 A7.......A0 \n\
punpckhbw %%xmm1, %%xmm0 # XMM0: R7.......R0 G7.......G0 \n\
movdqa %%xmm0, %%xmm1 # XMM1: R7.......R0 G7.......G0 \n\
punpckhbw %%xmm"#ZERO", %%xmm0 # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
punpcklbw %%xmm"#ZERO", %%xmm1 # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
punpckhbw %%xmm"#ZERO", %%xmm2 # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"
/*************************************************************************/
#endif /* ACLIB_IMG_X86_COMMON_H */
/*
* Local variables:
* c-file-style: "stroustrup"
* c-file-offsets: ((case-label . *) (statement-case-intro . *))
* indent-tabs-mode: nil
* End:
*
* vim: expandtab shiftwidth=4:
*/