extra-dependencies/debian/transcode/transcode-1.1.7/aclib/memcpy.c

/*
 * memcpy.c - optimized memcpy() routines for aclib
 * Written by Andrew Church <achurch@achurch.org>
 *
 * This file is part of transcode, a video stream processing tool.
 * transcode is free software, distributable under the terms of the GNU
 * General Public License (version 2 or later).  See the file COPYING
 * for details.
 */

#include "ac.h"
#include "ac_internal.h"
#include <string.h>

/* Copy routine that ac_memcpy() dispatches to.  It starts out as memmove()
 * rather than memcpy(), because memcpy() isn't guaranteed to copy in
 * ascending order. */
static void *(*memcpy_ptr)(void *dest, const void *src, size_t size) = &memmove;

/*************************************************************************/

/* External interface */

/* Copy `size' bytes from `src' to `dest' with whichever routine is
 * currently installed in memcpy_ptr, and hand back that routine's return
 * value. */
void *ac_memcpy(void *dest, const void *src, size_t size)
{
    void *result = memcpy_ptr(dest, src, size);

    return result;
}

/*************************************************************************/
/*************************************************************************/

/* Note the check for ARCH_X86 here: this is to prevent compilation of this
 * code on x86_64, since all x86_64 processors support SSE2, and because
 * this code is not set up to use the 64-bit registers for addressing on
 * x86_64. */

#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)

/* MMX-optimized routine, intended for PMMX/PII processors.
 * Nonstandard instructions used:
 *     (CPUID.MMX)   MOVQ
 */

static void *memcpy_mmx(void *dest, const void *src, size_t bytes)
{
    asm("\
PENTIUM_LINE_SIZE = 32          # PMMX/PII cache line size              \n\
PENTIUM_CACHE_SIZE = 8192       # PMMX/PII total cache size             \n\
# Use only half because writes may touch the cache too (PII)            \n\
PENTIUM_CACHE_BLOCK = (PENTIUM_CACHE_SIZE/2 - PENTIUM_LINE_SIZE)        \n\
                                                                        \n\
        push %%ebx              # Save PIC register                     \n\
        push %%edi              # Save destination for return value     \n\
        cld                     # MOVS* should ascend                   \n\
                                                                        \n\
        mov $64, %%ebx          # Constant                              \n\
                                                                        \n\
        cmp %%ebx, %%ecx                                                \n\
        jb mmx.memcpy_last      # Just use movs if <64 bytes            \n\
                                                                        \n\
        # First align destination address to a multiple of 8 bytes      \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7                   \n\
        sub %%edi, %%eax                                                \n\
        and $7, %%eax           # ... which is the number of bytes to copy\n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS  // Because "lea 0f" requires a textrel
"       xchg %%eax, %%ecx                                               \n\
        mov %%ecx, %%edx                                                \n\
        repz movsb                                                      \n\
        mov %%eax, %%ecx                                                \n\
        mov %%edx, %%eax                                                \n"
#else
"       lea 0f, %%edx           # Use a computed jump--faster than a loop\n\
        sub %%eax, %%edx                                                \n\
        jmp *%%edx              # Execute 0-7 MOVSB's                   \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n"
#endif
"0:     sub %%eax, %%ecx        # Update count                          \n\
                                                                        \n\
        # Now copy data in blocks                                       \n\
0:      mov %%ecx, %%edx        # EDX <- ECX >> 6 (cache lines to copy) \n\
        shr $6, %%edx                                                   \n\
        jz mmx.memcpy_last      # <64 bytes left?  Skip to end          \n\
        cmp $PENTIUM_CACHE_BLOCK/64, %%edx                              \n\
        jb 1f                   # Limit size of block                   \n\
        mov $PENTIUM_CACHE_BLOCK/64, %%edx                              \n\
1:      mov %%edx, %%eax        # EAX <- EDX << 6 (bytes to copy)       \n\
        shl $6, %%eax                                                   \n\
        sub %%eax, %%ecx        # Update remaining count                \n\
        add %%eax, %%esi        # Point to end of region to be block-copied\n\
2:      test %%eax, -32(%%esi)  # Touch each cache line in reverse order\n\
        test %%eax, -64(%%esi)                                          \n\
        sub %%ebx, %%esi        # Update pointer                        \n\
        sub %%ebx, %%eax        # And loop                              \n\
        jnz 2b                                                          \n\
        # Note that ESI now points to the beginning of the block        \n\
3:      movq   (%%esi), %%mm0   # Do the actual copy, 64 bytes at a time\n\
        movq  8(%%esi), %%mm1                                           \n\
        movq 16(%%esi), %%mm2                                           \n\
        movq 24(%%esi), %%mm3                                           \n\
        movq 32(%%esi), %%mm4                                           \n\
        movq 40(%%esi), %%mm5                                           \n\
        movq 48(%%esi), %%mm6                                           \n\
        movq 56(%%esi), %%mm7                                           \n\
        movq %%mm0,   (%%edi)                                           \n\
        movq %%mm1,  8(%%edi)                                           \n\
        movq %%mm2, 16(%%edi)                                           \n\
        movq %%mm3, 24(%%edi)                                           \n\
        movq %%mm4, 32(%%edi)                                           \n\
        movq %%mm5, 40(%%edi)                                           \n\
        movq %%mm6, 48(%%edi)                                           \n\
        movq %%mm7, 56(%%edi)                                           \n\
        add %%ebx, %%esi        # Update pointers                       \n\
        add %%ebx, %%edi                                                \n\
        dec %%edx               # And loop                              \n\
        jnz 3b                                                          \n\
        jmp 0b                                                          \n\
                                                                        \n\
mmx.memcpy_last:                                                        \n\
        # Copy last <64 bytes, using the computed jump trick            \n\
        mov %%ecx, %%eax        # EAX <- ECX>>2                         \n\
        shr $2, %%eax                                                   \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       xchg %%eax, %%ecx                                               \n\
        repz movsd                                                      \n\
        mov %%eax, %%ecx                                                \n"
#else
"       lea 0f, %%edx                                                   \n\
        sub %%eax, %%edx                                                \n\
        jmp *%%edx              # Execute 0-15 MOVSD's                  \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n"
#endif
"0:     and $3, %%ecx           # ECX <- ECX & 3                        \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       repz movsb                                                      \n"
#else
"       lea 0f, %%edx                                                   \n\
        sub %%ecx, %%edx                                                \n\
        jmp *%%edx              # Execute 0-3 MOVSB's                   \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n"
#endif
"0:                                                                     \n\
        # All done!                                                     \n\
        emms                    # Clean up MMX state                    \n\
        pop %%edi               # Restore destination (return value)    \n\
        pop %%ebx               # Restore PIC register                  \n\
    " : /* no outputs */
      : "D" (dest), "S" (src), "c" (bytes)
      : "%eax", "%edx"
    );
    return dest;
}

#endif  /* HAVE_ASM_MMX && ARCH_X86 */

/*************************************************************************/

#if defined(HAVE_ASM_SSE) && defined(ARCH_X86)

/* SSE-optimized routine.  Backported from AMD64 routine below.
 * Nonstandard instructions used:
 *     (CPUID.CMOVE) CMOVA
 *     (CPUID.MMX)   MOVQ
 *     (CPUID.SSE)   MOVNTQ
 */

static void *memcpy_sse(void *dest, const void *src, size_t bytes)
{
    asm("\
        push %%ebx              # Save PIC register                     \n\
        push %%edi              # Save destination for return value     \n\
        cld                     # MOVS* should ascend                   \n\
                                                                        \n\
        cmp $64, %%ecx          # Skip block copy for small blocks      \n\
        jb sse.memcpy_last                                              \n\
                                                                        \n\
        mov $128, %%ebx         # Constant used later                   \n\
                                                                        \n\
        # First align destination address to a multiple of 8 bytes      \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7                   \n\
        sub %%edi, %%eax                                                \n\
        and $7, %%eax           # ... which is the number of bytes to copy\n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       xchg %%eax, %%ecx                                               \n\
        mov %%ecx, %%edx                                                \n\
        repz movsb                                                      \n\
        mov %%eax, %%ecx                                                \n\
        mov %%edx, %%eax                                                \n"
#else
"       lea 0f, %%edx           # Use a computed jump--faster than a loop\n\
        sub %%eax, %%edx                                                \n\
        jmp *%%edx              # Execute 0-7 MOVSB's                   \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n"
#endif
"0:     sub %%eax, %%ecx        # Update count                          \n\
                                                                        \n\
        cmp $0x10040, %%ecx     # Is this a large block? (0x10040 is an \n\
                                # arbitrary value where prefetching and \n\
                                # write combining seem to start becoming\n\
                                # faster)                               \n\
        jae sse.memcpy_bp       # Yup, use prefetch copy                \n\
                                                                        \n\
sse.memcpy_small:               # Small block copy routine--no prefetch \n"
#if 0
"       mov %%ecx, %%edx        # EDX <- bytes to copy / 8              \n\
        shr $3, %%edx                                                   \n\
        mov %%edx, %%eax        # Leave remainder in ECX for later      \n\
        shl $3, %%eax                                                   \n\
        sub %%eax, %%ecx                                                \n\
        .balign 16                                                      \n\
0:      movq (%%esi), %%mm0     # Copy 8 bytes of data                  \n\
        movq %%mm0, (%%edi)                                             \n\
        add $8, %%esi           # Update pointers                       \n\
        add $8, %%edi                                                   \n\
        dec %%edx               # And loop                              \n\
        jg 0b                                                           \n\
        jmp sse.memcpy_last     # Copy any remaining bytes              \n\
                                                                        \n\
        nop                     # Align loops below                     \n"
#else
"       # It appears that a simple rep movs is faster than cleverness   \n\
        # with movq...                                                  \n\
        mov %%ecx, %%edx        # EDX <- ECX & 3                        \n\
        and $3, %%edx                                                   \n\
        shr $2, %%ecx           # ECX <- ECX >> 2                       \n\
        rep movsl               # Copy away!                            \n\
        mov %%edx, %%ecx        # Take care of last 0-3 bytes           \n\
        rep movsb                                                       \n\
        jmp sse.memcpy_end      # And exit                              \n\
                                                                        \n\
        .balign 16                                                      \n\
        nop                                                             \n\
        nop                                                             \n"
#endif
"sse.memcpy_bp:                 # Block prefetch copy routine           \n\
0:      mov %%ecx, %%edx        # EDX: temp counter                     \n\
        shr $6, %%edx           # Divide by cache line size (64 bytes)  \n\
        cmp %%ebx, %%edx        # ... and cap at 128 (8192 bytes)       \n\
        cmova %%ebx, %%edx                                              \n\
        shl $3, %%edx           # EDX <- cache lines to copy * 8        \n\
        mov %%edx, %%eax        # EAX <- cache lines to preload * 8     \n\
                                #        (also used as memory offset)   \n\
1:      test %%eax, -64(%%esi,%%eax,8)  # Preload cache lines in pairs  \n\
        test %%eax, -128(%%esi,%%eax,8) # (going backwards)             \n\
        # (note that test %%eax,... seems to be faster than prefetchnta \n\
        #  on x86)                                                      \n\
        sub $16, %%eax          # And loop                              \n\
        jg 1b                                                           \n\
                                                                        \n\
        # Then copy--forward, which seems to be faster than reverse for \n\
        # certain alignments                                            \n\
        xor %%eax, %%eax                                                \n\
2:      movq (%%esi,%%eax,8), %%mm0 # Copy 8 bytes and loop             \n\
        movntq %%mm0, (%%edi,%%eax,8)                                   \n\
        inc %%eax                                                       \n\
        cmp %%edx, %%eax                                                \n\
        jb 2b                                                           \n\
                                                                        \n\
        # Finally, update pointers and count, and loop                  \n\
        shl $3, %%edx           # EDX <- bytes copied                   \n\
        add %%edx, %%esi                                                \n\
        add %%edx, %%edi                                                \n\
        sub %%edx, %%ecx                                                \n\
        cmp $64, %%ecx          # At least one cache line left?         \n\
        jae 0b                  # Yup, loop                             \n\
                                                                        \n\
sse.memcpy_last:                                                        \n\
        # Copy last <64 bytes, using the computed jump trick            \n\
        mov %%ecx, %%eax        # EAX <- ECX>>2                         \n\
        shr $2, %%eax                                                   \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       xchg %%eax, %%ecx                                               \n\
        repz movsd                                                      \n\
        mov %%eax, %%ecx                                                \n"
#else
"       lea 0f, %%edx                                                   \n\
        sub %%eax, %%edx                                                \n\
        jmp *%%edx              # Execute 0-15 MOVSD's                  \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n"
#endif
"0:     and $3, %%ecx           # ECX <- ECX & 3                        \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       repz movsb                                                      \n"
#else
"       lea sse.memcpy_end, %%edx                                       \n\
        sub %%ecx, %%edx                                                \n\
        jmp *%%edx              # Execute 0-3 MOVSB's                   \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n"
#endif
"                                                                       \n\
sse.memcpy_end:                                                         \n\
        # All done!                                                     \n\
        emms                    # Clean up after MMX instructions       \n\
        sfence                  # Flush the write buffer                \n\
        pop %%edi               # Restore destination (return value)    \n\
        pop %%ebx               # Restore PIC register                  \n\
    " : /* no outputs */
      : "D" (dest), "S" (src), "c" (bytes)
      : "%eax", "%edx"
    );
    return dest;
}

#endif  /* HAVE_ASM_SSE && ARCH_X86 */

/*************************************************************************/

#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64)

/* AMD64-optimized routine, using SSE2.  Derived from AMD64 optimization
 * guide section 5.13: Appropriate Memory Copying Routines.
 * Nonstandard instructions used:
 *     (CPUID.CMOVE) CMOVA
 *     (CPUID.SSE2)  MOVDQA, MOVDQU, MOVNTDQ
 *
 * Note that this routine will also run more or less as-is (modulo register
 * names and label(%%rip) references) on x86 CPUs, but tests have shown the
 * SSE1 version above to be faster.
 */

/* The block copying code--macroized because we use two versions of it
 * depending on whether the source is 16-byte-aligned or not.  Pass either
 * movdqa or movdqu (unquoted) for the parameter.
 *
 * Register contract (established by memcpy_amd64() below):
 *   on entry: RSI = source, RDI = destination (16-byte aligned, as required
 *             by MOVNTDQ), RCX = bytes remaining (>= 64), RBX = 128
 *   on exit:  RSI/RDI advanced past the copied data, RCX < 64
 *   scratch:  RAX, RDX, XMM0
 *
 * The per-iteration line count is computed from the full 64-bit RCX.  Using
 * only ECX would yield a count of zero once the low 32 bits of a >= 4 GiB
 * length drop below 64, after which the pointers never advance and the
 * "jae 0b" loop never terminates. */
#define AMD64_BLOCK_MEMCPY(movdq) \
"       # First prefetch (note that if we end on an odd number of cache \n\
        # lines, we skip prefetching the last one--faster that way than \n\
        # prefetching line by line or treating it as a special case)    \n\
0:      mov %%rcx, %%rdx        # RDX: temp counter (full 64-bit count) \n\
        shr $6, %%rdx           # Divide by cache line size (64 bytes)  \n\
        cmp %%rbx, %%rdx        # ... and cap at 128 (8192 bytes)       \n\
        cmova %%rbx, %%rdx      # (RDX now fits in 32 bits)             \n\
        shl $3, %%edx           # EDX <- cache lines to copy * 8        \n\
        mov %%edx, %%eax        # EAX <- cache lines to preload * 8     \n\
                                #        (also used as memory offset)   \n\
1:      prefetchnta -64(%%rsi,%%rax,8)  # Preload cache lines in pairs  \n\
        prefetchnta -128(%%rsi,%%rax,8) # (going backwards)             \n\
        sub $16, %%eax          # And loop                              \n\
        jg 1b                                                           \n\
                                                                        \n\
        # Then copy--forward, which seems to be faster than reverse for \n\
        # certain alignments                                            \n\
        xor %%eax, %%eax                                                \n\
2:      " #movdq " (%%rsi,%%rax,8), %%xmm0 # Copy 16 bytes and loop     \n\
        movntdq %%xmm0, (%%rdi,%%rax,8)                                 \n\
        add $2, %%eax                                                   \n\
        cmp %%edx, %%eax                                                \n\
        jb 2b                                                           \n\
                                                                        \n\
        # Finally, update pointers and count, and loop                  \n\
        shl $3, %%edx           # EDX <- bytes copied                   \n\
        add %%rdx, %%rsi                                                \n\
        add %%rdx, %%rdi                                                \n\
        sub %%rdx, %%rcx                                                \n\
        cmp $64, %%rcx          # At least one cache line left?         \n\
        jae 0b                  # Yup, loop                             \n"

/* Copy `bytes' bytes from `src' to `dest' in ascending address order and
 * return `dest'.
 *
 * Strategy:
 *   - fewer than 64 bytes: MOVSQ/MOVSB tail only;
 *   - otherwise the destination is aligned to 16 bytes (0-7 MOVSB's, then
 *     one MOVSQ if needed); if fewer than 0x38000 bytes remain, they are
 *     copied 16 bytes at a time with MOVDQU loads and MOVDQA stores;
 *   - 0x38000 bytes or more: AMD64_BLOCK_MEMCPY is used, with MOVDQA loads
 *     when the source is also 16-byte aligned and MOVDQU loads otherwise;
 *   - whatever is left (<64 bytes) goes through the tail.
 *
 * Compiler interface notes:
 *   - Nothing is pushed on the stack: this is a leaf function, so on the
 *     System V x86-64 ABI the compiler may keep locals in the red zone
 *     below RSP, and a PUSH inside the asm would overwrite them.  The
 *     return value comes from the C variable `dest' instead of a
 *     saved/restored RDI.
 *   - The asm advances RDI and RSI and consumes RCX, so all three are
 *     passed as read-write operands bound to scratch copies.
 *   - RAX, RBX, RDX and XMM0 are used as scratch and listed as clobbers;
 *     "memory" is clobbered because the asm reads *src and writes *dest,
 *     and "cc" documents that the flags are destroyed.
 *   - The asm is spelled __asm__ so that it is also accepted when compiling
 *     in a strict ISO C mode.
 *   - The asm defines named labels (amd64.memcpy_*), so it must only be
 *     emitted once per object file (i.e. this function must not be inlined
 *     or cloned).
 */
static void *memcpy_amd64(void *dest, const void *src, size_t bytes)
{
    /* Scratch copies for the registers the asm is allowed to modify. */
    void *d = dest;
    const void *s = src;
    size_t n = bytes;

    __asm__ volatile("\
        cld                     # MOVS* should ascend                   \n\
                                                                        \n\
        cmp $64, %%rcx          # Skip block copy for small blocks      \n\
        jb amd64.memcpy_last                                            \n\
                                                                        \n\
        mov $128, %%ebx         # Constant used later (zero-extends RBX)\n\
                                                                        \n\
        # First align destination address to a multiple of 16 bytes     \n\
        # (to 8 bytes with MOVSB's, then to 16 with one MOVSQ)          \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7                   \n\
        sub %%edi, %%eax        # (we don't care about the top 32 bits) \n\
        and $7, %%eax           # ... which is the number of bytes to copy\n\
        lea 0f(%%rip), %%rdx    # Use a computed jump--faster than a loop\n\
        sub %%rax, %%rdx                                                \n\
        jmp *%%rdx              # Execute 0-7 MOVSB's                   \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
0:      sub %%rax, %%rcx        # Update count                          \n\
        test $8, %%edi          # Is destination not 16-byte aligned?   \n\
        je 1f                                                           \n\
        movsq                   # Then move 8 bytes to align it         \n\
        sub $8, %%rcx                                                   \n\
                                                                        \n\
1:      cmp $0x38000, %%rcx     # Is this a large block? (0x38000 is an \n\
                                # arbitrary value where prefetching and \n\
                                # write combining seem to start becoming\n\
                                # faster)                               \n\
        jb amd64.memcpy_small   # Nope, use small copy (no prefetch/WC) \n\
        test $15, %%esi         # Is source also 16-byte aligned?       \n\
                                # (use ESI to save a REX prefix byte)   \n\
        jnz amd64.memcpy_normal_bp # Nope, use slow copy                \n\
        jmp amd64.memcpy_fast_bp # Yup, use fast copy                   \n\
                                                                        \n\
amd64.memcpy_small:             # Small block copy routine--no prefetch \n\
        mov %%ecx, %%edx        # EDX <- bytes to copy / 16             \n\
        shr $4, %%edx           # (count known to fit in 32 bits)       \n\
        mov %%edx, %%eax        # Leave remainder in ECX for later      \n\
        shl $4, %%eax                                                   \n\
        sub %%eax, %%ecx                                                \n\
        .balign 16                                                      \n\
0:      movdqu (%%rsi), %%xmm0  # Copy 16 bytes of data                 \n\
        movdqa %%xmm0, (%%rdi)                                          \n\
        add $16, %%rsi          # Update pointers                       \n\
        add $16, %%rdi                                                  \n\
        dec %%edx               # And loop                              \n\
        jnz 0b                                                          \n\
        jmp amd64.memcpy_last   # Copy any remaining bytes              \n\
                                                                        \n\
        .balign 16                                                      \n\
        nop                                                             \n\
        nop                                                             \n\
amd64.memcpy_fast_bp:           # Fast block prefetch loop              \n"
AMD64_BLOCK_MEMCPY(movdqa)
"       jmp amd64.memcpy_last   # Copy any remaining bytes              \n\
                                                                        \n\
        .balign 16                                                      \n\
        nop                                                             \n\
        nop                                                             \n\
amd64.memcpy_normal_bp:         # Normal (unaligned) block prefetch loop\n"
AMD64_BLOCK_MEMCPY(movdqu)
"                                                                       \n\
amd64.memcpy_last:                                                      \n\
        # Copy last <64 bytes, using the computed jump trick            \n\
        mov %%ecx, %%eax        # EAX <- ECX>>3                         \n\
        shr $3, %%eax                                                   \n\
        lea 0f(%%rip), %%rdx                                            \n\
        add %%eax, %%eax        # Watch out, MOVSQ is 2 bytes!          \n\
        sub %%rax, %%rdx                                                \n\
        jmp *%%rdx              # Execute 0-7 MOVSQ's                   \n\
        movsq                                                           \n\
        movsq                                                           \n\
        movsq                                                           \n\
        movsq                                                           \n\
        movsq                                                           \n\
        movsq                                                           \n\
        movsq                                                           \n\
0:      and $7, %%ecx           # ECX <- ECX & 7                        \n\
        lea 0f(%%rip), %%rdx                                            \n\
        sub %%rcx, %%rdx                                                \n\
        jmp *%%rdx              # Execute 0-7 MOVSB's                   \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
0:                                                                      \n\
        # All done!                                                     \n\
        emms                    # Kept from the original (no MMX regs   \n\
                                # are actually used by this routine)    \n\
        sfence                  # Flush the write buffer                \n\
    " : "+D" (d), "+S" (s), "+c" (n)
      : /* no input-only operands */
      : "%rax", "%rbx", "%rdx", "%xmm0", "memory", "cc"
    );
    return dest;
}

#endif  /* HAVE_ASM_SSE2 && ARCH_X86_64 */

/*************************************************************************/

/* Initialization routine. */

/*
 * ac_memcpy_init: choose the copy routine that ac_memcpy() dispatches to.
 *
 * accel is tested with HAS_ACCEL() against the AC_* flag sets required by
 * each routine compiled into this build.  The candidates are examined in
 * a fixed order and each match replaces the previous pick, so when several
 * match the last one examined wins.  If none match (or none are compiled
 * in), memmove is used.
 *
 * Always returns 1.
 */
int ac_memcpy_init(int accel)
{
    /* Start from the generic fallback and upgrade from there. */
    void *(*selected)(void *, const void *, size_t) = memmove;

#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
    if (HAS_ACCEL(accel, AC_MMX)) {
        selected = memcpy_mmx;
    }
#endif

#if defined(HAVE_ASM_SSE) && defined(ARCH_X86)
    /* Needs both CMOVE and SSE; overrides the MMX pick when available. */
    if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE)) {
        selected = memcpy_sse;
    }
#endif

#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64)
    /* Needs both CMOVE and SSE2. */
    if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE2)) {
        selected = memcpy_amd64;
    }
#endif

    /* Publish the final choice in one store. */
    memcpy_ptr = selected;
    return 1;
}

/*************************************************************************/

/*
 * Local variables:
 *   c-file-style: "stroustrup"
 *   c-file-offsets: ((case-label . *) (statement-case-intro . *))
 *   indent-tabs-mode: nil
 * End:
 *
 * vim: expandtab shiftwidth=4:
 */
Added debian extra dependency packages. Signed-off-by: Michele Calgaro <michele.calgaro@yahoo.it> 4 years ago			`/*`
			`* memcpy.c - optimized memcpy() routines for aclib`
			`* Written by Andrew Church <achurch@achurch.org>`
			`*`
			`* This file is part of transcode, a video stream processing tool.`
			`* transcode is free software, distributable under the terms of the GNU`
			`* General Public License (version 2 or later). See the file COPYING`
			`* for details.`
			`*/`

			`#include "ac.h"`
			`#include "ac_internal.h"`
			`#include <string.h>`

			`/* Use memmove because memcpy isn't guaranteed to be ascending */`
			`static void *(*memcpy_ptr)(void *, const void *, size_t) = memmove;`

			`/*************************************************************************/`

			`/* External interface */`

			`void *ac_memcpy(void *dest, const void *src, size_t size)`
			`{`
			`return (*memcpy_ptr)(dest, src, size);`
			`}`

			`/*************************************************************************/`
			`/*************************************************************************/`

			`/* Note the check for ARCH_X86 here: this is to prevent compilation of this`
			`* code on x86_64, since all x86_64 processors support SSE2, and because`
			`* this code is not set up to use the 64-bit registers for addressing on`
			`* x86_64. */`

			`#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)`

			`/* MMX-optimized routine, intended for PMMX/PII processors.`
			`* Nonstandard instructions used:`
			`* (CPUID.MMX) MOVQ`
			`*/`

			`static void *memcpy_mmx(void *dest, const void *src, size_t bytes)`
			`{`
			`asm("\`
			`PENTIUM_LINE_SIZE = 32 # PMMX/PII cache line size \n\`
			`PENTIUM_CACHE_SIZE = 8192 # PMMX/PII total cache size \n\`
			`# Use only half because writes may touch the cache too (PII) \n\`
			`PENTIUM_CACHE_BLOCK = (PENTIUM_CACHE_SIZE/2 - PENTIUM_LINE_SIZE) \n\`
			`\n\`
			`push %%ebx # Save PIC register \n\`
			`push %%edi # Save destination for return value \n\`
			`cld # MOVS* should ascend \n\`
			`\n\`
			`mov $64, %%ebx # Constant \n\`
			`\n\`
			`cmp %%ebx, %%ecx \n\`
			`jb mmx.memcpy_last # Just use movs if <64 bytes \n\`
			`\n\`
			`# First align destination address to a multiple of 8 bytes \n\`
			`mov $8, %%eax # EAX <- (8-dest) & 7 \n\`
			`sub %%edi, %%eax \n\`
			`and $7, %%eax # ... which is the number of bytes to copy\n"`
			`#ifdef ACLIB_DISABLE_X86_TEXTRELS // Because "lea 0f" requires a textrel`
			`" xchg %%eax, %%ecx \n\`
			`mov %%ecx, %%edx \n\`
			`repz movsb \n\`
			`mov %%eax, %%ecx \n\`
			`mov %%edx, %%eax \n"`
			`#else`
			`" lea 0f, %%edx # Use a computed jump--faster than a loop\n\`
			`sub %%eax, %%edx \n\`
			`jmp *%%edx # Execute 0-7 MOVSB's \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n"`
			`#endif`
			`"0: sub %%eax, %%ecx # Update count \n\`
			`\n\`
			`# Now copy data in blocks \n\`
			`0: mov %%ecx, %%edx # EDX <- ECX >> 6 (cache lines to copy) \n\`
			`shr $6, %%edx \n\`
			`jz mmx.memcpy_last # <64 bytes left? Skip to end \n\`
			`cmp $PENTIUM_CACHE_BLOCK/64, %%edx \n\`
			`jb 1f # Limit size of block \n\`
			`mov $PENTIUM_CACHE_BLOCK/64, %%edx \n\`
			`1: mov %%edx, %%eax # EAX <- EDX << 6 (bytes to copy) \n\`
			`shl $6, %%eax \n\`
			`sub %%eax, %%ecx # Update remaining count \n\`
			`add %%eax, %%esi # Point to end of region to be block-copied\n\`
			`2: test %%eax, -32(%%esi) # Touch each cache line in reverse order\n\`
			`test %%eax, -64(%%esi) \n\`
			`sub %%ebx, %%esi # Update pointer \n\`
			`sub %%ebx, %%eax # And loop \n\`
			`jnz 2b \n\`
			`# Note that ESI now points to the beginning of the block \n\`
			`3: movq (%%esi), %%mm0 # Do the actual copy, 64 bytes at a time\n\`
			`movq 8(%%esi), %%mm1 \n\`
			`movq 16(%%esi), %%mm2 \n\`
			`movq 24(%%esi), %%mm3 \n\`
			`movq 32(%%esi), %%mm4 \n\`
			`movq 40(%%esi), %%mm5 \n\`
			`movq 48(%%esi), %%mm6 \n\`
			`movq 56(%%esi), %%mm7 \n\`
			`movq %%mm0, (%%edi) \n\`
			`movq %%mm1, 8(%%edi) \n\`
			`movq %%mm2, 16(%%edi) \n\`
			`movq %%mm3, 24(%%edi) \n\`
			`movq %%mm4, 32(%%edi) \n\`
			`movq %%mm5, 40(%%edi) \n\`
			`movq %%mm6, 48(%%edi) \n\`
			`movq %%mm7, 56(%%edi) \n\`
			`add %%ebx, %%esi # Update pointers \n\`
			`add %%ebx, %%edi \n\`
			`dec %%edx # And loop \n\`
			`jnz 3b \n\`
			`jmp 0b \n\`
			`\n\`
			`mmx.memcpy_last: \n\`
			`# Copy last <64 bytes, using the computed jump trick \n\`
			`mov %%ecx, %%eax # EAX <- ECX>>2 \n\`
			`shr $2, %%eax \n"`
			`#ifdef ACLIB_DISABLE_X86_TEXTRELS`
			`" xchg %%eax, %%ecx \n\`
			`repz movsd \n\`
			`mov %%eax, %%ecx \n"`
			`#else`
			`" lea 0f, %%edx \n\`
			`sub %%eax, %%edx \n\`
			`jmp *%%edx # Execute 0-15 MOVSD's \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n"`
			`#endif`
			`"0: and $3, %%ecx # ECX <- ECX & 3 \n"`
			`#ifdef ACLIB_DISABLE_X86_TEXTRELS`
			`" repz movsb \n"`
			`#else`
			`" lea 0f, %%edx \n\`
			`sub %%ecx, %%edx \n\`
			`jmp *%%edx # Execute 0-3 MOVSB's \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n"`
			`#endif`
			`"0: \n\`
			`# All done! \n\`
			`emms # Clean up MMX state \n\`
			`pop %%edi # Restore destination (return value) \n\`
			`pop %%ebx # Restore PIC register \n\`
			`" : /* no outputs */`
			`: "D" (dest), "S" (src), "c" (bytes)`
			`: "%eax", "%edx"`
			`);`
			`return dest;`
			`}`

			`#endif /* HAVE_ASM_MMX && ARCH_X86 */`

			`/*************************************************************************/`

			`#if defined(HAVE_ASM_SSE) && defined(ARCH_X86)`

			`/* SSE-optimized routine. Backported from AMD64 routine below.`
			`* Nonstandard instructions used:`
			`* (CPUID.CMOVE) CMOVA`
			`* (CPUID.MMX) MOVQ`
			`* (CPUID.SSE) MOVNTQ`
			`*/`

			`static void *memcpy_sse(void *dest, const void *src, size_t bytes)`
			`{`
			`asm("\`
			`push %%ebx # Save PIC register \n\`
			`push %%edi # Save destination for return value \n\`
			`cld # MOVS* should ascend \n\`
			`\n\`
			`cmp $64, %%ecx # Skip block copy for small blocks \n\`
			`jb sse.memcpy_last \n\`
			`\n\`
			`mov $128, %%ebx # Constant used later \n\`
			`\n\`
			`# First align destination address to a multiple of 8 bytes \n\`
			`mov $8, %%eax # EAX <- (8-dest) & 7 \n\`
			`sub %%edi, %%eax \n\`
			`and $7, %%eax # ... which is the number of bytes to copy\n"`
			`#ifdef ACLIB_DISABLE_X86_TEXTRELS`
			`" xchg %%eax, %%ecx \n\`
			`mov %%ecx, %%edx \n\`
			`repz movsb \n\`
			`mov %%eax, %%ecx \n\`
			`mov %%edx, %%eax \n"`
			`#else`
			`" lea 0f, %%edx # Use a computed jump--faster than a loop\n\`
			`sub %%eax, %%edx \n\`
			`jmp *%%edx # Execute 0-7 MOVSB's \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n"`
			`#endif`
			`"0: sub %%eax, %%ecx # Update count \n\`
			`\n\`
			`cmp $0x10040, %%ecx # Is this a large block? (0x10040 is an \n\`
			`# arbitrary value where prefetching and \n\`
			`# write combining seem to start becoming\n\`
			`# faster) \n\`
			`jae sse.memcpy_bp # Yup, use prefetch copy \n\`
			`\n\`
			`sse.memcpy_small: # Small block copy routine--no prefetch \n"`
			`#if 0`
			`" mov %%ecx, %%edx # EDX <- bytes to copy / 8 \n\`
			`shr $3, %%edx \n\`
			`mov %%edx, %%eax # Leave remainder in ECX for later \n\`
			`shl $3, %%eax \n\`
			`sub %%eax, %%ecx \n\`
			`.balign 16 \n\`
			`0: movq (%%esi), %%mm0 # Copy 8 bytes of data \n\`
			`movq %%mm0, (%%edi) \n\`
			`add $8, %%esi # Update pointers \n\`
			`add $8, %%edi \n\`
			`dec %%edx # And loop \n\`
			`jg 0b \n\`
			`jmp sse.memcpy_last # Copy any remaining bytes \n\`
			`\n\`
			`nop # Align loops below \n"`
			`#else`
			`" # It appears that a simple rep movs is faster than cleverness \n\`
			`# with movq... \n\`
			`mov %%ecx, %%edx # EDX <- ECX & 3 \n\`
			`and $3, %%edx \n\`
			`shr $2, %%ecx # ECX <- ECX >> 2 \n\`
			`rep movsl # Copy away! \n\`
			`mov %%edx, %%ecx # Take care of last 0-3 bytes \n\`
			`rep movsb \n\`
			`jmp sse.memcpy_end # And exit \n\`
			`\n\`
			`.balign 16 \n\`
			`nop \n\`
			`nop \n"`
			`#endif`
			`"sse.memcpy_bp: # Block prefetch copy routine \n\`
			`0: mov %%ecx, %%edx # EDX: temp counter \n\`
			`shr $6, %%edx # Divide by cache line size (64 bytes) \n\`
			`cmp %%ebx, %%edx # ... and cap at 128 (8192 bytes) \n\`
			`cmova %%ebx, %%edx \n\`
			`shl $3, %%edx # EDX <- cache lines to copy * 8 \n\`
			`mov %%edx, %%eax # EAX <- cache lines to preload * 8 \n\`
			`# (also used as memory offset) \n\`
			`1: test %%eax, -64(%%esi,%%eax,8) # Preload cache lines in pairs \n\`
			`test %%eax, -128(%%esi,%%eax,8) # (going backwards) \n\`
			`# (note that test %%eax,... seems to be faster than prefetchnta \n\`
			`# on x86) \n\`
			`sub $16, %%eax # And loop \n\`
			`jg 1b \n\`
			`\n\`
			`# Then copy--forward, which seems to be faster than reverse for \n\`
			`# certain alignments \n\`
			`xor %%eax, %%eax \n\`
			`2: movq (%%esi,%%eax,8), %%mm0 # Copy 8 bytes and loop \n\`
			`movntq %%mm0, (%%edi,%%eax,8) \n\`
			`inc %%eax \n\`
			`cmp %%edx, %%eax \n\`
			`jb 2b \n\`
			`\n\`
			`# Finally, update pointers and count, and loop \n\`
			`shl $3, %%edx # EDX <- bytes copied \n\`
			`add %%edx, %%esi \n\`
			`add %%edx, %%edi \n\`
			`sub %%edx, %%ecx \n\`
			`cmp $64, %%ecx # At least one cache line left? \n\`
			`jae 0b # Yup, loop \n\`
			`\n\`
			`sse.memcpy_last: \n\`
			`# Copy last <64 bytes, using the computed jump trick \n\`
			`mov %%ecx, %%eax # EAX <- ECX>>2 \n\`
			`shr $2, %%eax \n"`
			`#ifdef ACLIB_DISABLE_X86_TEXTRELS`
			`" xchg %%eax, %%ecx \n\`
			`repz movsd \n\`
			`mov %%eax, %%ecx \n"`
			`#else`
			`" lea 0f, %%edx \n\`
			`sub %%eax, %%edx \n\`
			`jmp *%%edx # Execute 0-15 MOVSD's \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n\`
			`movsd \n"`
			`#endif`
			`"0: and $3, %%ecx # ECX <- ECX & 3 \n"`
			`#ifdef ACLIB_DISABLE_X86_TEXTRELS`
			`" repz movsb \n"`
			`#else`
			`" lea sse.memcpy_end, %%edx \n\`
			`sub %%ecx, %%edx \n\`
			`jmp *%%edx # Execute 0-3 MOVSB's \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n"`
			`#endif`
			`" \n\`
			`sse.memcpy_end: \n\`
			`# All done! \n\`
			`emms # Clean up after MMX instructions \n\`
			`sfence # Flush the write buffer \n\`
			`pop %%edi # Restore destination (return value) \n\`
			`pop %%ebx # Restore PIC register \n\`
			`" : /* no outputs */`
			`: "D" (dest), "S" (src), "c" (bytes)`
			`: "%eax", "%edx"`
			`);`
			`return dest;`
			`}`

			`#endif /* HAVE_ASM_SSE && ARCH_X86 */`

			`/*************************************************************************/`

			`#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64)`

			`/* AMD64-optimized routine, using SSE2. Derived from AMD64 optimization`
			`* guide section 5.13: Appropriate Memory Copying Routines.`
			`* Nonstandard instructions used:`
			`* (CPUID.CMOVE) CMOVA`
			`* (CPUID.SSE2) MOVDQA, MOVDQU, MOVNTDQ`
			`*`
			`* Note that this routine will also run more or less as-is (modulo register`
			`* names and label(%%rip) references) on x86 CPUs, but tests have shown the`
			`* SSE1 version above to be faster.`
			`*/`

			`/* The block copying code--macroized because we use two versions of it`
			`* depending on whether the source is 16-byte-aligned or not. Pass either`
			`* movdqa or movdqu (unquoted) for the parameter. */`
			`#define AMD64_BLOCK_MEMCPY(movdq) \`
			`" # First prefetch (note that if we end on an odd number of cache \n\`
			`# lines, we skip prefetching the last one--faster that way than \n\`
			`# prefetching line by line or treating it as a special case) \n\`
			`0: mov %%ecx, %%edx # EDX: temp counter (always <32 bits) \n\`
			`shr $6, %%edx # Divide by cache line size (64 bytes) \n\`
			`cmp %%ebx, %%edx # ... and cap at 128 (8192 bytes) \n\`
			`cmova %%ebx, %%edx \n\`
			`shl $3, %%edx # EDX <- cache lines to copy * 8 \n\`
			`mov %%edx, %%eax # EAX <- cache lines to preload * 8 \n\`
			`# (also used as memory offset) \n\`
			`1: prefetchnta -64(%%rsi,%%rax,8) # Preload cache lines in pairs \n\`
			`prefetchnta -128(%%rsi,%%rax,8) # (going backwards) \n\`
			`sub $16, %%eax # And loop \n\`
			`jg 1b \n\`
			`\n\`
			`# Then copy--forward, which seems to be faster than reverse for \n\`
			`# certain alignments \n\`
			`xor %%eax, %%eax \n\`
			`2: " #movdq " (%%rsi,%%rax,8), %%xmm0 # Copy 16 bytes and loop \n\`
			`movntdq %%xmm0, (%%rdi,%%rax,8) \n\`
			`add $2, %%eax \n\`
			`cmp %%edx, %%eax \n\`
			`jb 2b \n\`
			`\n\`
			`# Finally, update pointers and count, and loop \n\`
			`shl $3, %%edx # EDX <- bytes copied \n\`
			`add %%rdx, %%rsi \n\`
			`add %%rdx, %%rdi \n\`
			`sub %%rdx, %%rcx \n\`
			`cmp $64, %%rcx # At least one cache line left? \n\`
			`jae 0b # Yup, loop \n"`

			`static void *memcpy_amd64(void *dest, const void *src, size_t bytes)`
			`{`
			`asm("\`
			`push %%rdi # Save destination for return value \n\`
			`cld # MOVS* should ascend \n\`
			`\n\`
			`cmp $64, %%rcx # Skip block copy for small blocks \n\`
			`jb amd64.memcpy_last \n\`
			`\n\`
			`mov $128, %%ebx # Constant used later \n\`
			`\n\`
			`# First align destination address to a multiple of 16 bytes \n\`
			`mov $8, %%eax # EAX <- (8-dest) & 7 \n\`
			`sub %%edi, %%eax # (we don't care about the top 32 bits) \n\`
			`and $7, %%eax # ... which is the number of bytes to copy\n\`
			`lea 0f(%%rip), %%rdx # Use a computed jump--faster than a loop\n\`
			`sub %%rax, %%rdx \n\`
			`jmp *%%rdx # Execute 0-7 MOVSB's \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`0: sub %%rax, %%rcx # Update count \n\`
			`test $8, %%edi # Is destination not 16-byte aligned? \n\`
			`je 1f \n\`
			`movsq # Then move 8 bytes to align it \n\`
			`sub $8, %%rcx \n\`
			`\n\`
			`1: cmp $0x38000, %%rcx # Is this a large block? (0x38000 is an \n\`
			`# arbitrary value where prefetching and \n\`
			`# write combining seem to start becoming\n\`
			`# faster) \n\`
			`jb amd64.memcpy_small # Nope, use small copy (no prefetch/WC) \n\`
			`test $15, %%esi # Is source also 16-byte aligned? \n\`
			`# (use ESI to save a REX prefix byte) \n\`
			`jnz amd64.memcpy_normal_bp # Nope, use slow copy \n\`
			`jmp amd64.memcpy_fast_bp # Yup, use fast copy \n\`
			`\n\`
			`amd64.memcpy_small: # Small block copy routine--no prefetch \n\`
			`mov %%ecx, %%edx # EDX <- bytes to copy / 16 \n\`
			`shr $4, %%edx # (count known to fit in 32 bits) \n\`
			`mov %%edx, %%eax # Leave remainder in ECX for later \n\`
			`shl $4, %%eax \n\`
			`sub %%eax, %%ecx \n\`
			`.balign 16 \n\`
			`0: movdqu (%%rsi), %%xmm0 # Copy 16 bytes of data \n\`
			`movdqa %%xmm0, (%%rdi) \n\`
			`add $16, %%rsi # Update pointers \n\`
			`add $16, %%rdi \n\`
			`dec %%edx # And loop \n\`
			`jnz 0b \n\`
			`jmp amd64.memcpy_last # Copy any remaining bytes \n\`
			`\n\`
			`.balign 16 \n\`
			`nop \n\`
			`nop \n\`
			`amd64.memcpy_fast_bp: # Fast block prefetch loop \n"`
			`AMD64_BLOCK_MEMCPY(movdqa)`
			`" jmp amd64.memcpy_last # Copy any remaining bytes \n\`
			`\n\`
			`.balign 16 \n\`
			`nop \n\`
			`nop \n\`
			`amd64.memcpy_normal_bp: # Normal (unaligned) block prefetch loop\n"`
			`AMD64_BLOCK_MEMCPY(movdqu)`
			`" \n\`
			`amd64.memcpy_last: \n\`
			`# Copy last <64 bytes, using the computed jump trick \n\`
			`mov %%ecx, %%eax # EAX <- ECX>>3 \n\`
			`shr $3, %%eax \n\`
			`lea 0f(%%rip), %%rdx \n\`
			`add %%eax, %%eax # Watch out, MOVSQ is 2 bytes! \n\`
			`sub %%rax, %%rdx \n\`
			`jmp *%%rdx # Execute 0-7 MOVSQ's \n\`
			`movsq \n\`
			`movsq \n\`
			`movsq \n\`
			`movsq \n\`
			`movsq \n\`
			`movsq \n\`
			`movsq \n\`
			`0: and $7, %%ecx # ECX <- ECX & 7 \n\`
			`lea 0f(%%rip), %%rdx \n\`
			`sub %%rcx, %%rdx \n\`
			`jmp *%%rdx # Execute 0-7 MOVSB's \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`movsb \n\`
			`0: \n\`
			`# All done! \n\`
			`emms # Clean up after MMX instructions \n\`
			`sfence # Flush the write buffer \n\`
			`pop %%rdi # Restore destination (return value) \n\`
			`" : /* no outputs */`
			`: "D" (dest), "S" (src), "c" (bytes)`
			`: "%rax", "%rbx", "%rdx"`
			`);`
			`return dest;`
			`}`

			`#endif /* HAVE_ASM_SSE2 && ARCH_X86_64 */`

			`/*************************************************************************/`

			`/* Initialization routine. */`

			`int ac_memcpy_init(int accel)`
			`{`
			`memcpy_ptr = memmove;`

			`#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)`
			`if (HAS_ACCEL(accel, AC_MMX))`
			`memcpy_ptr = memcpy_mmx;`
			`#endif`

			`#if defined(HAVE_ASM_SSE) && defined(ARCH_X86)`
			`if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE))`
			`memcpy_ptr = memcpy_sse;`
			`#endif`

			`#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64)`
			`if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE2))`
			`memcpy_ptr = memcpy_amd64;`
			`#endif`

			`return 1;`
			`}`

			`/*************************************************************************/`

			`/*`
			`* Local variables:`
			`* c-file-style: "stroustrup"`
			`* c-file-offsets: ((case-label . *) (statement-case-intro . *))`
			`* indent-tabs-mode: nil`
			`* End:`
			`*`
			`* vim: expandtab shiftwidth=4:`
			`*/`