/*
 * tcmemcpy.c - optimized memcpy() routines for transcode
 * Written by Andrew Church <achurch@achurch.org>
 */

#include <string.h>
#include <stdio.h>
#include "ac.h"

/*************************************************************************/

#if defined(ARCH_X86)

/* MMX-optimized routine, intended for PMMX/PII processors.
 * Nonstandard instructions used:
 *     (CPUID.MMX) MOVQ
 */

void *ac_memcpy_mmx(void *dest, const void *src, size_t bytes)
{
    asm("\
PENTIUM_LINE_SIZE = 32          # PMMX/PII cache line size \n\
PENTIUM_CACHE_SIZE = 8192       # PMMX/PII total cache size \n\
# Use only half because writes may touch the cache too (PII) \n\
PENTIUM_CACHE_BLOCK = (PENTIUM_CACHE_SIZE/2 - PENTIUM_LINE_SIZE) \n\
        \n\
        push %%ebx              # Save PIC register \n\
        push %%edi              # Save destination for return value \n\
        cld                     # MOVS* should ascend \n\
        \n\
        mov $64, %%ebx          # Constant \n\
        \n\
        cmp %%ebx, %%ecx \n\
        jb mmx.memcpy_last      # Just use movs if <64 bytes \n\
        \n\
        # First align destination address to a multiple of 8 bytes \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7 \n\
        sub %%edi, %%eax \n\
        and $0b111, %%eax       # ... which is the number of bytes to copy \n\
        lea 0f, %%edx           # Use a computed jump--faster than a loop \n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
0:      sub %%eax, %%ecx        # Update count \n\
        \n\
        # Now copy data in blocks \n\
0:      mov %%ecx, %%edx        # EDX <- ECX >> 6 (cache lines to copy) \n\
        shr $6, %%edx \n\
        jz mmx.memcpy_last      # <64 bytes left? Skip to end \n\
        cmp $PENTIUM_CACHE_BLOCK/64, %%edx \n\
        jb 1f                   # Limit size of block \n\
        mov $PENTIUM_CACHE_BLOCK/64, %%edx \n\
1:      mov %%edx, %%eax        # EAX <- EDX << 6 (bytes to copy) \n\
        shl $6, %%eax \n\
        sub %%eax, %%ecx        # Update remaining count \n\
        add %%eax, %%esi        # Point to end of region to be block-copied \n\
2:      test %%eax, -32(%%esi)  # Touch each cache line in reverse order \n\
        test %%eax, -64(%%esi) \n\
        sub %%ebx, %%esi        # Update pointer \n\
        sub %%ebx, %%eax        # And loop \n\
        jnz 2b \n\
        # Note that ESI now points to the beginning of the block \n\
3:      movq (%%esi), %%mm0     # Do the actual copy, 64 bytes at a time \n\
        movq 8(%%esi), %%mm1 \n\
        movq 16(%%esi), %%mm2 \n\
        movq 24(%%esi), %%mm3 \n\
        movq 32(%%esi), %%mm4 \n\
        movq 40(%%esi), %%mm5 \n\
        movq 48(%%esi), %%mm6 \n\
        movq 56(%%esi), %%mm7 \n\
        movq %%mm0, (%%edi) \n\
        movq %%mm1, 8(%%edi) \n\
        movq %%mm2, 16(%%edi) \n\
        movq %%mm3, 24(%%edi) \n\
        movq %%mm4, 32(%%edi) \n\
        movq %%mm5, 40(%%edi) \n\
        movq %%mm6, 48(%%edi) \n\
        movq %%mm7, 56(%%edi) \n\
        add %%ebx, %%esi        # Update pointers \n\
        add %%ebx, %%edi \n\
        dec %%edx               # And loop \n\
        jnz 3b \n\
        jmp 0b \n\
        \n\
mmx.memcpy_last: \n\
        # Copy last <64 bytes, using the computed jump trick \n\
        mov %%ecx, %%eax        # EAX <- ECX>>2 \n\
        shr $2, %%eax \n\
        lea 0f, %%edx \n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-15 MOVSD's \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
0:      and $0b11, %%ecx        # ECX <- ECX & 3 \n\
        lea 0f, %%edx \n\
        sub %%ecx, %%edx \n\
        jmp *%%edx              # Execute 0-3 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
0:      \n\
        # All done! \n\
        emms                    # Clean up MMX state \n\
        pop %%edi               # Restore destination (return value) \n\
        pop %%ebx               # Restore PIC register \n\
    " : /* no outputs */
      : "D" (dest), "S" (src), "c" (bytes)
      : "%eax", "%edx"
    );
    return dest;
}

#endif /* ARCH_X86 */
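
/* Illustrative sketch (not part of the original transcode sources; compiled
 * out): a plain C equivalent of the alignment prologue used by the routines
 * in this file.  It copies the (8 - dest) & 7 leading bytes so that the
 * destination becomes 8-byte aligned before the block copy starts; the asm
 * performs the same 0-7 byte moves with a computed jump into a run of MOVSB
 * instructions instead of a loop.  The function and variable names below are
 * hypothetical. */
#if 0
#include <stdint.h>

static size_t align_dest_to_8(unsigned char **dst, const unsigned char **src,
                              size_t bytes)
{
    size_t head = (8 - (uintptr_t)*dst) & 7;    /* 0-7 leading bytes */
    size_t i;

    if (head > bytes)       /* the asm skips this path entirely for <64 bytes */
        head = bytes;
    for (i = 0; i < head; i++)                  /* stands in for 0-7 MOVSBs */
        *(*dst)++ = *(*src)++;
    return bytes - head;                        /* bytes still left to copy */
}
#endif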

/*************************************************************************/

#if defined(ARCH_X86)

/* SSE-optimized routine. Backported from AMD64 routine below.
 * Nonstandard instructions used:
 *     (CPUID.CMOVE) CMOVA
 *     (CPUID.MMX) MOVQ
 *     (CPUID.SSE) MOVNTQ
 */

void *ac_memcpy_sse(void *dest, const void *src, size_t bytes)
{
    asm("\
        push %%ebx              # Save PIC register \n\
        push %%edi              # Save destination for return value \n\
        cld                     # MOVS* should ascend \n\
        \n\
        cmp $64, %%ecx          # Skip block copy for small blocks \n\
        jb sse.memcpy_last \n\
        \n\
        mov $128, %%ebx         # Constant used later \n\
        \n\
        # First align destination address to a multiple of 8 bytes \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7 \n\
        sub %%edi, %%eax \n\
        and $0b111, %%eax       # ... which is the number of bytes to copy \n\
        lea 0f, %%edx           # Use a computed jump--faster than a loop \n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
0:      sub %%eax, %%ecx        # Update count \n\
        \n\
        cmp $0x10040, %%ecx     # Is this a large block? (0x10040 is an \n\
                                # arbitrary value where prefetching and \n\
                                # write combining seem to start becoming \n\
                                # faster) \n\
        jae sse.memcpy_bp       # Yup, use prefetch copy \n\
        \n\
sse.memcpy_small:               # Small block copy routine--no prefetch \n"
#if 0
"       mov %%ecx, %%edx        # EDX <- bytes to copy / 8 \n\
        shr $3, %%edx \n\
        mov %%edx, %%eax        # Leave remainder in ECX for later \n\
        shl $3, %%eax \n\
        sub %%eax, %%ecx \n\
        .align 16 \n\
0:      movq (%%esi), %%mm0     # Copy 8 bytes of data \n\
        movq %%mm0, (%%edi) \n\
        add $8, %%esi           # Update pointers \n\
        add $8, %%edi \n\
        dec %%edx               # And loop \n\
        jg 0b \n\
        jmp sse.memcpy_last     # Copy any remaining bytes \n\
        \n\
        nop                     # Align loops below \n"
#else
"       # It appears that a simple rep movs is faster than cleverness \n\
        # with movq... \n\
        mov %%ecx, %%edx        # EDX <- ECX & 3 \n\
        and $0b11, %%edx \n\
        shr $2, %%ecx           # ECX <- ECX >> 2 \n\
        rep movsl               # Copy away! \n\
        mov %%edx, %%ecx        # Take care of last 0-3 bytes \n\
        rep movsb \n\
        jmp sse.memcpy_end      # And exit \n\
        \n\
        .align 16 \n\
        nop \n\
        nop \n"
#endif
"sse.memcpy_bp:                 # Block prefetch copy routine \n\
0:      mov %%ecx, %%edx        # EDX: temp counter \n\
        shr $6, %%edx           # Divide by cache line size (64 bytes) \n\
        cmp %%ebx, %%edx        # ... and cap at 128 (8192 bytes) \n\
        cmova %%ebx, %%edx \n\
        shl $3, %%edx           # EDX <- cache lines to copy * 8 \n\
        mov %%edx, %%eax        # EAX <- cache lines to preload * 8 \n\
                                # (also used as memory offset) \n\
1:      test %%eax, -64(%%esi,%%eax,8)   # Preload cache lines in pairs \n\
        test %%eax, -128(%%esi,%%eax,8)  # (going backwards) \n\
        # (note that test %%eax,... seems to be faster than prefetchnta \n\
        # on x86) \n\
        sub $16, %%eax          # And loop \n\
        jg 1b \n\
        \n\
        # Then copy--forward, which seems to be faster than reverse for \n\
        # certain alignments \n\
        xor %%eax, %%eax \n\
2:      movq (%%esi,%%eax,8), %%mm0      # Copy 8 bytes and loop \n\
        movntq %%mm0, (%%edi,%%eax,8) \n\
        inc %%eax \n\
        cmp %%edx, %%eax \n\
        jb 2b \n\
        \n\
        # Finally, update pointers and count, and loop \n\
        shl $3, %%edx           # EDX <- bytes copied \n\
        add %%edx, %%esi \n\
        add %%edx, %%edi \n\
        sub %%edx, %%ecx \n\
        cmp $64, %%ecx          # At least one cache line left? \n\
        jae 0b                  # Yup, loop \n\
        \n\
sse.memcpy_last: \n\
        # Copy last <64 bytes, using the computed jump trick \n\
        mov %%ecx, %%eax        # EAX <- ECX>>2 \n\
        shr $2, %%eax \n\
        lea 0f, %%edx \n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-15 MOVSD's \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
0:      and $0b11, %%ecx        # ECX <- ECX & 3 \n\
        lea sse.memcpy_end, %%edx \n\
        sub %%ecx, %%edx \n\
        jmp *%%edx              # Execute 0-3 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        \n\
sse.memcpy_end: \n\
        # All done! \n\
        emms                    # Clean up after MMX instructions \n\
        sfence                  # Flush the write buffer \n\
        pop %%edi               # Restore destination (return value) \n\
        pop %%ebx               # Restore PIC register \n\
    " : /* no outputs */
      : "D" (dest), "S" (src), "c" (bytes)
      : "%eax", "%edx"
    );
    return dest;
}

#endif /* ARCH_X86 */
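
/* Illustrative sketch (not part of the original transcode sources; compiled
 * out): the same two-phase block copy idea used by the SSE/AMD64 routines,
 * expressed with SSE2 intrinsics instead of inline asm.  Phase 1 touches one
 * byte per cache line of a bounded block to pull it into cache; phase 2
 * streams the block to the destination with non-temporal stores so the copy
 * does not evict useful data.  Alignment handling is simplified: dest and
 * src are assumed 16-byte aligned.  All names below are hypothetical. */
#if 0
#include <emmintrin.h>  /* SSE2 intrinsics: _mm_load_si128, _mm_stream_si128 */

static void block_copy_sse2_sketch(void *dest, const void *src, size_t bytes)
{
    const char *s = src;
    char *d = dest;
    size_t i;

    while (bytes >= 64) {
        /* Cap each block at 8192 bytes, rounded down to whole cache lines. */
        size_t block = bytes > 8192 ? 8192 : (bytes & ~(size_t)63);

        /* Phase 1: read one byte per 64-byte cache line, back to front. */
        for (i = block; i >= 64; i -= 64) {
            volatile char preload = s[i - 64];
            (void)preload;
        }

        /* Phase 2: copy 16 bytes at a time with non-temporal stores. */
        for (i = 0; i < block; i += 16) {
            __m128i v = _mm_load_si128((const __m128i *)(s + i));
            _mm_stream_si128((__m128i *)(d + i), v);
        }

        s += block;
        d += block;
        bytes -= block;
    }

    _mm_sfence();           /* drain the write-combining buffers */
    memcpy(d, s, bytes);    /* tail of <64 bytes */
}
#endif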

/*************************************************************************/

#if defined(ARCH_X86_64)

/* AMD64-optimized routine, using SSE2. Derived from AMD64 optimization
 * guide section 5.13: Appropriate Memory Copying Routines.
 * Nonstandard instructions used:
 *     (CPUID.CMOVE) CMOVA
 *     (CPUID.SSE2) MOVDQA, MOVDQU, MOVNTDQ
 *
 * Note that this routine will also run more or less as-is (modulo register
 * names and label(%%rip) references) on x86 CPUs, but tests have shown the
 * SSE1 version above to be faster.
 */

/* The block copying code--macroized because we use two versions of it
 * depending on whether the source is 16-byte-aligned or not. Pass either
 * movdqa or movdqu (unquoted) for the parameter. */
#define AMD64_BLOCK_MEMCPY(movdq) \
"       # First prefetch (note that if we end on an odd number of cache \n\
        # lines, we skip prefetching the last one--faster that way than \n\
        # prefetching line by line or treating it as a special case) \n\
0:      mov %%ecx, %%edx        # EDX: temp counter (always <32 bits) \n\
        shr $6, %%edx           # Divide by cache line size (64 bytes) \n\
        cmp %%ebx, %%edx        # ... and cap at 128 (8192 bytes) \n\
        cmova %%ebx, %%edx \n\
        shl $3, %%edx           # EDX <- cache lines to copy * 8 \n\
        mov %%edx, %%eax        # EAX <- cache lines to preload * 8 \n\
                                # (also used as memory offset) \n\
1:      prefetchnta -64(%%rsi,%%rax,8)   # Preload cache lines in pairs \n\
        prefetchnta -128(%%rsi,%%rax,8)  # (going backwards) \n\
        sub $16, %%eax          # And loop \n\
        jg 1b \n\
        \n\
        # Then copy--forward, which seems to be faster than reverse for \n\
        # certain alignments \n\
        xor %%eax, %%eax \n\
2:      " #movdq " (%%rsi,%%rax,8), %%xmm0   # Copy 16 bytes and loop \n\
        movntdq %%xmm0, (%%rdi,%%rax,8) \n\
        add $2, %%eax \n\
        cmp %%edx, %%eax \n\
        jb 2b \n\
        \n\
        # Finally, update pointers and count, and loop \n\
        shl $3, %%edx           # EDX <- bytes copied \n\
        add %%rdx, %%rsi \n\
        add %%rdx, %%rdi \n\
        sub %%rdx, %%rcx \n\
        cmp $64, %%rcx          # At least one cache line left? \n\
        jae 0b                  # Yup, loop \n"

void *ac_memcpy_amd64(void *dest, const void *src, size_t bytes)
{
    asm("\
        push %%rdi              # Save destination for return value \n\
        cld                     # MOVS* should ascend \n\
        \n\
        cmp $64, %%rcx          # Skip block copy for small blocks \n\
        jb amd64.memcpy_last \n\
        \n\
        mov $128, %%ebx         # Constant used later \n\
        \n\
        # First align destination address to a multiple of 16 bytes \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7 \n\
        sub %%edi, %%eax        # (we don't care about the top 32 bits) \n\
        and $0b111, %%eax       # ... which is the number of bytes to copy \n\
        lea 0f(%%rip), %%rdx    # Use a computed jump--faster than a loop \n\
        sub %%rax, %%rdx \n\
        jmp *%%rdx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
0:      sub %%rax, %%rcx        # Update count \n\
        test $0b1000, %%edi     # Is destination not 16-byte aligned? \n\
        je 1f \n\
        movsq                   # Then move 8 bytes to align it \n\
        sub $8, %%rcx \n\
        \n\
1:      cmp $0x38000, %%rcx     # Is this a large block? (0x38000 is an \n\
                                # arbitrary value where prefetching and \n\
                                # write combining seem to start becoming \n\
                                # faster) \n\
        jb amd64.memcpy_small   # Nope, use small copy (no prefetch/WC) \n\
        test $0b1111, %%esi     # Is source also 16-byte aligned? \n\
                                # (use ESI to save a REX prefix byte) \n\
        jnz amd64.memcpy_normal_bp   # Nope, use slow copy \n\
        jmp amd64.memcpy_fast_bp     # Yup, use fast copy \n\
        \n\
amd64.memcpy_small:             # Small block copy routine--no prefetch \n\
        mov %%ecx, %%edx        # EDX <- bytes to copy / 16 \n\
        shr $4, %%edx           # (count known to fit in 32 bits) \n\
        mov %%edx, %%eax        # Leave remainder in ECX for later \n\
        shl $4, %%eax \n\
        sub %%eax, %%ecx \n\
        .align 16 \n\
0:      movdqu (%%rsi), %%xmm0  # Copy 16 bytes of data \n\
        movdqa %%xmm0, (%%rdi) \n\
        add $16, %%rsi          # Update pointers \n\
        add $16, %%rdi \n\
        dec %%edx               # And loop \n\
        jnz 0b \n\
        jmp amd64.memcpy_last   # Copy any remaining bytes \n\
        \n\
        .align 16 \n\
        nop \n\
        nop \n\
amd64.memcpy_fast_bp:           # Fast block prefetch loop \n"
AMD64_BLOCK_MEMCPY(movdqa)
"       jmp amd64.memcpy_last   # Copy any remaining bytes \n\
        \n\
        .align 16 \n\
        nop \n\
        nop \n\
amd64.memcpy_normal_bp:         # Normal (unaligned) block prefetch loop \n"
AMD64_BLOCK_MEMCPY(movdqu)
"       \n\
amd64.memcpy_last: \n\
        # Copy last <64 bytes, using the computed jump trick \n\
        mov %%ecx, %%eax        # EAX <- ECX>>3 \n\
        shr $3, %%eax \n\
        lea 0f(%%rip), %%rdx \n\
        add %%eax, %%eax        # Watch out, MOVSQ is 2 bytes! \n\
        sub %%rax, %%rdx \n\
        jmp *%%rdx              # Execute 0-7 MOVSQ's \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
0:      and $0b111, %%ecx       # ECX <- ECX & 7 \n\
        lea 0f(%%rip), %%rdx \n\
        sub %%rcx, %%rdx \n\
        jmp *%%rdx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
0:      \n\
        # All done! \n\
        emms                    # Clean up after MMX instructions \n\
        sfence                  # Flush the write buffer \n\
        pop %%rdi               # Restore destination (return value) \n\
    " : /* no outputs */
      : "D" (dest), "S" (src), "c" (bytes)
      : "%rax", "%rbx", "%rdx"
    );
    return dest;
}

#endif /* ARCH_X86_64 */

/*************************************************************************/

void * (*tc_memcpy)(void *, const void *, size_t) = memcpy;

void tc_memcpy_init(int verbose, int mmflags)
{
    const char *method = "libc";

#if defined(ARCH_X86) || defined(ARCH_X86_64)
    int accel = mmflags == -1 ? ac_mmflag() : mmflags;
#endif

#if defined(ARCH_X86)
    if((accel & MM_CMOVE) && (accel & MM_SSE))
    {
        method = "sse";
        tc_memcpy = ac_memcpy_sse;
    }
    else if(accel & MM_MMX)
    {
        method = "mmx";
        tc_memcpy = ac_memcpy_mmx;
    }
#endif

#if defined(ARCH_X86_64)
    if((accel & MM_CMOVE) && (accel & MM_SSE2))
    {
        method = "amd64";
        tc_memcpy = ac_memcpy_amd64;
    }
#endif

    if(verbose)
        fprintf(stderr, "tc_memcpy: using %s for memcpy\n", method);
}
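
/* Usage sketch (not part of the original transcode sources; compiled out):
 * callers are expected to run tc_memcpy_init() once at startup and then use
 * tc_memcpy exactly like memcpy.  The helper functions below are
 * hypothetical. */
#if 0
static void init_example(void)
{
    tc_memcpy_init(1 /* verbose */, -1 /* -1 = autodetect via ac_mmflag() */);
}

static void copy_frame_example(void *dst, const void *src, size_t frame_bytes)
{
    tc_memcpy(dst, src, frame_bytes);   /* dispatches to sse/mmx/amd64/libc */
}
#endif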