/*
 * tcmemcpy.c - optimized memcpy() routines for transcode
 * Written by Andrew Church
 */

#include <stdio.h>
#include <string.h>
#include "ac.h"

/*************************************************************************/

#if defined(ARCH_X86)

/* MMX-optimized routine, intended for PMMX/PII processors.
 * Nonstandard instructions used:
 *     (CPUID.MMX)   MOVQ
 */

void *ac_memcpy_mmx(void *dest, const void *src, size_t bytes)
{
    asm("\
PENTIUM_LINE_SIZE = 32          # PMMX/PII cache line size \n\
PENTIUM_CACHE_SIZE = 8192       # PMMX/PII total cache size \n\
# Use only half because writes may touch the cache too (PII) \n\
PENTIUM_CACHE_BLOCK = (PENTIUM_CACHE_SIZE/2 - PENTIUM_LINE_SIZE) \n\
\n\
        push %%ebx              # Save PIC register \n\
        push %%edi              # Save destination for return value \n\
        cld                     # MOVS* should ascend \n\
\n\
        mov $64, %%ebx          # Constant \n\
\n\
        cmp %%ebx, %%ecx \n\
        jb mmx.memcpy_last      # Just use movs if <64 bytes \n\
\n\
        # First align destination address to a multiple of 8 bytes \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7 \n\
        sub %%edi, %%eax \n\
        and $0b111, %%eax       # ... which is the number of bytes to copy \n\
        lea 0f, %%edx           # Use a computed jump--faster than a loop \n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
0:      sub %%eax, %%ecx        # Update count \n\
\n\
        # Now copy data in blocks \n\
0:      mov %%ecx, %%edx        # EDX <- ECX >> 6 (cache lines to copy) \n\
        shr $6, %%edx \n\
        jz mmx.memcpy_last      # <64 bytes left? Skip to end \n\
        cmp $PENTIUM_CACHE_BLOCK/64, %%edx \n\
        jb 1f                   # Limit size of block \n\
        mov $PENTIUM_CACHE_BLOCK/64, %%edx \n\
1:      mov %%edx, %%eax        # EAX <- EDX << 6 (bytes to copy) \n\
        shl $6, %%eax \n\
        sub %%eax, %%ecx        # Update remaining count \n\
        add %%eax, %%esi        # Point to end of region to be block-copied \n\
2:      test %%eax, -32(%%esi)  # Touch each cache line in reverse order \n\
        test %%eax, -64(%%esi) \n\
        sub %%ebx, %%esi        # Update pointer \n\
        sub %%ebx, %%eax        # And loop \n\
        jnz 2b \n\
        # Note that ESI now points to the beginning of the block \n\
3:      movq (%%esi), %%mm0     # Do the actual copy, 64 bytes at a time \n\
        movq 8(%%esi), %%mm1 \n\
        movq 16(%%esi), %%mm2 \n\
        movq 24(%%esi), %%mm3 \n\
        movq 32(%%esi), %%mm4 \n\
        movq 40(%%esi), %%mm5 \n\
        movq 48(%%esi), %%mm6 \n\
        movq 56(%%esi), %%mm7 \n\
        movq %%mm0, (%%edi) \n\
        movq %%mm1, 8(%%edi) \n\
        movq %%mm2, 16(%%edi) \n\
        movq %%mm3, 24(%%edi) \n\
        movq %%mm4, 32(%%edi) \n\
        movq %%mm5, 40(%%edi) \n\
        movq %%mm6, 48(%%edi) \n\
        movq %%mm7, 56(%%edi) \n\
        add %%ebx, %%esi        # Update pointers \n\
        add %%ebx, %%edi \n\
        dec %%edx               # And loop \n\
        jnz 3b \n\
        jmp 0b \n\
\n\
mmx.memcpy_last: \n\
        # Copy last <64 bytes, using the computed jump trick \n\
        mov %%ecx, %%eax        # EAX <- ECX>>2 \n\
        shr $2, %%eax \n\
        lea 0f, %%edx \n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-15 MOVSD's \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
0:      and $0b11, %%ecx        # ECX <- ECX & 3 \n\
        lea 0f, %%edx \n\
        sub %%ecx, %%edx \n\
        jmp *%%edx              # Execute 0-3 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
0: \n\
        # All done! \n\
        emms                    # Clean up MMX state \n\
        pop %%edi               # Restore destination (return value) \n\
        pop %%ebx               # Restore PIC register \n\
    " : /* no outputs */
      : "D" (dest), "S" (src), "c" (bytes)
      : "%eax", "%edx"
    );
    return dest;
}

#endif  /* ARCH_X86 */

/*************************************************************************/
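/* Aside (illustration only, compiled out): the "computed jump" used above
 * for the 0-7 byte alignment copy and the <64-byte tail is the assembly
 * analogue of a fall-through switch (Duff's device).  The function below is
 * a hypothetical C sketch of the same idea for a 0-7 byte tail; it is not
 * used by the routines in this file. */
#if 0
static void copy_tail_0_7(unsigned char *d, const unsigned char *s, size_t n)
{
    switch (n) {                    /* n is assumed to be in the range 0..7 */
        case 7: *d++ = *s++;        /* fall through */
        case 6: *d++ = *s++;        /* fall through */
        case 5: *d++ = *s++;        /* fall through */
        case 4: *d++ = *s++;        /* fall through */
        case 3: *d++ = *s++;        /* fall through */
        case 2: *d++ = *s++;        /* fall through */
        case 1: *d++ = *s++;        /* fall through */
        case 0: break;
    }
}
#endif

/*************************************************************************/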
#if defined(ARCH_X86)

/* SSE-optimized routine.  Backported from AMD64 routine below.
 * Nonstandard instructions used:
 *     (CPUID.CMOVE) CMOVA
 *     (CPUID.MMX)   MOVQ
 *     (CPUID.SSE)   MOVNTQ
 */

void *ac_memcpy_sse(void *dest, const void *src, size_t bytes)
{
    asm("\
        push %%ebx              # Save PIC register \n\
        push %%edi              # Save destination for return value \n\
        cld                     # MOVS* should ascend \n\
\n\
        cmp $64, %%ecx          # Skip block copy for small blocks \n\
        jb sse.memcpy_last \n\
\n\
        mov $128, %%ebx         # Constant used later \n\
\n\
        # First align destination address to a multiple of 8 bytes \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7 \n\
        sub %%edi, %%eax \n\
        and $0b111, %%eax       # ... which is the number of bytes to copy \n\
        lea 0f, %%edx           # Use a computed jump--faster than a loop \n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
0:      sub %%eax, %%ecx        # Update count \n\
\n\
        cmp $0x10040, %%ecx     # Is this a large block? (0x10040 is an \n\
                                #    arbitrary value where prefetching and \n\
                                #    write combining seem to start becoming \n\
                                #    faster) \n\
        jae sse.memcpy_bp       # Yup, use prefetch copy \n\
\n\
sse.memcpy_small:               # Small block copy routine--no prefetch \n"
#if 0
"       mov %%ecx, %%edx        # EDX <- bytes to copy / 8 \n\
        shr $3, %%edx \n\
        mov %%edx, %%eax        # Leave remainder in ECX for later \n\
        shl $3, %%eax \n\
        sub %%eax, %%ecx \n\
        .align 16 \n\
0:      movq (%%esi), %%mm0     # Copy 8 bytes of data \n\
        movq %%mm0, (%%edi) \n\
        add $8, %%esi           # Update pointers \n\
        add $8, %%edi \n\
        dec %%edx               # And loop \n\
        jg 0b \n\
        jmp sse.memcpy_last     # Copy any remaining bytes \n\
\n\
        nop                     # Align loops below \n"
#else
"       # It appears that a simple rep movs is faster than cleverness \n\
        # with movq... \n\
        mov %%ecx, %%edx        # EDX <- ECX & 3 \n\
        and $0b11, %%edx \n\
        shr $2, %%ecx           # ECX <- ECX >> 2 \n\
        rep movsl               # Copy away! \n\
        mov %%edx, %%ecx        # Take care of last 0-3 bytes \n\
        rep movsb \n\
        jmp sse.memcpy_end      # And exit \n\
\n\
        .align 16 \n\
        nop \n\
        nop \n"
#endif
"sse.memcpy_bp:                 # Block prefetch copy routine \n\
0:      mov %%ecx, %%edx        # EDX: temp counter \n\
        shr $6, %%edx           # Divide by cache line size (64 bytes) \n\
        cmp %%ebx, %%edx        # ... and cap at 128 (8192 bytes) \n\
        cmova %%ebx, %%edx \n\
        shl $3, %%edx           # EDX <- cache lines to copy * 8 \n\
        mov %%edx, %%eax        # EAX <- cache lines to preload * 8 \n\
                                #    (also used as memory offset) \n\
1:      test %%eax, -64(%%esi,%%eax,8)   # Preload cache lines in pairs \n\
        test %%eax, -128(%%esi,%%eax,8)  # (going backwards) \n\
        # (note that test %%eax,... seems to be faster than prefetchnta \n\
        #  on x86) \n\
        sub $16, %%eax          # And loop \n\
        jg 1b \n\
\n\
        # Then copy--forward, which seems to be faster than reverse for \n\
        # certain alignments \n\
        xor %%eax, %%eax \n\
2:      movq (%%esi,%%eax,8), %%mm0   # Copy 8 bytes and loop \n\
        movntq %%mm0, (%%edi,%%eax,8) \n\
        inc %%eax \n\
        cmp %%edx, %%eax \n\
        jb 2b \n\
\n\
        # Finally, update pointers and count, and loop \n\
        shl $3, %%edx           # EDX <- bytes copied \n\
        add %%edx, %%esi \n\
        add %%edx, %%edi \n\
        sub %%edx, %%ecx \n\
        cmp $64, %%ecx          # At least one cache line left? \n\
        jae 0b                  # Yup, loop \n\
\n\
sse.memcpy_last: \n\
        # Copy last <64 bytes, using the computed jump trick \n\
        mov %%ecx, %%eax        # EAX <- ECX>>2 \n\
        shr $2, %%eax \n\
        lea 0f, %%edx \n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-15 MOVSD's \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
0:      and $0b11, %%ecx        # ECX <- ECX & 3 \n\
        lea sse.memcpy_end, %%edx \n\
        sub %%ecx, %%edx \n\
        jmp *%%edx              # Execute 0-3 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
\n\
sse.memcpy_end: \n\
        # All done! \n\
        emms                    # Clean up after MMX instructions \n\
        sfence                  # Flush the write buffer \n\
        pop %%edi               # Restore destination (return value) \n\
        pop %%ebx               # Restore PIC register \n\
    " : /* no outputs */
      : "D" (dest), "S" (src), "c" (bytes)
      : "%eax", "%edx"
    );
    return dest;
}

#endif  /* ARCH_X86 */

/*************************************************************************/
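/* Aside (illustration only, compiled out): the block prefetch copies above
 * and below combine prefetching with non-temporal ("write combining")
 * stores so that large copies do not thrash the cache.  The function below
 * is a simplified, hypothetical intrinsics sketch of that combination--it
 * prefetches a fixed distance ahead rather than preloading a whole block in
 * reverse as the asm does, and it assumes 16-byte-aligned pointers and a
 * byte count that is a multiple of 64, as the asm routines arrange before
 * entering their block loops. */
#if 0
#include <emmintrin.h>          /* SSE2 intrinsics */

static void stream_copy_sse2(void *dest, const void *src, size_t bytes)
{
    unsigned char *d = (unsigned char *)dest;
    const unsigned char *s = (const unsigned char *)src;
    size_t i;

    for (i = 0; i < bytes; i += 64) {
        /* Hint an upcoming cache line into the cache without polluting
         * the other levels (the asm uses "test" loads or prefetchnta
         * for the same purpose) */
        _mm_prefetch((const char *)(s + i + 256), _MM_HINT_NTA);
        /* Copy one 64-byte cache line with non-temporal stores */
        _mm_stream_si128((__m128i *)(d + i),
                         _mm_load_si128((const __m128i *)(s + i)));
        _mm_stream_si128((__m128i *)(d + i + 16),
                         _mm_load_si128((const __m128i *)(s + i + 16)));
        _mm_stream_si128((__m128i *)(d + i + 32),
                         _mm_load_si128((const __m128i *)(s + i + 32)));
        _mm_stream_si128((__m128i *)(d + i + 48),
                         _mm_load_si128((const __m128i *)(s + i + 48)));
    }
    _mm_sfence();               /* make the streaming stores globally visible */
}
#endif

/*************************************************************************/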
#if defined(ARCH_X86_64)

/* AMD64-optimized routine, using SSE2.  Derived from AMD64 optimization
 * guide section 5.13: Appropriate Memory Copying Routines.
 * Nonstandard instructions used:
 *     (CPUID.CMOVE) CMOVA
 *     (CPUID.SSE2)  MOVDQA, MOVDQU, MOVNTDQ
 *
 * Note that this routine will also run more or less as-is (modulo register
 * names and label(%%rip) references) on x86 CPUs, but tests have shown the
 * SSE1 version above to be faster.
 */

/* The block copying code--macroized because we use two versions of it
 * depending on whether the source is 16-byte-aligned or not.  Pass either
 * movdqa or movdqu (unquoted) for the parameter. */
#define AMD64_BLOCK_MEMCPY(movdq) \
"       # First prefetch (note that if we end on an odd number of cache \n\
        # lines, we skip prefetching the last one--faster that way than \n\
        # prefetching line by line or treating it as a special case) \n\
0:      mov %%ecx, %%edx        # EDX: temp counter (always <32 bits) \n\
        shr $6, %%edx           # Divide by cache line size (64 bytes) \n\
        cmp %%ebx, %%edx        # ... and cap at 128 (8192 bytes) \n\
        cmova %%ebx, %%edx \n\
        shl $3, %%edx           # EDX <- cache lines to copy * 8 \n\
        mov %%edx, %%eax        # EAX <- cache lines to preload * 8 \n\
                                #    (also used as memory offset) \n\
1:      prefetchnta -64(%%rsi,%%rax,8)   # Preload cache lines in pairs \n\
        prefetchnta -128(%%rsi,%%rax,8)  # (going backwards) \n\
        sub $16, %%eax          # And loop \n\
        jg 1b \n\
\n\
        # Then copy--forward, which seems to be faster than reverse for \n\
        # certain alignments \n\
        xor %%eax, %%eax \n\
2:      " #movdq " (%%rsi,%%rax,8), %%xmm0   # Copy 16 bytes and loop \n\
        movntdq %%xmm0, (%%rdi,%%rax,8) \n\
        add $2, %%eax \n\
        cmp %%edx, %%eax \n\
        jb 2b \n\
\n\
        # Finally, update pointers and count, and loop \n\
        shl $3, %%edx           # EDX <- bytes copied \n\
        add %%rdx, %%rsi \n\
        add %%rdx, %%rdi \n\
        sub %%rdx, %%rcx \n\
        cmp $64, %%rcx          # At least one cache line left? \n\
        jae 0b                  # Yup, loop \n"

void *ac_memcpy_amd64(void *dest, const void *src, size_t bytes)
{
    asm("\
        push %%rdi              # Save destination for return value \n\
        cld                     # MOVS* should ascend \n\
\n\
        cmp $64, %%rcx          # Skip block copy for small blocks \n\
        jb amd64.memcpy_last \n\
\n\
        mov $128, %%ebx         # Constant used later \n\
\n\
        # First align destination address to a multiple of 16 bytes \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7 \n\
        sub %%edi, %%eax        # (we don't care about the top 32 bits) \n\
        and $0b111, %%eax       # ... which is the number of bytes to copy \n\
        lea 0f(%%rip), %%rdx    # Use a computed jump--faster than a loop \n\
        sub %%rax, %%rdx \n\
        jmp *%%rdx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
0:      sub %%rax, %%rcx        # Update count \n\
        test $0b1000, %%edi     # Is destination not 16-byte aligned? \n\
        je 1f \n\
        movsq                   # Then move 8 bytes to align it \n\
        sub $8, %%rcx \n\
\n\
1:      cmp $0x38000, %%rcx     # Is this a large block? (0x38000 is an \n\
                                #    arbitrary value where prefetching and \n\
                                #    write combining seem to start becoming \n\
                                #    faster) \n\
        jb amd64.memcpy_small   # Nope, use small copy (no prefetch/WC) \n\
        test $0b1111, %%esi     # Is source also 16-byte aligned? \n\
                                #    (use ESI to save a REX prefix byte) \n\
        jnz amd64.memcpy_normal_bp  # Nope, use slow copy \n\
        jmp amd64.memcpy_fast_bp    # Yup, use fast copy \n\
\n\
amd64.memcpy_small:             # Small block copy routine--no prefetch \n\
        mov %%ecx, %%edx        # EDX <- bytes to copy / 16 \n\
        shr $4, %%edx           # (count known to fit in 32 bits) \n\
        mov %%edx, %%eax        # Leave remainder in ECX for later \n\
        shl $4, %%eax \n\
        sub %%eax, %%ecx \n\
        .align 16 \n\
0:      movdqu (%%rsi), %%xmm0  # Copy 16 bytes of data \n\
        movdqa %%xmm0, (%%rdi) \n\
        add $16, %%rsi          # Update pointers \n\
        add $16, %%rdi \n\
        dec %%edx               # And loop \n\
        jnz 0b \n\
        jmp amd64.memcpy_last   # Copy any remaining bytes \n\
\n\
        .align 16 \n\
        nop \n\
        nop \n\
amd64.memcpy_fast_bp:           # Fast block prefetch loop \n"
AMD64_BLOCK_MEMCPY(movdqa)
"       jmp amd64.memcpy_last   # Copy any remaining bytes \n\
\n\
        .align 16 \n\
        nop \n\
        nop \n\
amd64.memcpy_normal_bp:         # Normal (unaligned) block prefetch loop \n"
AMD64_BLOCK_MEMCPY(movdqu)
" \n\
amd64.memcpy_last: \n\
        # Copy last <64 bytes, using the computed jump trick \n\
        mov %%ecx, %%eax        # EAX <- ECX>>3 \n\
        shr $3, %%eax \n\
        lea 0f(%%rip), %%rdx \n\
        add %%eax, %%eax        # Watch out, MOVSQ is 2 bytes! \n\
        sub %%rax, %%rdx \n\
        jmp *%%rdx              # Execute 0-7 MOVSQ's \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
0:      and $0b111, %%ecx       # ECX <- ECX & 7 \n\
        lea 0f(%%rip), %%rdx \n\
        sub %%rcx, %%rdx \n\
        jmp *%%rdx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
0: \n\
        # All done! \n\
        emms                    # Clean up after MMX instructions \n\
        sfence                  # Flush the write buffer \n\
        pop %%rdi               # Restore destination (return value) \n\
    " : /* no outputs */
      : "D" (dest), "S" (src), "c" (bytes)
      : "%rax", "%rbx", "%rdx"
    );
    return dest;
}

#endif  /* ARCH_X86_64 */

/*************************************************************************/
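/* Aside (illustration only, compiled out): the "(8-dest) & 7" arithmetic
 * that all three routines above use when aligning the destination is just
 * the number of bytes from dest up to the next 8-byte boundary (0 if dest
 * is already aligned).  A hypothetical C equivalent: */
#if 0
#include <stdint.h>

static size_t bytes_to_8_byte_boundary(const void *dest)
{
    /* e.g. dest % 8 == 5  ->  (8 - 5) & 7 == 3 bytes to copy first */
    return (size_t)((8 - (uintptr_t)dest) & 7);
}
#endif

/*************************************************************************/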
void * (*tc_memcpy)(void *, const void *, size_t) = memcpy;

void tc_memcpy_init(int verbose, int mmflags)
{
    const char *method = "libc";
#if defined(ARCH_X86) || defined(ARCH_X86_64)
    int accel = mmflags == -1 ? ac_mmflag() : mmflags;
#endif

#if defined(ARCH_X86)
    if((accel & MM_CMOVE) && (accel & MM_SSE)) {
        method = "sse";
        tc_memcpy = ac_memcpy_sse;
    } else if(accel & MM_MMX) {
        method = "mmx";
        tc_memcpy = ac_memcpy_mmx;
    }
#endif

#if defined(ARCH_X86_64)
    if((accel & MM_CMOVE) && (accel & MM_SSE2)) {
        method = "amd64";
        tc_memcpy = ac_memcpy_amd64;
    }
#endif

    if(verbose)
        fprintf(stderr, "tc_memcpy: using %s for memcpy\n", method);
}
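/*************************************************************************/

/* A minimal usage sketch (hypothetical caller, not part of this file),
 * assuming the tc_memcpy/tc_memcpy_init prototypes are exported through
 * ac.h: call tc_memcpy_init() once at startup to pick the fastest routine
 * the CPU supports, then call through tc_memcpy exactly like memcpy(). */
#if 0
#include "ac.h"

static void example(void)
{
    static char src[4096], dst[4096];

    tc_memcpy_init(1, -1);              /* -1: probe CPU features via ac_mmflag() */
    tc_memcpy(dst, src, sizeof(dst));   /* dispatches to sse/mmx/amd64/libc */
}
#endif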