You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
228 lines
5.3 KiB
228 lines
5.3 KiB
/*
|
|
* scale_line_22_yuv_mmx.S -- scale line in YUY2 format
|
|
* Copyright (C) 2003-2004 Ushodaya Enterprises Limited
|
|
* Author: Dan Dennedy <dan@dennedy.org>
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with this library; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
/*
 * File-level directives and debug data.
 *
 * Syntax: GNU as, AT&T operand order (source, destination), 32-bit x86.
 * The file is run through the C preprocessor (.S), so the block comments
 * use the C form.
 */
	.file	"scale_line_22_yuv_mmx.S"
	.version	"01.01"

	.extern	printf

gcc2_compiled.:

	.data

/*
 * printf-style format string for debugging.  It is emitted with .asciz so
 * that it carries the terminating NUL byte a C format string requires
 * (.ascii would leave it unterminated).  Nothing in the visible part of
 * this file references MSG or printf.
 */
MSG:	.asciz	"scale_line_22_yuv_mmx: %d %d\n"

/*
 * LOAD_NEXT_SAMPLES -- fetch the sample pair at source position x.
 *
 * x is a fixed-point position whose integer pixel index is x >> 16; every
 * pixel occupies two bytes (one luma byte followed by one chroma byte), so
 *
 *     x_scaled  = (x >> 15) & ~1     byte offset of the pixel
 *     x_aligned = x_scaled & ~3      byte offset of the 4-byte Y U Y V group
 *
 * For each source row the luma bytes are taken from the dword at x_scaled
 * and the chroma bytes from the dword at x_aligned, then merged and widened
 * to four 16-bit words laid out (high to low) as 00VV 00Y2 00UU 00Y1.
 *
 * The same x_scaled offset is used for BOTH rows.  x_scaled is kept live in
 * %eax for the whole sequence (the chroma dword is loaded through %edi, which
 * is no longer needed as a base at that point) so that the second row cannot
 * accidentally take its luma from x_aligned.
 *
 * In:      %ebx = x, %mm4 = 0, 16(%ebp) = src0, 20(%ebp) = src1
 * Out:     %mm1 = widened src0 sample, %mm3 = widened src1 sample
 * Clobber: %eax, %ecx, %edx, %edi, flags
 *
 * Note: each load reads 4 bytes starting at the computed offset.
 */
	.macro	LOAD_NEXT_SAMPLES
	movl	%ebx, %eax		# x_scaled
	sarl	$15, %eax
	andl	$0xfffffffe, %eax
	movl	%eax, %edx		# x_aligned
	andl	$0xfffffffc, %edx

	movl	16(%ebp), %edi		# src0 base
	movl	(%edi,%eax), %ecx	# luma bytes at x_scaled
	andl	$0x00ff00ff, %ecx
	movl	(%edi,%edx), %edi	# chroma bytes at x_aligned
	andl	$0xff00ff00, %edi
	orl	%edi, %ecx		# composite y, uv
	movd	%ecx, %mm1
	punpcklbw %mm4, %mm1		# bytes -> words

	movl	20(%ebp), %edi		# src1 base
	movl	(%edi,%eax), %ecx	# luma bytes at x_scaled
	andl	$0x00ff00ff, %ecx
	movl	(%edi,%edx), %edi	# chroma bytes at x_aligned
	andl	$0xff00ff00, %edi
	orl	%edi, %ecx		# composite y, uv
	movd	%ecx, %mm3
	punpcklbw %mm4, %mm3		# bytes -> words
	.endm

	.text
	.align	16

/*
 * pixops_scale_line_22_yuv_mmx -- scale one line of YUY2 pixels using a
 * 2x2 weighted sum of samples from two source rows.
 *
 * ABI: 32-bit x86, arguments on the stack, frame pointer in %ebp.
 *
 * Arguments
 *
 *   weights:    8(%ebp)   base of the weight table; entry ((x >> 12) & 0xf)
 *                         is 32 bytes: four quadwords of 16-bit weights
 *   p (dest):  12(%ebp)   destination pointer (kept in %esi)
 *   q1 (src0): 16(%ebp)   first source row
 *   q2 (src1): 20(%ebp)   second source row
 *   xstep:     24(%ebp)   added to x for every destination pixel
 *   p_end:     28(%ebp)   destination end pointer (exclusive)
 *   xinit:     32(%ebp)   initial x
 *   dest_x:    36(%ebp)   destination pixel index; its low bit selects which
 *                         chroma byte (U or V) is stored.  Incremented in
 *                         place in the argument slot.
 *
 * Returns: %eax = destination pointer after the last pixel written.
 *
 * Registers: %ebx = x, %esi = dest.  %ebx, %esi, %edi and %ebp are saved and
 * restored.  %eax, %ecx, %edx, flags and %mm0-%mm7 are clobbered; emms is
 * executed before returning.
 *
 * The loop stops as soon as dest >= p_end (unsigned), the same test as the
 * entry guard, so a p_end that is not a whole number of pixels from p can no
 * longer make the loop run away.
 */
#if !defined(__MINGW32__) && !defined(__CYGWIN__)

	.globl	pixops_scale_line_22_yuv_mmx
	.type	pixops_scale_line_22_yuv_mmx,@function
pixops_scale_line_22_yuv_mmx:

#else

	.globl	_pixops_scale_line_22_yuv_mmx
_pixops_scale_line_22_yuv_mmx:

#endif

/*
 * Function call entry.  No stack locals are needed; only the three
 * callee-saved registers used below are pushed.
 */
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

/*
 * Setup
 */
	movl	32(%ebp), %ebx		# x = xinit
	movl	12(%ebp), %esi		# dest = p

	cmpl	28(%ebp), %esi		# dest >= p_end ?
	jnb	.out

/*
 * For the body of the loop, %mm0, %mm1, %mm2, %mm3 hold the 4 points we are
 * interpolating between, each as:
 *
 *     00VV00Y200UU00Y1
 *
 * %mm0/%mm2 are the current src0/src1 samples, %mm1/%mm3 the next ones.
 * %mm4 must be zero whenever LOAD_NEXT_SAMPLES runs.
 */
	pxor	%mm4, %mm4
	LOAD_NEXT_SAMPLES		# samples at xinit become the first current pair
	jmp	.newx

	.p2align 4,,7
.loop:

/*
 * short *pixel_weights = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * n_x * n_y
 *                                         16             4                   0xf             2     2
 *
 * (x & 0xf000) >> 7 is that index multiplied by the 32-byte entry size.
 * NOTE(review): x has already been advanced by xstep here, so the weights
 * are chosen from the position of the next sample pair -- confirm that this
 * is the intended phase.
 */
	movl	8(%ebp), %edi		# weights base
	movl	%ebx, %eax
	andl	$0xf000, %eax
	shrl	$7, %eax

/*
 * Load the 4 weight quadwords into %mm4..%mm7, multiply each by its sample
 * and accumulate the products in %mm7.
 */
	movq	(%edi,%eax), %mm4
	pmullw	%mm0, %mm4
	movq	8(%edi,%eax), %mm5
	pmullw	%mm1, %mm5
	movq	16(%edi,%eax), %mm6
	pmullw	%mm2, %mm6
	movq	24(%edi,%eax), %mm7
	pmullw	%mm3, %mm7

	paddw	%mm4, %mm5
	paddw	%mm6, %mm7
	paddw	%mm5, %mm7

/*
 * %mm7 holds the accumulated sum C.  Compute (C + 0x80) / 256 per word.
 * %mm4 is cleared again here because it was used for the first weight and
 * the next LOAD_NEXT_SAMPLES needs it to be zero.
 */
	pxor	%mm4, %mm4
	movl	$0x80808080, %eax
	movd	%eax, %mm6
	punpcklbw %mm4, %mm6		# four words of 0x0080
	paddw	%mm6, %mm7
	psrlw	$8, %mm7

/*
 * Pack to bytes (Y1 UU Y2 VV in %eax, low byte first) and store the result.
 */
	packuswb %mm7, %mm7
	movd	%mm7, %eax

	movb	%al, (%esi)		# dest[0] = y

	movl	36(%ebp), %ecx		# dest_x
	andl	$1, %ecx		# select u or v
	sall	$1, %ecx		# byte offset 0 or 2
	addl	$1, %ecx		# chroma bytes sit at offsets 1 and 3
	sall	$3, %ecx		# offset * 8 bits/byte
	shrl	%cl, %eax
	movb	%al, 1(%esi)		# dest[1] = u or v

	addl	$2, %esi		# dest += 2
	cmpl	28(%ebp), %esi		# dest >= p_end ?
	jnb	.out			# then exit

	addl	$1, 36(%ebp)		# dest_x++

.newx:

	addl	24(%ebp), %ebx		# x += xstep

/*
 * The previously loaded "next" samples become the current ones, then the
 * samples at the new x are loaded into %mm1 (src0) and %mm3 (src1).
 */
	movq	%mm1, %mm0
	movq	%mm3, %mm2
	LOAD_NEXT_SAMPLES

	jmp	.loop

.out:
	movl	%esi, %eax		# return the advanced dest pointer
	emms				# leave MMX state so x87 code can run again
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret