Xorg: work on amd64 sse

ulab-next-nosound
Jay Sorg 10 years ago
parent 5c8b1de800
commit 5ccbc37882

@@ -9,7 +9,7 @@ rdpComposite.o rdpGlyphs.o rdpPixmap.o rdpInput.o rdpClientCon.o rdpCapture.o \
rdpTrapezoids.o rdpXv.o
;OBJS += cpuid_x86.o i420_to_rgb32_x86_sse2.o yv12_to_rgb32_x86_sse2.o yuy2_to_rgb32_x86_sse2.o uyvy_to_rgb32_x86_sse2.o
;OBJS += i420_to_rgb32_amd64_sse2.o yv12_to_rgb32_amd64_sse2.o yuy2_to_rgb32_amd64_sse2.o uyvy_to_rgb32_amd64_sse2.o
;OBJS += cpuid_amd64.o i420_to_rgb32_amd64_sse2.o yv12_to_rgb32_amd64_sse2.o yuy2_to_rgb32_amd64_sse2.o uyvy_to_rgb32_amd64_sse2.o
CFLAGS = -g -O2 -Wall -fPIC -I/usr/include/xorg -I/usr/include/pixman-1 \
-I../../../common
@@ -41,6 +41,9 @@ yuy2_to_rgb32_x86_sse2.o: x86/yuy2_to_rgb32_x86_sse2.asm
uyvy_to_rgb32_x86_sse2.o: x86/uyvy_to_rgb32_x86_sse2.asm
yasm -f elf32 -g dwarf2 x86/uyvy_to_rgb32_x86_sse2.asm
cpuid_amd64.o: amd64/cpuid_amd64.asm
yasm -f elf64 -g dwarf2 amd64/cpuid_amd64.asm
i420_to_rgb32_amd64_sse2.o: amd64/i420_to_rgb32_amd64_sse2.asm
yasm -f elf64 -g dwarf2 amd64/i420_to_rgb32_amd64_sse2.asm

@@ -0,0 +1,41 @@
SECTION .text
%macro PROC 1
align 16
global %1
%1:
%endmacro
;The first six integer or pointer arguments are passed in registers
;RDI, RSI, RDX, RCX, R8, and R9
;int
;cpuid_amd64(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx)
PROC cpuid_amd64
; save registers
push rbx
push rdx
push rcx
push r8
push r9
mov rax, rdi
mov rcx, rsi
cpuid
pop rdi
mov [rdi], edx
pop rdi
mov [rdi], ecx
pop rdi
mov [rdi], ebx
pop rdi
mov [rdi], eax
mov eax, 0
; restore registers
pop rbx
ret;
align 16
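
For reference, a minimal C caller for the routine above could look like the sketch below (not part of the commit): per the calling-convention note in the asm, eax_in and ecx_in arrive in RDI and RSI and the four results are written back through the pointer arguments. The have_sse2() helper name is illustrative; the leaf-1 / EDX-bit-26 test is the same probe rdpXvInit performs further down in this commit.

/* illustrative caller for cpuid_amd64(); have_sse2 is a hypothetical helper,
   not driver code (funcs_amd64.h does not declare cpuid_amd64, so the
   prototype is repeated here) */
int
cpuid_amd64(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx);

static int
have_sse2(void)
{
    int ax, bx, cx, dx;

    /* CPUID leaf 1, subleaf 0: feature flags; EDX bit 26 indicates SSE2 */
    cpuid_amd64(1, 0, &ax, &bx, &cx, &dx);
    return (dx & (1 << 26)) != 0;
}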

@@ -0,0 +1,39 @@
/*
Copyright 2014 Jay Sorg
Permission to use, copy, modify, distribute, and sell this software and its
documentation for any purpose is hereby granted without fee, provided that
the above copyright notice appear in all copies and that both that
copyright notice and this permission notice appear in supporting
documentation.
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
amd64 asm files
*/
#ifndef __FUNCS_AMD64_H
#define __FUNCS_AMD64_H
int
cpuid_amd64(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx);
int
yv12_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs);
int
i420_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs);
int
yuy2_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs);
int
uyvy_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs);
#endif
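
The four converters declared above all take one packed or planar YUV frame and fill a 32-bit-per-pixel output buffer. A minimal sketch of buffer sizing follows, assuming the standard FOURCC layouts (YV12/I420 are planar 4:2:0 at 12 bits per pixel, YUY2/UYVY are packed 4:2:2 at 16 bits per pixel); convert_yv12_frame and the plain malloc are illustrative, not part of the driver. The SSE2 implementation added later in this commit walks eight pixels and two rows per pass, so callers presumably supply dimensions that divide evenly.

/* illustrative only: buffer sizing when calling the converters above */
#include <stdlib.h>
#include "amd64/funcs_amd64.h"

static int *
convert_yv12_frame(unsigned char *yuvs, int width, int height)
{
    /* YV12/I420 input: width * height * 3 / 2 bytes (planar 4:2:0);
       YUY2/UYVY input would be width * height * 2 bytes (packed 4:2:2);
       RGB32 output: one 32-bit pixel per input pixel */
    int *rgbs = (int *) malloc(width * height * sizeof(int));

    if (rgbs != NULL)
    {
        yv12_to_rgb32_amd64_sse2(yuvs, width, height, rgbs);
    }
    return rgbs;
}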

@@ -9,10 +9,9 @@
;i420_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs)
PROC i420_to_rgb32_amd64_sse2
push ebx
mov eax, 0
pop ebx
push rbx
mov rax, 0
pop rbx
ret
align 16

@@ -9,10 +9,9 @@
;uyvy_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs)
PROC uyvy_to_rgb32_amd64_sse2
push ebx
mov eax, 0
pop ebx
push rbx
mov rax, 0
pop rbx
ret
align 16

@@ -9,10 +9,9 @@
;yuy2_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs)
PROC yuy2_to_rgb32_amd64_sse2
push ebx
mov eax, 0
pop ebx
push rbx
mov rax, 0
pop rbx
ret
align 16

@@ -1,3 +1,47 @@
;
;Copyright 2014 Jay Sorg
;
;Permission to use, copy, modify, distribute, and sell this software and its
;documentation for any purpose is hereby granted without fee, provided that
;the above copyright notice appear in all copies and that both that
;copyright notice and this permission notice appear in supporting
;documentation.
;
;The above copyright notice and this permission notice shall be included in
;all copies or substantial portions of the Software.
;
;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
;
;YV12 to RGB32
;amd64 SSE2 32 bit
;
; RGB to YUV
; 0.299 0.587 0.114
; -0.14713 -0.28886 0.436
; 0.615 -0.51499 -0.10001
; YUV to RGB
; 1 0 1.13983
; 1 -0.39465 -0.58060
; 1 2.03211 0
; shift left 12
; 4096 0 4669
; 4096 -1616 -2378
; 4096 9324 0
SECTION .data
align 16
c128 times 8 dw 128
c4669 times 8 dw 4669
c1616 times 8 dw 1616
c2378 times 8 dw 2378
c9324 times 8 dw 9324
SECTION .text
%macro PROC 1
align 16
@@ -5,14 +49,199 @@
%1:
%endmacro
do8_uv:
; u
movd xmm1, [rbx] ; 4 at a time
lea rbx, [rbx + 4]
punpcklbw xmm1, xmm1
pxor xmm6, xmm6
punpcklbw xmm1, xmm6
movdqa xmm7, [rel c128]
psubw xmm1, xmm7
psllw xmm1, 4
; v
movd xmm2, [rdx] ; 4 at a time
lea rdx, [rdx + 4]
punpcklbw xmm2, xmm2
punpcklbw xmm2, xmm6
psubw xmm2, xmm7
psllw xmm2, 4
do8:
; y
movq xmm0, [rsi] ; 8 at a time
lea rsi, [rsi + 8]
pxor xmm6, xmm6
punpcklbw xmm0, xmm6
; r = y + hiword(4669 * (v << 4))
movdqa xmm4, [rel c4669]
pmulhw xmm4, xmm2
movdqa xmm3, xmm0
paddw xmm3, xmm4
; g = y - hiword(1616 * (u << 4)) - hiword(2378 * (v << 4))
movdqa xmm5, [rel c1616]
pmulhw xmm5, xmm1
movdqa xmm6, [rel c2378]
pmulhw xmm6, xmm2
movdqa xmm4, xmm0
psubw xmm4, xmm5
psubw xmm4, xmm6
; b = y + hiword(9324 * (u << 4))
movdqa xmm6, [rel c9324]
pmulhw xmm6, xmm1
movdqa xmm5, xmm0
paddw xmm5, xmm6
packuswb xmm3, xmm3 ; b
packuswb xmm4, xmm4 ; g
punpcklbw xmm3, xmm4 ; gb
pxor xmm4, xmm4 ; a
packuswb xmm5, xmm5 ; r
punpcklbw xmm5, xmm4 ; ar
movdqa xmm4, xmm3
punpcklwd xmm3, xmm5 ; argb
movdqa [rdi], xmm3
lea rdi, [rdi + 16]
punpckhwd xmm4, xmm5 ; argb
movdqa [rdi], xmm4
lea rdi, [rdi + 16]
ret;
;The first six integer or pointer arguments are passed in registers
; RDI, RSI, RDX, RCX, R8, and R9
;int
;yv12_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs)
PROC yv12_to_rgb32_amd64_sse2
push ebx
push rbx
push rsi
push rdi
push rbp
push rdi
mov rdi, rcx ; rgbs
mov rcx, rsi ; width
mov rbp, rdx ; height
mov rdx, rcx ; width
mov rax, rbp
shr rbp, 1
imul rax, rcx ; rax = width * height
pop rsi ; y
mov rbx, rsi ; u = y + width * height
add rbx, rax
; local vars
; char* yptr1
; char* yptr2
; char* uptr
; char* vptr
; int* rgbs1
; int* rgbs2
; int width
sub rsp, 56 ; local vars, 56 bytes
mov [rsp + 0], rsi ; save y1
add rsi, rdx
mov [rsp + 8], rsi ; save y2
mov [rsp + 16], rbx ; save u
shr rax, 2
add rbx, rax ; v = u + (width * height / 4)
mov [rsp + 24], rbx ; save v
mov [rsp + 32], rdi ; save rgbs1
mov rax, rdx
shl rax, 2
add rdi, rax
mov [rsp + 40], rdi ; save rgbs2
mov eax, 0
pop ebx
loop_y:
mov rcx, rdx ; width
shr rcx, 3
; save rdx
mov [rsp + 48], rdx
prefetchnta 4096[rsp + 0] ; y
prefetchnta 1024[rsp + 16] ; u
prefetchnta 1024[rsp + 24] ; v
loop_x:
mov rsi, [rsp + 0] ; y1
mov rbx, [rsp + 16] ; u
mov rdx, [rsp + 24] ; v
mov rdi, [rsp + 32] ; rgbs1
; y1
call do8_uv
mov [rsp + 0], rsi ; y1
mov [rsp + 32], rdi ; rgbs1
mov rsi, [rsp + 8] ; y2
mov rdi, [rsp + 40] ; rgbs2
; y2
call do8
mov [rsp + 8], rsi ; y2
mov [rsp + 16], rbx ; u
mov [rsp + 24], rdx ; v
mov [rsp + 40], rdi ; rgbs2
dec rcx ; width
jnz loop_x
; restore rdx
mov rdx, [rsp + 48]
; update y1 and 2
mov rax, [rsp + 0]
mov rbx, rdx
add rax, rbx
mov [rsp + 0], rax
mov rax, [rsp + 8]
add rax, rbx
mov [rsp + 8], rax
; update rgb1 and 2
mov rax, [rsp + 32]
mov rbx, rdx
shl rbx, 2
add rax, rbx
mov [rsp + 32], rax
mov rax, [rsp + 40]
add rax, rbx
mov [rsp + 40], rax
mov rcx, rbp
dec rcx ; height
mov rbp, rcx
jnz loop_y
add rsp, 56
mov rax, 0
pop rbp
pop rdi
pop rsi
pop rbx
ret
align 16
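
The coefficient table at the top of this file is used as <<12 fixed point: the chroma samples are centred on 128 and shifted left by 4, and pmulhw keeps the high 16 bits of the signed product, so each constant is effectively divided by 4096 (4669/4096 ≈ 1.14, 1616/4096 ≈ 0.39, 2378/4096 ≈ 0.58). In the routine above the planes are addressed as u = y + width * height and v = u + width * height / 4, two output rows are produced per pass (y2 = y1 + width, rgbs2 = rgbs1 + width * 4), and each inner iteration converts eight pixels while the row pair shares one set of chroma samples. Below is a scalar C sketch of the per-pixel arithmetic in do8_uv/do8 only; clamp_u8 and yuv_to_rgb_pixel are illustrative names, channel naming follows the asm's own comments, and the SIMD packing order and loop structure are not reproduced.

/* scalar sketch of do8_uv/do8, using the commit's own constants;
   names are illustrative, not part of the driver */
static unsigned char
clamp_u8(int value)
{
    /* packuswb saturation: clamp the signed word to 0..255 */
    return (unsigned char) (value < 0 ? 0 : (value > 255 ? 255 : value));
}

static void
yuv_to_rgb_pixel(int y, int u, int v,
                 unsigned char *r, unsigned char *g, unsigned char *b)
{
    /* psubw c128, psllw 4: centre the chroma samples and pre-scale by 16 */
    int u16 = (u - 128) << 4;
    int v16 = (v - 128) << 4;

    /* pmulhw keeps the high 16 bits, so (c * x16) >> 16 == (c * (x - 128)) >> 12 */
    *r = clamp_u8(y + ((4669 * v16) >> 16));                        /* c4669 */
    *g = clamp_u8(y - ((1616 * u16) >> 16) - ((2378 * v16) >> 16)); /* c1616, c2378 */
    *b = clamp_u8(y + ((9324 * u16) >> 16));                        /* c9324 */
}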

@@ -417,6 +417,7 @@ stretch_RGB32_RGB32(int *src, int src_width, int src_height,
iv += ov;
}
LLOGLN(10, ("stretch_RGB32_RGB32: out"));
return 0;
}
@@ -642,14 +643,7 @@ xrdpVidQueryImageAttributes(ScrnInfoPtr pScrn, int id,
#if XV_USE_ACCEL
#if defined(__x86_64__) || defined(__AMD64__) || defined (_M_AMD64)
int
yv12_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs);
int
i420_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs);
int
yuy2_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs);
int
uyvy_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs);
#include "amd64/funcs_amd64.h"
#elif defined(__x86__) || defined(_M_IX86) || defined(__i386__)
#include "x86/funcs_x86.h"
#endif
@@ -713,23 +707,38 @@ rdpXvInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
if (g_xv_use_accel)
{
#if defined(__x86_64__) || defined(__AMD64__) || defined (_M_AMD64)
dev->yv12_to_rgb32 = yv12_to_rgb32_amd64_sse2;
dev->i420_to_rgb32 = i420_to_rgb32_amd64_sse2;
dev->yuy2_to_rgb32 = yuy2_to_rgb32_amd64_sse2;
dev->uyvy_to_rgb32 = uyvy_to_rgb32_amd64_sse2;
LLOGLN(0, ("rdpXvInit: sse amd64 yuv functions assigned"));
int ax, bx, cx, dx;
cpuid_amd64(1, 0, &ax, &bx, &cx, &dx);
LLOGLN(0, ("rdpXvInit: cpuid ax 1 cx 0 return ax 0x%8.8x bx "
"0x%8.8x cx 0x%8.8x dx 0x%8.8x", ax, bx, cx, dx));
if (dx & (1 << 26)) /* SSE 2 */
{
dev->yv12_to_rgb32 = yv12_to_rgb32_amd64_sse2;
dev->i420_to_rgb32 = i420_to_rgb32_amd64_sse2;
dev->yuy2_to_rgb32 = yuy2_to_rgb32_amd64_sse2;
dev->uyvy_to_rgb32 = uyvy_to_rgb32_amd64_sse2;
LLOGLN(0, ("rdpXvInit: sse2 amd64 yuv functions assigned"));
}
else
{
dev->yv12_to_rgb32 = YV12_to_RGB32;
dev->i420_to_rgb32 = I420_to_RGB32;
dev->yuy2_to_rgb32 = YUY2_to_RGB32;
dev->uyvy_to_rgb32 = UYVY_to_RGB32;
LLOGLN(0, ("rdpXvInit: warning, c yuv functions assigned"));
}
#elif defined(__x86__) || defined(_M_IX86) || defined(__i386__)
int ax, bx, cx, dx;
cpuid_x86(1, 0, &ax, &bx, &cx, &dx);
LLOGLN(0, ("rdpXvInit: cpuid eax 1 ecx 0 return eax 0x%8.8x ebx "
"0x%8.8x ecx 0x%8.8x edx 0x%8.8x", ax, bx, cx, dx));
LLOGLN(0, ("rdpXvInit: cpuid ax 1 cx 0 return ax 0x%8.8x bx "
"0x%8.8x cx 0x%8.8x dx 0x%8.8x", ax, bx, cx, dx));
if (dx & (1 << 26)) /* SSE 2 */
{
dev->yv12_to_rgb32 = yv12_to_rgb32_x86_sse2;
dev->i420_to_rgb32 = i420_to_rgb32_x86_sse2;
dev->yuy2_to_rgb32 = yuy2_to_rgb32_x86_sse2;
dev->uyvy_to_rgb32 = uyvy_to_rgb32_x86_sse2;
LLOGLN(0, ("rdpXvInit: sse x86 yuv functions assigned"));
LLOGLN(0, ("rdpXvInit: sse2 x86 yuv functions assigned"));
}
else
{

@@ -140,12 +140,12 @@ PROC yv12_to_rgb32_x86_sse2
add ebx, eax
; local vars
; char* yptr1;
; char* yptr2;
; char* uptr;
; char* vptr;
; int* rgbs1;
; int* rgbs2;
; char* yptr1
; char* yptr2
; char* uptr
; char* vptr
; int* rgbs1
; int* rgbs2
; int width
sub esp, 28 ; local vars, 28 bytes
mov [esp + 0], esi ; save y1
