diff --git a/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm b/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm
index 2d0533ea..85c6170d 100644
--- a/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm
+++ b/xorg/server/module/x86/yv12_to_rgb32_x86_sse2.asm
@@ -1,10 +1,260 @@
+SECTION .data
+align 16
+c16 times 4 dd 16
+c100 times 4 dd 100
+c128 times 4 dd 128
+c208 times 4 dd 208
+c255 times 4 dd 255
+c298 times 4 dd 298
+c409 times 4 dd 409
+c516 times 4 dd 516
+
+SECTION .text
+
 %macro PROC 1
 align 16
 global %1
 %1:
 %endmacro
+y1_do4:
+    ; y
+    mov eax, 0
+    mov al, [esi]
+    add esi, 1
+    pinsrd xmm0, eax, 0
+    mov al, [esi]
+    add esi, 1
+    pinsrd xmm0, eax, 1
+    mov al, [esi]
+    add esi, 1
+    pinsrd xmm0, eax, 2
+    mov al, [esi]
+    add esi, 1
+    pinsrd xmm0, eax, 3
+    movdqa xmm7, [c16]
+    psubd xmm0, xmm7
+
+    ; u
+    mov eax, 0
+    mov al, [ebx]
+    add ebx, 1
+    pinsrd xmm1, eax, 0
+    pinsrd xmm1, eax, 1
+    mov al, [ebx]
+    add ebx, 1
+    pinsrd xmm1, eax, 2
+    pinsrd xmm1, eax, 3
+    movdqa xmm7, [c128]
+    psubd xmm1, xmm7
+
+    ; v
+    mov eax, 0
+    mov al, [edx]
+    add edx, 1
+    pinsrd xmm2, eax, 0
+    pinsrd xmm2, eax, 1
+    mov al, [edx]
+    add edx, 1
+    pinsrd xmm2, eax, 2
+    pinsrd xmm2, eax, 3
+    psubd xmm2, xmm7
+
+    ; t = (298 * c + 409 * e + 128) >> 8;
+    movdqa xmm3, [c298]
+    pmulld xmm3, xmm0
+    movdqa xmm4, [c409]
+    pmulld xmm4, xmm2
+    paddd xmm3, xmm4
+    paddd xmm3, xmm7
+    psrad xmm3, 8
+    ; b = RDPCLAMP(t, 0, 255);
+    pxor xmm4, xmm4
+    pmaxsd xmm3, xmm4
+    movdqa xmm4, [c255]
+    pminsd xmm3, xmm4
+
+    ; t = (298 * c - 100 * d - 208 * e + 128) >> 8;
+    movdqa xmm4, [c298]
+    pmulld xmm4, xmm0
+    movdqa xmm5, [c100]
+    pmulld xmm5, xmm1
+    movdqa xmm6, [c208]
+    pmulld xmm6, xmm2
+    psubd xmm4, xmm5
+    psubd xmm4, xmm6
+    paddd xmm4, xmm7
+    psrad xmm4, 8
+    ; g = RDPCLAMP(t, 0, 255);
+    pxor xmm5, xmm5
+    pmaxsd xmm4, xmm5
+    movdqa xmm5, [c255]
+    pminsd xmm4, xmm5
+
+    ; t = (298 * c + 516 * d + 128) >> 8;
+    movdqa xmm5, [c298]
+    pmulld xmm5, xmm0
+    movdqa xmm6, [c516]
+    pmulld xmm6, xmm1
+    paddd xmm5, xmm6
+    paddd xmm5, xmm7
+    psrad xmm5, 8
+    ; r = RDPCLAMP(t, 0, 255);
+    pxor xmm6, xmm6
+    pmaxsd xmm5, xmm6
+    movdqa xmm6, [c255]
+    pminsd xmm5, xmm6
+
+    pextrd eax, xmm3, 0
+    mov [edi], al
+    pextrd eax, xmm4, 0
+    mov [edi + 1], al
+    pextrd eax, xmm5, 0
+    mov [edi + 2], al
+    mov eax, 0
+    mov [edi + 3], al
+    add edi, 4
+
+    pextrd eax, xmm3, 1
+    mov [edi], al
+    pextrd eax, xmm4, 1
+    mov [edi + 1], al
+    pextrd eax, xmm5, 1
+    mov [edi + 2], al
+    mov eax, 0
+    mov [edi + 3], al
+    add edi, 4
+
+    pextrd eax, xmm3, 2
+    mov [edi], al
+    pextrd eax, xmm4, 2
+    mov [edi + 1], al
+    pextrd eax, xmm5, 2
+    mov [edi + 2], al
+    mov eax, 0
+    mov [edi + 3], al
+    add edi, 4
+
+    pextrd eax, xmm3, 3
+    mov [edi], al
+    pextrd eax, xmm4, 3
+    mov [edi + 1], al
+    pextrd eax, xmm5, 3
+    mov [edi + 2], al
+    mov eax, 0
+    mov [edi + 3], al
+    add edi, 4
+
+    ret;
+
+y2_do4:
+    ; y
+    mov eax, 0
+    mov al, [esi]
+    add esi, 1
+    pinsrd xmm0, eax, 0
+    mov al, [esi]
+    add esi, 1
+    pinsrd xmm0, eax, 1
+    mov al, [esi]
+    add esi, 1
+    pinsrd xmm0, eax, 2
+    mov al, [esi]
+    add esi, 1
+    pinsrd xmm0, eax, 3
+    movdqa xmm7, [c16]
+    psubd xmm0, xmm7
+
+    movdqa xmm7, [c128]
+
+    ; t = (298 * c + 409 * e + 128) >> 8;
+    movdqa xmm3, [c298]
+    pmulld xmm3, xmm0
+    movdqa xmm4, [c409]
+    pmulld xmm4, xmm2
+    paddd xmm3, xmm4
+    paddd xmm3, xmm7
+    psrad xmm3, 8
+    ; b = RDPCLAMP(t, 0, 255);
+    pxor xmm4, xmm4
+    pmaxsd xmm3, xmm4
+    movdqa xmm4, [c255]
+    pminsd xmm3, xmm4
+
+    ; t = (298 * c - 100 * d - 208 * e + 128) >> 8;
+    movdqa xmm4, [c298]
+    pmulld xmm4, xmm0
+    movdqa xmm5, [c100]
+    pmulld xmm5, xmm1
+    movdqa xmm6, [c208]
+    pmulld xmm6, xmm2
+    psubd xmm4, xmm5
+    psubd xmm4, xmm6
+    paddd xmm4, xmm7
+    psrad xmm4, 8
+    ; g = RDPCLAMP(t, 0, 255);
+    pxor xmm5, xmm5
+    pmaxsd xmm4, xmm5
+    movdqa xmm5, [c255]
+    pminsd xmm4, xmm5
+
+    ; t = (298 * c + 516 * d + 128) >> 8;
+    movdqa xmm5, [c298]
+    pmulld xmm5, xmm0
+    movdqa xmm6, [c516]
+    pmulld xmm6, xmm1
+    paddd xmm5, xmm6
+    paddd xmm5, xmm7
+    psrad xmm5, 8
+    ; r = RDPCLAMP(t, 0, 255);
+    pxor xmm6, xmm6
+    pmaxsd xmm5, xmm6
+    movdqa xmm6, [c255]
+    pminsd xmm5, xmm6
+
+    pextrd eax, xmm3, 0
+    mov [edi], al
+    pextrd eax, xmm4, 0
+    mov [edi + 1], al
+    pextrd eax, xmm5, 0
+    mov [edi + 2], al
+    mov eax, 0
+    mov [edi + 3], al
+    add edi, 4
+
+    pextrd eax, xmm3, 1
+    mov [edi], al
+    pextrd eax, xmm4, 1
+    mov [edi + 1], al
+    pextrd eax, xmm5, 1
+    mov [edi + 2], al
+    mov eax, 0
+    mov [edi + 3], al
+    add edi, 4
+
+    pextrd eax, xmm3, 2
+    mov [edi], al
+    pextrd eax, xmm4, 2
+    mov [edi + 1], al
+    pextrd eax, xmm5, 2
+    mov [edi + 2], al
+    mov eax, 0
+    mov [edi + 3], al
+    add edi, 4
+
+    pextrd eax, xmm3, 3
+    mov [edi], al
+    pextrd eax, xmm4, 3
+    mov [edi + 1], al
+    pextrd eax, xmm5, 3
+    mov [edi + 2], al
+    mov eax, 0
+    mov [edi + 3], al
+    add edi, 4
+
+    ret;
+
 ;int
 ;yv12_to_rgb32_x86_sse2(unsigned char *yuvs, int width, int height, int *rgbs)
@@ -12,11 +262,117 @@
 PROC yv12_to_rgb32_x86_sse2
     push ebx
     push esi
     push edi
+    push ebp
+
+    mov edi, [esp + 32] ; rgbs
+
+    mov ecx, [esp + 24] ; width
+    mov edx, ecx
+    mov ebp, [esp + 28] ; height
+    mov eax, ebp
+    shr ebp, 1
+    imul eax, ecx ; eax = width * height
+
+    mov esi, [esp + 20] ; y
+
+    mov ebx, esi ; u = y + width * height
+    add ebx, eax
+
+    ; local vars
+    ; char* yptr1;
+    ; char* yptr2;
+    ; char* uptr;
+    ; char* vptr;
+    ; int* rgbs1;
+    ; int* rgbs2;
+    ; int width
+    sub esp, 28 ; local vars, 28 bytes
+    mov [esp + 0], esi ; save y1
+    add esi, edx
+    mov [esp + 4], esi ; save y2
+    mov [esp + 8], ebx ; save u
+    shr eax, 2
+    add ebx, eax ; v = u + (width * height / 4)
+    mov [esp + 12], ebx ; save v
+
+    mov [esp + 16], edi ; save rgbs1
+    mov eax, edx
+    shl eax, 2
+    add edi, eax
+    mov [esp + 20], edi ; save rgbs2
+
+loop_y:
+
+    mov ecx, edx ; width
+    shr ecx, 2
+
+    ; save edx
+    mov [esp + 24], edx
+
+loop_x:
+
+    mov esi, [esp + 0] ; y1
+    mov ebx, [esp + 8] ; u
+    mov edx, [esp + 12] ; v
+    mov edi, [esp + 16] ; rgbs1
+
+    ; y1
+    call y1_do4
+
+    mov [esp + 0], esi ; y1
+    mov [esp + 16], edi ; rgbs1
+
+    mov esi, [esp + 4] ; y2
+    mov edi, [esp + 20] ; rgbs2
+
+    ; y2
+    call y2_do4
+
+    mov [esp + 4], esi ; y2
+    mov [esp + 8], ebx ; u
+    mov [esp + 12], edx ; v
+    mov [esp + 20], edi ; rgbs2
+
+    dec ecx ; width
+    jnz loop_x
+
+    ; restore edx
+    mov edx, [esp + 24]
+
+    ; update y1 and 2
+    mov eax, [esp + 0]
+    mov ebx, edx
+    add eax, ebx
+    mov [esp + 0], eax
+
+    mov eax, [esp + 4]
+    add eax, ebx
+    mov [esp + 4], eax
+
+    ; update rgb1 and 2
+    mov eax, [esp + 16]
+    mov ebx, edx
+    shl ebx, 2
+    add eax, ebx
+    mov [esp + 16], eax
+
+    mov eax, [esp + 20]
+    add eax, ebx
+    mov [esp + 20], eax
+
+    mov ecx, ebp
+    dec ecx ; height
+    mov ebp, ecx
+    jnz loop_y
+
+    add esp, 28
     mov eax, 0
+    pop ebp
     pop edi
     pop esi
     pop ebx
     ret

 align 16
+
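
The patch converts the image two rows at a time. y1_do4 handles four pixels of the even row: it loads four Y samples (esi), two U samples (ebx) and two V samples (edx), replicating each chroma byte across a pair of lanes, and leaves the bias-corrected chroma in xmm1/xmm2 so that y2_do4 can reuse it unchanged for the four pixels directly below in the odd row, as YV12's 2x2 chroma subsampling allows. Note that pinsrd, pextrd, pmulld, pmaxsd and pminsd are SSE4.1 instructions, so despite the _sse2 file name this routine needs an SSE4.1-capable CPU.

For reference, here is a minimal scalar C sketch of the per-pixel math, taken directly from the formulas in the assembly comments; the helper name and the RDPCLAMP definition shown are illustrative stand-ins, not part of the patch:

/* Scalar reference for the per-pixel conversion done by y1_do4/y2_do4.
 * Variable names follow the assembly comments; this definition of
 * RDPCLAMP is an assumption, matching the clamping the pmaxsd/pminsd
 * pairs perform. */
#define RDPCLAMP(_v, _lo, _hi) \
    ((_v) < (_lo) ? (_lo) : ((_v) > (_hi) ? (_hi) : (_v)))

static unsigned int
yv12_pixel_to_rgb32(int y, int u, int v)
{
    int c = y - 16;   /* psubd xmm0, [c16] */
    int d = u - 128;  /* psubd xmm1, [c128] */
    int e = v - 128;  /* psubd xmm2, [c128] */
    int t;
    int r, g, b;

    t = (298 * c + 409 * e + 128) >> 8;           /* xmm3 */
    b = RDPCLAMP(t, 0, 255);
    t = (298 * c - 100 * d - 208 * e + 128) >> 8; /* xmm4 */
    g = RDPCLAMP(t, 0, 255);
    t = (298 * c + 516 * d + 128) >> 8;           /* xmm5 */
    r = RDPCLAMP(t, 0, 255);

    /* byte order matches the pextrd/mov store sequence:
     * b, g, r, 0 from low byte to high byte */
    return (unsigned int)b
         | ((unsigned int)g << 8)
         | ((unsigned int)r << 16);
}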