;-------------------------------------------------------------------------
; cxm12161 -- YUV12-to-RGB16 color conversion for H26x (MMX).
;
; Handles any 16-bit format made of three fields where
;   * the low-order field is B and lies entirely in the low-order byte,
;   * the middle field is G and lies somewhere in bits 4 through 11,
;   * the high-order field is R and lies entirely in the high-order byte.
;
; The YUV12 input is planar, 8 bits per pel.  The Y plane may have a
; pitch of up to 768 and a width less than or equal to the pitch.  It
; must be DWORD aligned, and preferably QWORD aligned.  Pitch and Width
; must be a multiple of four.  For best performance, Pitch should not be
; 4 more than a multiple of 32.  Height may be any amount, but must be a
; multiple of two.  The U and V planes may have a different pitch than
; the Y plane, subject to the same limitations.
;-------------------------------------------------------------------------

;include iammx.inc
;include locals.inc

;-------------------------------------------------------------------------
; Jump table indexed by CCType (one code label per output format).
;-------------------------------------------------------------------------
RGB_formats:
        dd      RGB565                  ; CCType 0
        dd      RGB555                  ; CCType 1
        dd      RGB664                  ; CCType 2
        dd      RGB655                  ; CCType 3

;-------------------------------------------------------------------------
; 64-bit MMX constants.  Each one is a single value replicated across
; the quadword (four words or eight bytes).  The trailing "alt" values
; are the alternatives the original author left commented out.
;-------------------------------------------------------------------------
Minusg:         times 4 dw 0080h        ; 128, subtracted from U and V
Yadd:           times 8 db 10h          ; 16, subtracted from Y
VtR:            times 4 dw 0066h        ; V -> R   (alt: 0199h)
VtG:            times 4 dw 0034h        ; V -> G   (alt: 00d0h)
UtG:            times 4 dw 0019h        ; U -> G   (alt: 0064h)
UtB:            times 4 dw 0081h        ; U -> B   (alt: 0205h)
Ymul:           times 4 dw 004ah        ; Y gain   (alt: 012ah)
UVtG:           times 2 dw 0019h, 0034h ; interleaved U,V -> G pair for pmaddwd
                                        ;          (alt: 0064h, 00d0h)
VtRUtB:         times 2 dw 0205h, 0199h
fourbitu:       times 8 db 0f0h         ; clamp bias for a 4-bit field
fivebitu:       times 8 db 0e0h         ; clamp bias for a 5-bit field
sixbitu:        times 8 db 0c0h         ; clamp bias for a 6-bit field

;-------------------------------------------------------------------------
; Stack frame.  All offsets are relative to ESP after the prologue has
; pushed four registers and reserved the local frame.
;-------------------------------------------------------------------------
%assign LocalFrameSize          156
%assign RegisterStorageSize     16

; Offset of the first stack argument: locals + saved registers + the
; 4-byte return address.
%assign StackArgs               LocalFrameSize + RegisterStorageSize + 4

; Arguments:
%assign YPlane                  StackArgs + 0
%assign UPlane                  StackArgs + 4
%assign VPlane                  StackArgs + 8
%assign FrameWidth              StackArgs + 12
%assign FrameHeight             StackArgs + 16
%assign YPitch                  StackArgs + 20
%assign ChromaPitch             StackArgs + 24
%assign AspectAdjustmentCount   StackArgs + 28
%assign ColorConvertedFrame     StackArgs + 32
%assign DCIOffset               StackArgs + 36
%assign CCOffsetToLine0         StackArgs + 40
%assign CCOPitch                StackArgs + 44
%assign CCType                  StackArgs + 48
%assign EndOfArgList            StackArgs + 52

; Locals (on local stack frame)
%assign CCOCursor               0
%assign CCOSkipDistance         4
%assign ChromaLineLen           8
%assign YCursor                 12
%assign DistanceFromVToU        16
%assign EndOfChromaLine         20
%assign AspectCount             24
%assign AspectBaseCount         28
%assign tmpYCursorEven          32
%assign tmpYCursorOdd           36
%assign tmpCCOPitch             40
%assign temp_mmx                44      ; note it is 48 bytes (six quadwords)
%assign RLeftShift              92      ; the remaining slots are 8 bytes each
%assign GLeftShift              100
%assign RRightShift             108
%assign GRightShift             116
%assign BRightShift             124
%assign RUpperLimit             132
%assign GUpperLimit             140
%assign BUpperLimit             148

;-------------------------------------------------------------------------
; extern void C MMX_YUV12ToRGB16 (
;       U8*  YPlane,
;       U8*  UPlane,
;       U8*  VPlane,
;       UN   FrameWidth,
;       UN   FrameHeight,
;       UN   YPitch,
;       UN   VPitch,                 (named ChromaPitch in the offsets above)
;       UN   AspectAdjustmentCount,
;       U8*  ColorConvertedFrame,
;       U32  DCIOffset,
;       U32  CCOffsetToLine0,
;       IN   CCOPitch,
;       IN   CCType)
;
; The local variables are on the stack.
; The tables are in the one and only data segment.
;
; CCOffsetToLine0 is relative to ColorConvertedFrame.
; CCType is used by the RGB color convertors to determine the exact
; conversion type:
;       RGB565 = 0
;       RGB555 = 1
;       RGB664 = 2
;       RGB655 = 3

;-------------------------------------------------------------------------
; SET_RGB16_LAYOUT rl, gl, rr, gr, br, rlim, glim, blim
;
; Fills the per-format locals used by the conversion loop:
;   %1 -> RLeftShift    bit count that moves R to the top of its byte
;   %2 -> GLeftShift    bit count that moves G into place in the 16-bit pel
;   %3 -> RRightShift   scale-down applied to the 16-bit R sum
;   %4 -> GRightShift   scale-down applied to the 16-bit G sum
;   %5 -> BRightShift   scale-down applied to the 16-bit B sum
;   %6/%7/%8            labels of the 8-byte clamp constants for R, G, B
;
; Every shift count is stored as a full quadword (high dword zero)
; because the MMX shift instructions read a 64-bit count from memory.
; Clobbers: mm0.
;-------------------------------------------------------------------------
%macro SET_RGB16_LAYOUT 8
        mov     dword [esp+RLeftShift],    %1
        mov     dword [esp+RLeftShift+4],  0
        mov     dword [esp+GLeftShift],    %2
        mov     dword [esp+GLeftShift+4],  0
        mov     dword [esp+RRightShift],   %3
        mov     dword [esp+RRightShift+4], 0
        mov     dword [esp+GRightShift],   %4
        mov     dword [esp+GRightShift+4], 0
        mov     dword [esp+BRightShift],   %5
        mov     dword [esp+BRightShift+4], 0
        movq    mm0, [%6]
        movq    [esp+RUpperLimit], mm0
        movq    mm0, [%7]
        movq    [esp+GUpperLimit], mm0
        movq    mm0, [%8]
        movq    [esp+BUpperLimit], mm0
%endmacro

;-------------------------------------------------------------------------
; yuv_2_rgb -- convert a planar YUV12 frame to 16-bit RGB with MMX.
;
; In:    all thirteen arguments are read from the stack at the offsets
;        YPlane .. CCType (see the C prototype).
; Out:   pixels are written starting at
;        ColorConvertedFrame + DCIOffset + CCOffsetToLine0.
;        No return value is set.
; Saves: esi, edi, ebp, ebx.
; Clobb: eax, ecx, edx, mm0-mm7, flags.  emms is executed before return.
;
; Notes:
;   * Returns without converting anything when CCType >= 4, when
;     AspectAdjustmentCount == 1, or when FrameWidth or FrameHeight is 0.
;   * The caller's FrameWidth and FrameHeight stack slots are overwritten
;     (they are reused as the negated chroma width and the row counter).
;   * Each inner iteration converts 8 luma pels on two lines, so a width
;     that is not a multiple of 8 makes the last iteration touch 4 pels
;     past the end of each line.
;   * Two lines are converted per outer iteration; an odd height
;     converts one line more than requested.
;-------------------------------------------------------------------------
global yuv_2_rgb
yuv_2_rgb:
        push    esi
        push    edi
        push    ebp
        push    ebx
        sub     esp, LocalFrameSize

        ; The loops below are do-while shaped, so an empty frame must be
        ; rejected up front or one 8x2 block would still be converted.
        cmp     dword [esp+FrameHeight], 0
        je      near finish
        cmp     dword [esp+FrameWidth], 0
        je      near finish

        mov     eax, [esp+CCType]
        cmp     eax, 4
        jae     near finish                     ; unknown format: do nothing
        jmp     [RGB_formats+eax*4]

RGB555:
        SET_RGB16_LAYOUT 2, 5, 9, 9, 9, fivebitu, fivebitu, fivebitu
        jmp     RGBEND

RGB664:
        SET_RGB16_LAYOUT 2, 4, 8, 8, 10, sixbitu, sixbitu, fourbitu
        jmp     RGBEND

RGB655:
        SET_RGB16_LAYOUT 2, 5, 8, 9, 9, sixbitu, fivebitu, fivebitu
        jmp     RGBEND

RGB565:
        SET_RGB16_LAYOUT 3, 5, 9, 8, 9, fivebitu, sixbitu, fivebitu
        ; falls through to RGBEND

;-------------------------------------------------------------------------
; One-time set-up of cursors and distances.
;-------------------------------------------------------------------------
RGBEND:
        mov     ebx, [esp+VPlane]
        mov     ecx, [esp+UPlane]
        sub     ecx, ebx
        mov     [esp+DistanceFromVToU], ecx     ; UPlane - VPlane

        mov     eax, [esp+ColorConvertedFrame]
        add     eax, [esp+DCIOffset]
        add     eax, [esp+CCOffsetToLine0]
        mov     [esp+CCOCursor], eax            ; first output pel

        mov     ebx, [esp+FrameWidth]
        mov     eax, [esp+CCOPitch]
        sub     eax, ebx                        ; CCOPitch - FrameWidth
        sub     eax, ebx                        ; CCOPitch - 2*FrameWidth
        sar     ebx, 1                          ; FrameWidth / 2
        mov     esi, [esp+YPlane]
        mov     [esp+ChromaLineLen], ebx
        mov     [esp+CCOSkipDistance], eax      ; end of one output line to the
                                                ; start of the next
        mov     [esp+YCursor], esi

        mov     edx, [esp+AspectAdjustmentCount]
        mov     esi, [esp+VPlane]
        cmp     edx, 1
        je      near finish
        mov     [esp+AspectCount], edx
        mov     [esp+AspectBaseCount], edx

        mov     edi, [esp+ChromaLineLen]
        mov     [esp+EndOfChromaLine], edi
        mov     edi, [esp+CCOCursor]
        mov     edx, [esp+DistanceFromVToU]

        ; The luma and chroma cursors point at the END of their line and
        ; are indexed with a negative offset that counts up towards zero.
        mov     ebp, [esp+YCursor]
        mov     ebx, [esp+FrameWidth]
        add     ebp, ebx
        mov     [esp+tmpYCursorEven], ebp       ; end of luma line 0
        mov     eax, [esp+YPitch]
        add     ebp, eax
        mov     [esp+tmpYCursorOdd], ebp        ; end of luma line 1
        sar     ebx, 1
        add     esi, ebx                        ; esi = end of V line
        add     edx, esi                        ; edx = end of U line
        neg     ebx
        mov     [esp+FrameWidth], ebx           ; slot now holds -(FrameWidth/2)

;-------------------------------------------------------------------------
; Register usage inside the loops:
;   esi  end of current V line          edx  end of current U line
;   ebx  negative chroma index (counts up to 0, 4 chroma pels per step)
;   ebp  end of the luma line being read
;   edi  output cursor for the even line
;   eax  byte offset from the even output line to the odd output line
;-------------------------------------------------------------------------
PrepareChromaLine:
        mov     ebp, [esp+AspectCount]
        mov     ebx, [esp+FrameWidth]           ; restart the negative index
        sub     ebp, 2
        mov     eax, [esp+CCOPitch]
        mov     [esp+tmpCCOPitch], eax          ; mov leaves the sub flags intact
        ja      continue
        ; Aspect counter expired: re-arm it and use a zero pitch so the odd
        ; line is written on top of the even line.
        xor     eax, eax
        add     ebp, [esp+AspectAdjustmentCount]
        mov     [esp+tmpCCOPitch], eax
continue:
        mov     [esp+AspectCount], ebp

do_next_8x2_block:
        ;---------------- even line ------------------------------------
        mov     ebp, [esp+tmpYCursorEven]
        movd    mm1, [edx+ebx]                  ; 4 U values
        pxor    mm0, mm0                        ; mm0 = 0
        movd    mm2, [esi+ebx]                  ; 4 V values
        punpcklbw mm1, mm0                      ; U as 4 words
        psubw   mm1, [Minusg]                   ; U - 128
        punpcklbw mm2, mm0                      ; V as 4 words
        psubw   mm2, [Minusg]                   ; V - 128
        movq    mm3, mm1                        ; copy of U-128 (for high UV pairs)
        movq    mm5, mm1                        ; copy of U-128 (for B)
        punpcklwd mm1, mm2                      ; low 2 (U,V) pairs
        pmaddwd mm1, [UVtG]                     ; low 2 chroma-G terms
        punpckhwd mm3, mm2                      ; high 2 (U,V) pairs
        pmaddwd mm3, [UVtG]                     ; high 2 chroma-G terms
        movq    [temp_mmx+esp], mm2             ; save V-128
        movq    mm6, [ebp+2*ebx]                ; 8 Y pels
        psubusb mm6, [Yadd]                     ; Y - 16, floored at 0
        packssdw mm1, mm3                       ; 4 chroma-G terms as words
        movq    mm7, mm6
        punpcklbw mm6, mm0                      ; low 4 Y-16
        pmullw  mm6, [Ymul]                     ; low 4 luma terms
        punpckhbw mm7, mm0                      ; high 4 Y-16
        pmullw  mm7, [Ymul]                     ; high 4 luma terms
        movq    mm4, mm1
        movq    [temp_mmx+esp+8], mm1           ; save 4 chroma-G terms
        punpcklwd mm1, mm1                      ; each low chroma-G term twice
        movq    mm0, mm6                        ; keep low luma
        punpckhwd mm4, mm4                      ; each high chroma-G term twice
        movq    mm3, mm7                        ; keep high luma
        psubw   mm6, mm1                        ; low 4 G
        psraw   mm6, [esp+GRightShift]          ; scale to G field width
        psubw   mm7, mm4                        ; high 4 G
        movq    mm2, mm5
        punpcklwd mm5, mm5                      ; each low U twice
        pmullw  mm5, [UtB]                      ; low chroma-B terms
        punpckhwd mm2, mm2                      ; each high U twice
        psraw   mm7, [esp+GRightShift]
        pmullw  mm2, [UtB]                      ; high chroma-B terms
        packuswb mm6, mm7                       ; mm6: G7 G6 G5 G4 G3 G2 G1 G0
        movq    [temp_mmx+esp+16], mm5          ; save low chroma B
        ; Luma + chroma-B can exceed 32767 for bright pels with a large U
        ; (239*74 + 127*129 = 34069).  A saturating add keeps such a sum at
        ; the maximum instead of wrapping negative.  The R sum cannot
        ; overflow (239*74 + 127*102 = 30640), so it keeps the plain add.
        paddsw  mm5, mm0                        ; low 4 B
        movq    [temp_mmx+esp+40], mm2          ; save high chroma B
        paddsw  mm2, mm3                        ; high 4 B
        psraw   mm5, [esp+BRightShift]          ; scale to B field width
        psraw   mm2, [esp+BRightShift]
        packuswb mm5, mm2                       ; mm5: B7 B6 B5 B4 B3 B2 B1 B0
        movq    mm2, [temp_mmx+esp]             ; V-128
        movq    mm1, mm5                        ; B
        movq    mm7, mm2
        punpcklwd mm2, mm2                      ; each low V twice
        pmullw  mm2, [VtR]                      ; low chroma-R terms
        punpckhwd mm7, mm7                      ; each high V twice
        pmullw  mm7, [VtR]                      ; high chroma-R terms
        paddusb mm1, [esp+BUpperLimit]          ; clamp B, step 1: saturating add
        movq    [temp_mmx+esp+24], mm2          ; save low chroma R
        paddw   mm2, mm0                        ; low 4 R
        psraw   mm2, [esp+RRightShift]          ; scale to R field width
        pxor    mm4, mm4
        movq    [temp_mmx+esp+32], mm7          ; save high chroma R
        paddw   mm7, mm3                        ; high 4 R
        psraw   mm7, [esp+RRightShift]
        psubusb mm1, [esp+BUpperLimit]          ; clamp B, step 2: subtract back
        packuswb mm2, mm7                       ; mm2: R7 R6 R5 R4 R3 R2 R1 R0
        paddusb mm6, [esp+GUpperLimit]          ; clamp G the same way
        psubusb mm6, [esp+GUpperLimit]
        paddusb mm2, [esp+RUpperLimit]          ; clamp R the same way
        psubusb mm2, [esp+RUpperLimit]

        ; Pack 8 x (R,G,B) bytes into 8 x 16-bit pels:
        ;   pel = ((R << RLeftShift) << 8) | (G << GLeftShift) | B
        ; in:  mm6 = G7..G0, mm1 = B7..B0, mm2 = R7..R0 (each clamped)
        ; out: mm1 = low 4 pels, mm7 = high 4 pels
        psllq   mm2, [esp+RLeftShift]           ; R to the top of each byte; the
                                                ; clamp keeps bits inside the byte
        movq    mm7, mm1                        ; B
        punpcklbw mm1, mm2                      ; low 4: R in high byte, B in low
        pxor    mm0, mm0
        punpckhbw mm7, mm2                      ; high 4: R in high byte, B in low
        movq    mm3, mm6                        ; G
        punpcklbw mm6, mm0                      ; low 4 G as words
        psllw   mm6, [esp+GLeftShift]
        punpckhbw mm3, mm0                      ; high 4 G as words
        por     mm1, mm6                        ; low 4 pels complete
        psllw   mm3, [esp+GLeftShift]
        por     mm7, mm3                        ; high 4 pels complete
        mov     ebp, [esp+tmpYCursorOdd]        ; fetched early for the odd line
        movq    [edi], mm1

        ;---------------- odd line (reuses the saved chroma terms) ------
        movq    mm1, [ebp+2*ebx]                ; 8 Y pels
        pxor    mm2, mm2
        psubusb mm1, [Yadd]                     ; Y - 16, floored at 0
        movq    mm5, mm1
        punpcklbw mm1, mm2                      ; low 4 Y-16
        pmullw  mm1, [Ymul]                     ; low 4 luma terms
        punpckhbw mm5, mm2                      ; high 4 Y-16
        pmullw  mm5, [Ymul]                     ; high 4 luma terms
        movq    [edi+8], mm7                    ; finish storing the even line
        movq    mm0, mm1
        paddw   mm0, [temp_mmx+esp+24]          ; low 4 R
        movq    mm6, mm5
        psraw   mm0, [esp+RRightShift]
        paddw   mm5, [temp_mmx+esp+32]          ; high 4 R
        movq    mm2, mm1
        psraw   mm5, [esp+RRightShift]
        paddsw  mm2, [temp_mmx+esp+16]          ; low 4 B (saturating, see above)
        packuswb mm0, mm5                       ; mm0: R7 R6 R5 R4 R3 R2 R1 R0
        psraw   mm2, [esp+BRightShift]
        movq    mm5, mm6
        paddsw  mm6, [temp_mmx+esp+40]          ; high 4 B (saturating)
        psraw   mm6, [esp+BRightShift]
        movq    mm3, [temp_mmx+esp+8]           ; 4 chroma-G terms
        packuswb mm2, mm6                       ; mm2: B7 B6 B5 B4 B3 B2 B1 B0
        movq    mm4, mm3
        punpcklwd mm3, mm3                      ; each low chroma-G term twice
        punpckhwd mm4, mm4                      ; each high chroma-G term twice
        psubw   mm1, mm3                        ; low 4 G
        psraw   mm1, [esp+GRightShift]
        psubw   mm5, mm4                        ; high 4 G
        psraw   mm5, [esp+GRightShift]
        paddusb mm2, [esp+BUpperLimit]          ; clamp B
        packuswb mm1, mm5                       ; mm1: G7 G6 G5 G4 G3 G2 G1 G0
        psubusb mm2, [esp+BUpperLimit]
        paddusb mm1, [esp+GUpperLimit]          ; clamp G
        psubusb mm1, [esp+GUpperLimit]
        paddusb mm0, [esp+RUpperLimit]          ; clamp R
        mov     eax, [esp+tmpCCOPitch]          ; offset to the odd output line
        psubusb mm0, [esp+RUpperLimit]

        ; Pack as for the even line.
        ; in:  mm1 = G7..G0, mm2 = B7..B0, mm0 = R7..R0
        ; out: mm2 = low 4 pels, mm7 = high 4 pels
        psllq   mm0, [esp+RLeftShift]
        movq    mm7, mm2                        ; B
        punpcklbw mm2, mm0                      ; low 4: R in high byte, B in low
        pxor    mm4, mm4
        movq    mm3, mm1                        ; G
        punpckhbw mm7, mm0                      ; high 4: R in high byte, B in low
        punpcklbw mm1, mm4                      ; low 4 G as words
        punpckhbw mm3, mm4                      ; high 4 G as words
        psllw   mm1, [esp+GLeftShift]
        por     mm2, mm1                        ; low 4 pels complete
        psllw   mm3, [esp+GLeftShift]
        por     mm7, mm3                        ; high 4 pels complete
        movq    [edi+eax], mm2
        movq    [edi+eax+8], mm7

        add     edi, 16                         ; 8 pels * 2 bytes
        add     ebx, 4                          ; 4 chroma pels = 8 luma pels
        jl      near do_next_8x2_block

        ;---------------- advance to the next pair of lines -------------
        add     edi, [esp+CCOSkipDistance]      ; start of the next output line
        add     edi, [esp+tmpCCOPitch]          ; skip the odd line (0 if dropped)

        mov     eax, [esp+YPitch]
        mov     ebp, [esp+tmpYCursorOdd]
        add     ebp, eax
        mov     [esp+tmpYCursorEven], ebp       ; next even luma line
        add     ebp, eax
        mov     [esp+tmpYCursorOdd], ebp        ; next odd luma line

        add     esi, [esp+ChromaPitch]          ; next V line
        add     edx, [esp+ChromaPitch]          ; next U line

        ; FrameHeight is a 32-bit argument, so count it down at full width.
        sub     dword [esp+FrameHeight], 2
        ja      near PrepareChromaLine

;-------------------------------------------------------------------------
finish:
        emms                                    ; leave the FPU usable for the caller
        add     esp, LocalFrameSize
        pop     ebx
        pop     ebp
        pop     edi
        pop     esi
        ret