// Converts a floating-point constant to signed 2.14 fixed point (scale 16384),
// rounding to the nearest integer. The rounding bias follows the sign of x:
// adding +0.5 unconditionally before the truncating (int) cast would push
// negative values toward zero by up to one extra unit.
// Note: x is evaluated twice, so pass only side-effect-free expressions.
#define float2fixed14(x)  ((int) ((x) * 16384 + ((x) < 0 ? -0.5f : 0.5f)))

// Converts one row of YCbCr samples to packed 4-byte pixels using MMX.
//
//	out		destination; one pixel is stored every `step` bytes
//	y		luma samples, one byte per pixel
//	pcb		Cb samples, one byte per pixel
//	pcr		Cr samples, one byte per pixel
//	count	number of pixels to convert; count <= 0 writes nothing
//	step	distance in bytes between consecutive output pixels
//
// Each output pixel is written with a 32-bit store whose bytes, in ascending
// address order, are R, G, B and a fill byte. The fill byte comes out as 0:
// the all-ones filler word is -1 as a signed word, which packuswb saturates
// to 0.
//
// Samples are processed in groups of four, and four bytes are loaded from
// each of y/pcb/pcr for every group -- including the final partial one -- so
// up to three bytes past the last sample of each source row are read.
//
// Arithmetic is done in signed 16-bit lanes carrying 6 fractional bits:
// Y is scaled to Y<<6 (+32 for rounding), chroma is centered as (C-128)<<8
// and multiplied by 2.14 coefficients keeping the high word (pmulhw), which
// also yields a <<6 result. The sums are shifted right by 6 and saturated to
// bytes.
static void YCbCr_to_RGB_row_mmx(uint8 *out, uint8 *y, uint8 *pcb, uint8 *pcr, int count, int step)
{
	static const int16 __declspec(align(8)) bias1[4] = { -0x8000, -0x8000, -0x8000, -0x8000 };	// subtracts 128<<8 from chroma
	static const int16 __declspec(align(8)) bias2[4] = { 32, 32, 32, 32 };						// 0.5 in 6-bit fixed point
	static const int16 __declspec(align(8)) scb0[4]	= { float2fixed14(-0.34414f), float2fixed14(-0.34414f), float2fixed14(-0.34414f), float2fixed14(-0.34414f) };	// Cb -> G
	static const int16 __declspec(align(8)) scb1[4]	= { float2fixed14( 1.77200f), float2fixed14( 1.77200f), float2fixed14( 1.77200f), float2fixed14( 1.77200f) };	// Cb -> B
	static const int16 __declspec(align(8)) scr0[4]	= { float2fixed14( 1.40200f), float2fixed14( 1.40200f), float2fixed14( 1.40200f), float2fixed14( 1.40200f) };	// Cr -> R
	static const int16 __declspec(align(8)) scr1[4]	= { float2fixed14(-0.71414f), float2fixed14(-0.71414f), float2fixed14(-0.71414f), float2fixed14(-0.71414f) };	// Cr -> G

	__asm
	{
		// register roles: edi=out, esi=y, eax=cb, ebx=cr, ecx=pixels left, edx=step
		mov			edi, [out];
		mov			esi, [y];
		mov			eax, [pcb];
		mov			ebx, [pcr];
		mov			ecx, [count];
		mov			edx, [step];

		// Empty (or negative) row: bail out before touching any memory.
		// Without this check the tail path below would see a negative
		// remainder and store three pixels.
		test		ecx, ecx;
		jle			ycbcr_end;

ycbcr_main:
		// load 4 samples from each plane into the high byte of each word
		pxor		mm0, mm0;
		pxor		mm1, mm1;
		pxor		mm2, mm2;
		movq		mm3, [bias1];
		punpcklbw	mm0, [esi];	// 4xY (shifted left by 8)
		punpcklbw	mm1, [eax];	// 4xCb (shifted left by 8)
		punpcklbw	mm2, [ebx];	// 4xCr (shifted left by 8)
		paddw		mm1, mm3;	// (cb-128)<<8, signed
		paddw		mm2, mm3;	// (cr-128)<<8, signed
		movq		mm4, [scb1];
		movq		mm5, [scr1];
		psrlw		mm0, 2;		// y<<6
		pmulhw		mm4, mm1;	// cb*scb1
		pmulhw		mm5, mm2;	// cr*scr1
		paddw		mm0, [bias2];	// y + rounding bias
		pmulhw		mm1, [scb0];	// cb*scb0
		pmulhw		mm2, [scr0];	// cr*scr0
		paddw		mm4, mm0;	// y+cb*scb1 (b)
		paddw		mm5, mm0;	// y+cr*scr1
		paddw		mm2, mm0;	// y+cr*scr0 (r)
		pcmpeqb		mm3, mm3;	// dummy alpha (all ones; packs to 0 below)
		paddw		mm5, mm1;	// y+cr*scr1+cb*scb0 (g)

		// shift down (drop the 6 fractional bits)
		psraw		mm4, 6;
		psraw		mm2, 6;
		psraw		mm5, 6;

		// transpose to interleave r,g,b,alpha
		movq		mm0, mm2;
		punpcklwd	mm2, mm5;	// (r0,g0,r1,g1)
		punpckhwd	mm0, mm5;	// (r2,g2,r3,g3)
		movq		mm1, mm4;
		punpcklwd	mm4, mm3;	// (b0,x0,b1,x1)
		punpckhwd	mm1, mm3;	// (b2,x2,b3,x3)

		movq		mm3, mm2;
		punpckldq	mm2, mm4;	// (r0,g0,b0,x0)
		punpckhdq	mm3, mm4;	// (r1,g1,b1,x1)
		movq		mm5, mm0;
		punpckldq	mm0, mm1;	// (r2,g2,b2,x2)
		punpckhdq	mm5, mm1;	// (r3,g3,b3,x3)

		// pack to 8 bits (signed words -> unsigned bytes with saturation)
		packuswb	mm2, mm2;	// pix0
		packuswb	mm3, mm3;	// pix1
		packuswb	mm0, mm0;	// pix2
		packuswb	mm5, mm5;	// pix3

		// write
		sub			ecx, 4;
		js			ycbcr_tail;	// fewer than 4 pixels were left

		// full group of four; movd/lea leave the flags from the sub intact
		movd		[edi], mm2;
		movd		[edi+edx], mm3;
		lea			edi, [edi+2*edx];
		movd		[edi], mm0;
		movd		[edi+edx], mm5;
		lea			edi, [edi+2*edx];
		jz			ycbcr_end; // result from sub
		add			esi, 4;
		add			eax, 4;
		add			ebx, 4;
		// no `short` override here: the loop body is far larger than the
		// 127-byte reach of a rel8 branch
		jmp			ycbcr_main;

ycbcr_tail:
		// ecx is (pixels left - 4), i.e. -3..-1; after the add it is
		// (pixels left - 1), so zero means pix0 is the last pixel
		add			ecx, 3;
		movd		[edi], mm2;	// movd does not alter the flags from the add
		jz			ycbcr_end;
		movd		[edi+edx], mm3;
		dec			ecx;
		jz			ycbcr_end;
		movd		[edi+2*edx], mm0;

ycbcr_end:
		emms;	// leave MMX state so x87 code can run afterwards
	}
}

