// ----------------- original code using intrinsics ----------------- static __forceinline __m64 LoadPixel(const Pixel* p) { return _mm_unpacklo_pi8(_mm_cvtsi32_si64(p->v), _mm_setzero_si64()); } static __forceinline __m64 AddLowAndHigh(__m64 a, __m64 b) { __m64 lh = _mm_unpacklo_pi32(a, b); // low halves __m64 hh = _mm_unpackhi_pi32(a, b); // high halves return _mm_add_pi32(lh, hh); } static __forceinline __m64 DoubleDot(__m64 a, __m64 b, __m64 x) { return AddLowAndHigh(_mm_madd_pi16(a, x), _mm_madd_pi16(b, x)); } static __forceinline __m64 AverageInts(__m64 a, __m64 b) { return _mm_srai_pi32(_mm_add_pi32(a, b), 1); } static void MatchColorsBlockIntrin(const Pixel *block, const Pixel *color, sU32 &blockMask) { static const __m64 maskc0 = { 0xfffffff8fffffffe }; static const __m64 maskh = { 0x0000000400000001 }; static const __m64 maskc3 = { 0x0000000800000002 }; __m64 col0 = LoadPixel(color+0); __m64 col1 = LoadPixel(color+1); __m64 dir = _mm_sub_pi16(col0, col1); __m64 stop01 = DoubleDot(col0, col1, dir); __m64 col2 = LoadPixel(color+2); __m64 col3 = LoadPixel(color+3); __m64 stop23 = DoubleDot(col2, col3, dir); __m64 c3c0p = AverageInts(stop01, stop23); __m64 halfp = AverageInts(_mm_unpackhi_pi32(stop23, stop23), stop23); __m64 c0c = _mm_unpackhi_pi32(c3c0p, c3c0p); __m64 c3c = _mm_unpacklo_pi32(c3c0p, c3c0p); __m64 halfc = _mm_unpacklo_pi32(halfp, halfp); __m64 mask = _mm_setzero_si64(); for(sInt i=14;i>=0;i-=2) { __m64 pix0 = LoadPixel(block + i + 0); __m64 pix1 = LoadPixel(block + i + 1); __m64 dots = DoubleDot(pix0, pix1, dir); mask = _mm_slli_pi32(mask, 4); mask = _mm_add_pi32(mask, _mm_and_si64(_mm_cmpgt_pi32(c0c, dots), maskc0)); mask = _mm_add_pi32(mask, _mm_and_si64(_mm_cmpgt_pi32(halfc, dots), maskh)); mask = _mm_add_pi32(mask, _mm_and_si64(_mm_cmpgt_pi32(c3c, dots), maskc3)); } mask = _mm_or_si64(mask, _mm_unpackhi_pi32(mask, mask)); blockMask = _mm_cvtsi64_si32(mask); _mm_empty(); } // ----------------- generated code ----------------- ; COMDAT 
?MatchColorsBlockIntrin@@YAXPBTPixel@@0AAI@Z _TEXT SEGMENT _halfc$ = -16 ; size = 8 _c3c$ = -8 ; size = 8 _blockMask$ = 8 ; size = 4 ?MatchColorsBlockIntrin@@YAXPBTPixel@@0AAI@Z PROC ; MatchColorsBlockIntrin, COMDAT ; _block$ = ecx ; _color$ = eax ; 212 : { push ebp mov ebp, esp and esp, -8 ; fffffff8H sub esp, 16 ; 00000010H ; 213 : static const __m64 maskc0 = { 0xfffffff8fffffffe }; ; 214 : static const __m64 maskh = { 0x0000000400000001 }; ; 215 : static const __m64 maskc3 = { 0x0000000800000002 }; ; 216 : ; 217 : __m64 col0 = LoadPixel(color+0); mov edx, DWORD PTR [eax] pxor mm0, mm0 movd mm1, edx ; 218 : __m64 col1 = LoadPixel(color+1); mov edx, DWORD PTR [eax+4] movq mm2, mm1 movq mm1, mm0 punpcklbw mm2, mm1 movd mm1, edx ; 219 : __m64 dir = _mm_sub_pi16(col0, col1); ; 220 : __m64 stop01 = DoubleDot(col0, col1, dir); ; 221 : ; 222 : __m64 col2 = LoadPixel(color+2); ; 223 : __m64 col3 = LoadPixel(color+3); mov edx, DWORD PTR [eax+12] mov eax, DWORD PTR [eax+8] movq mm3, mm1 movq mm1, mm0 punpcklbw mm3, mm1 movq mm1, mm2 movq mm4, mm3 psubw mm1, mm4 movq mm4, mm1 pmaddwd mm3, mm4 pmaddwd mm2, mm4 movq mm4, mm2 movq mm5, mm0 movd mm2, edx punpcklbw mm2, mm5 ; 224 : __m64 stop23 = DoubleDot(col2, col3, dir); movq mm5, mm1 pmaddwd mm2, mm5 movd mm5, eax movq mm6, mm0 punpcklbw mm5, mm6 movq mm6, mm1 pmaddwd mm5, mm6 movq mm6, mm5 movq mm7, mm2 punpckldq mm6, mm7 punpckhdq mm5, mm2 paddd mm6, mm5 movq mm2, mm6 movq mm5, mm4 movq mm6, mm3 punpckldq mm5, mm6 punpckhdq mm4, mm3 movq mm3, mm4 paddd mm5, mm3 ; 225 : ; 226 : __m64 c3c0p = AverageInts(stop01, stop23); movq mm3, mm2 paddd mm5, mm3 movq mm3, mm5 psrad mm3, 1 ; 227 : __m64 halfp = AverageInts(_mm_unpackhi_pi32(stop23, stop23), stop23); movq mm4, mm2 movq mm5, mm2 punpckhdq mm4, mm5 paddd mm4, mm2 movq mm2, mm4 psrad mm2, 1 ; 228 : ; 229 : __m64 c0c = _mm_unpackhi_pi32(c3c0p, c3c0p); movq mm4, mm3 movq mm5, mm3 punpckhdq mm4, mm5 movq mm5, mm4 ; 230 : __m64 c3c = _mm_unpacklo_pi32(c3c0p, c3c0p); movq mm4, mm3 
punpckldq mm4, mm3 movq MMWORD PTR _c3c$[esp+16], mm4 ; 231 : __m64 halfc = _mm_unpacklo_pi32(halfp, halfp); movq mm3, mm2 punpckldq mm3, mm2 movq MMWORD PTR _halfc$[esp+16], mm3 ; 232 : __m64 mask = _mm_setzero_si64(); movq mm4, mm0 ; 233 : ; 234 : for(sInt i=14;i>=0;i-=2) mov eax, 14 ; 0000000eH npad 1 $LL3@MatchColor: sub eax, 2 ; 235 : { ; 236 : __m64 pix0 = LoadPixel(block + i + 0); ; 237 : __m64 pix1 = LoadPixel(block + i + 1); mov edx, DWORD PTR [ecx+eax*4+12] movd mm2, edx mov edx, DWORD PTR [ecx+eax*4+8] movq mm3, mm0 punpcklbw mm2, mm3 ; 238 : __m64 dots = DoubleDot(pix0, pix1, dir); movq mm3, mm1 pmaddwd mm2, mm3 movq mm6, mm0 movd mm3, edx punpcklbw mm3, mm6 movq mm6, mm1 pmaddwd mm3, mm6 movq mm6, mm3 movq mm7, mm2 punpckldq mm6, mm7 punpckhdq mm3, mm2 paddd mm6, mm3 movq mm2, mm6 ; 239 : ; 240 : mask = _mm_slli_pi32(mask, 4); movq mm3, mm4 pslld mm3, 4 ; 241 : mask = _mm_add_pi32(mask, _mm_and_si64(_mm_cmpgt_pi32(c0c, dots), maskc0)); movq mm4, mm5 pcmpgtd mm4, mm6 movq mm6, MMWORD PTR ?maskc0@?1??MatchColorsBlockIntrin@@YAXPBTPixel@@0AAI@Z@4T__m64@@B pand mm4, mm6 paddd mm3, mm4 ; 242 : mask = _mm_add_pi32(mask, _mm_and_si64(_mm_cmpgt_pi32(halfc, dots), maskh)); movq mm4, MMWORD PTR _halfc$[esp+16] movq mm6, mm2 pcmpgtd mm4, mm6 movq mm6, MMWORD PTR ?maskh@?1??MatchColorsBlockIntrin@@YAXPBTPixel@@0AAI@Z@4T__m64@@B pand mm4, mm6 paddd mm3, mm4 ; 243 : mask = _mm_add_pi32(mask, _mm_and_si64(_mm_cmpgt_pi32(c3c, dots), maskc3)); movq mm4, MMWORD PTR _c3c$[esp+16] pcmpgtd mm4, mm2 movq mm2, mm4 movq mm4, MMWORD PTR ?maskc3@?1??MatchColorsBlockIntrin@@YAXPBTPixel@@0AAI@Z@4T__m64@@B pand mm2, mm4 paddd mm3, mm2 movq mm4, mm3 jns $LL3@MatchColor ; 244 : } ; 245 : ; 246 : mask = _mm_or_si64(mask, _mm_unpackhi_pi32(mask, mask)); ; 247 : blockMask = _mm_cvtsi64_si32(mask); mov ecx, DWORD PTR _blockMask$[ebp] movq mm0, mm4 movq mm1, mm4 movq mm2, mm4 punpckhdq mm1, mm2 por mm0, mm1 movd eax, mm0 mov DWORD PTR [ecx], eax ; 248 : _mm_empty(); emms ; 249 : } mov 
esp, ebp pop ebp ret 0 ?MatchColorsBlockIntrin@@YAXPBTPixel@@0AAI@Z ENDP ; MatchColorsBlockIntrin _TEXT ENDS

// ----------------- implemented by hand -----------------

// Hand-written MMX inline-assembly version. It performs the same sequence of
// operations as MatchColorsBlockIntrin: project color[0..3] and block[0..15]
// onto dir = color[0] - color[1], compare each pixel's projection against
// three averaged thresholds, and pack the per-pixel results into blockMask.
//
// Register roles, taken from the code and its inline comments:
//   mm7 = zero (used to widen bytes to 16-bit lanes)
//   mm0 = dir
//   mm1 = accumulated mask (from the pxor before the loop onwards)
//   ecx = loop counter, 14 down to 0 in steps of 2
// The three thresholds are kept in the locals c0c, c3c and halfc and are
// reloaded on every loop iteration.
static void MatchColorsBlockMMX(const Pixel *block,const Pixel *color,sU32 &blockMask)
{
  // Low dword applies to the even pixel of a pair, high dword to the odd one.
  static const __m64 maskc0 = { 0xfffffff8fffffffe };
  static const __m64 maskh = { 0x0000000400000001 };
  static const __m64 maskc3 = { 0x0000000800000002 };
  __m64 c0c,c3c,halfc;

  __asm
  {
    mov esi, [color];
    mov edi, [block];
    pxor mm7, mm7;

    movd mm0, [esi+0];
    punpcklbw mm0, mm7;
    movd mm1, [esi+4];
    punpcklbw mm1, mm7;

    movq mm2, mm0;
    psubw mm0, mm1; // mm0=dir

    pmaddwd mm2, mm0; // dp0
    pmaddwd mm1, mm0; // dp1
    movq mm3, mm2;
    punpckldq mm2, mm1;
    punpckhdq mm3, mm1;
    paddd mm2, mm3; // mm2=stop01

    movd mm1, [esi+8];
    punpcklbw mm1, mm7;
    movd mm3, [esi+12];
    punpcklbw mm3, mm7;

    pmaddwd mm1, mm0; // dp0
    pmaddwd mm3, mm0; // dp1
    movq mm4, mm1;
    punpckldq mm1, mm3;
    punpckhdq mm4, mm3;
    paddd mm1, mm4; // mm1=stop23
    movq mm4, mm1;

    paddd mm2, mm1;
    punpckhdq mm4, mm4;
    psrad mm2, 1; // mm2=c3c0p
    paddd mm4, mm1;
    movq mm1, mm2;
    psrad mm4, 1; // mm4=halfp

    punpckhdq mm1, mm1; // mm1=c0c
    punpckldq mm2, mm2; // mm2=c3c
    punpckldq mm4, mm4; // mm4=halfc
    movq [c0c], mm1;
    movq [c3c], mm2;
    movq [halfc], mm4;

    pxor mm1, mm1; // mm1=mask
    mov ecx, 14;

matchlp:
    pslld mm1, 4;

    movd mm2, [edi+ecx*4+0];
    punpcklbw mm2, mm7;
    movd mm3, [edi+ecx*4+4];
    punpcklbw mm3, mm7;

    pmaddwd mm2, mm0; // dp0
    pmaddwd mm3, mm0; // dp1
    movq mm4, mm2;
    punpckldq mm2, mm3;
    punpckhdq mm4, mm3;
    paddd mm2, mm4; // mm2=dots

    movq mm3, [c0c];
    movq mm4, [halfc];
    movq mm5, [c3c];
    pcmpgtd mm3, mm2;
    pcmpgtd mm4, mm2;
    pcmpgtd mm5, mm2;
    pand mm3, [maskc0];
    pand mm4, [maskh];
    pand mm5, [maskc3];
    paddd mm1, mm3;
    paddd mm1, mm4;
    paddd mm1, mm5;

    // jns uses the sign flag set by the sub: loop while ecx is still >= 0.
    sub ecx, 2;
    jns matchlp;

    // OR the high lane into the low lane and store the low 32 bits through
    // the blockMask reference.
    mov eax, [blockMask];
    movq mm0, mm1;
    punpckhdq mm0, mm0;
    por mm0, mm1;
    movd [eax], mm0;
    emms;
  }
}

// ----------------- line-by-line translation of asm code to intrinsics -----------------

// Statement-for-statement translation of MatchColorsBlockMMX into MMX
// intrinsics. The locals mm0..mm7 are named after the registers of the asm
// version and play the same roles (mm7 = zero, mm0 = dir, mm1 = mask inside
// the loop). mm6 is declared but never used.
static void MatchColorsBlockIntrin2(const Pixel *block,const Pixel *color,sU32 &blockMask)
{
  // Low dword applies to the even pixel of a pair, high dword to the odd one.
  static const __m64 maskc0 = { 0xfffffff8fffffffe };
  static const __m64 maskh = { 0x0000000400000001 };
  static const __m64 maskc3 = { 0x0000000800000002 };
  register __m64 mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7;
  __m64 c0c,c3c,halfc;

  mm7 = _mm_setzero_si64();
  mm0 = _mm_cvtsi32_si64(color[0].v);
  mm0 = _mm_unpacklo_pi8(mm0, mm7);
  mm1 = _mm_cvtsi32_si64(color[1].v);
  mm1 = _mm_unpacklo_pi8(mm1, mm7);

  mm2 = mm0;
  mm0 = _mm_sub_pi16(mm0, mm1); // mm0=dir

  mm2 = _mm_madd_pi16(mm2, mm0); // dp0
  mm1 = _mm_madd_pi16(mm1, mm0); // dp1
  mm3 = mm2;
  mm2 = _mm_unpacklo_pi32(mm2, mm1);
  mm3 = _mm_unpackhi_pi32(mm3, mm1);
  mm2 = _mm_add_pi32(mm2, mm3); // mm2=stop01

  mm1 = _mm_cvtsi32_si64(color[2].v);
  mm1 = _mm_unpacklo_pi8(mm1, mm7);
  mm3 = _mm_cvtsi32_si64(color[3].v);
  mm3 = _mm_unpacklo_pi8(mm3, mm7);

  mm1 = _mm_madd_pi16(mm1, mm0); // dp0
  mm3 = _mm_madd_pi16(mm3, mm0); // dp1
  mm4 = mm1;
  mm1 = _mm_unpacklo_pi32(mm1, mm3);
  mm4 = _mm_unpackhi_pi32(mm4, mm3);
  mm1 = _mm_add_pi32(mm1, mm4); // mm1=stop23
  mm4 = mm1;

  mm2 = _mm_add_pi32(mm2, mm1);
  mm4 = _mm_unpackhi_pi32(mm4, mm4);
  mm2 = _mm_srai_pi32(mm2, 1); // mm2=c3c0p
  mm4 = _mm_add_pi32(mm4, mm1);
  mm1 = mm2;
  mm4 = _mm_srai_pi32(mm4, 1); // mm4=halfp

  mm1 = _mm_unpackhi_pi32(mm1, mm1); // mm1=c0c
  mm2 = _mm_unpacklo_pi32(mm2, mm2); // mm2=c3c
  mm4 = _mm_unpacklo_pi32(mm4, mm4); // mm4=halfc
  c0c = mm1;
  c3c = mm2;
  halfc = mm4;

  mm1 = _mm_setzero_si64(); // mm1=mask from here on
  for(sInt i=14;i>=0;i-=2)
  {
    mm1 = _mm_slli_pi32(mm1, 4);

    mm2 = _mm_cvtsi32_si64(block[i+0].v);
    mm2 = _mm_unpacklo_pi8(mm2, mm7);
    mm3 = _mm_cvtsi32_si64(block[i+1].v);
    mm3 = _mm_unpacklo_pi8(mm3, mm7);

    mm2 = _mm_madd_pi16(mm2, mm0); // dp0
    mm3 = _mm_madd_pi16(mm3, mm0); // dp1
    mm4 = mm2;
    mm2 = _mm_unpacklo_pi32(mm2, mm3);
    mm4 = _mm_unpackhi_pi32(mm4, mm3);
    mm2 = _mm_add_pi32(mm2, mm4); // mm2=dots

    mm3 = c0c;
    mm4 = halfc;
    mm5 = c3c;
    mm3 = _mm_cmpgt_pi32(mm3, mm2);
    mm4 = _mm_cmpgt_pi32(mm4, mm2);
    mm5 = _mm_cmpgt_pi32(mm5, mm2);
    mm3 = _mm_and_si64(mm3, maskc0);
    mm4 = _mm_and_si64(mm4, maskh);
    mm5 = _mm_and_si64(mm5, maskc3);
    mm1 = _mm_add_pi32(mm1, mm3);
    mm1 = _mm_add_pi32(mm1, mm4);
    mm1 = _mm_add_pi32(mm1, mm5);
  }

  // OR the high lane into the low lane and store the low 32 bits.
  mm0 = mm1;
  mm0 = _mm_unpackhi_pi32(mm0, mm0);
  mm0 = _mm_or_si64(mm0, mm1);
  blockMask = _mm_cvtsi64_si32(mm0);
  _mm_empty();
}

// ----------------- generated code -----------------
; COMDAT ?MatchColorsBlockIntrin2@@YAXPBTPixel@@0AAI@Z _TEXT SEGMENT _halfc$ = -16 ; size = 8 _c3c$ = -8 ; size = 8 _blockMask$ = 8 ; size = 4 ?MatchColorsBlockIntrin2@@YAXPBTPixel@@0AAI@Z PROC ; MatchColorsBlockIntrin2, COMDAT ; _block$ = ecx ; _color$ = eax ; 252 : { push ebp mov ebp, esp and esp, -8 ; fffffff8H sub esp, 16 ; 00000010H ; 253 : static const __m64 maskc0 = { 0xfffffff8fffffffe }; ; 254 : static const __m64 maskh = { 0x0000000400000001 }; ; 255 : static const __m64 maskc3 = { 0x0000000800000002 }; ; 256 : register __m64 mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7; ; 257 : __m64 c0c,c3c,halfc; ; 258 : ; 259 : mm7 = _mm_setzero_si64(); ; 260 : mm0 = _mm_cvtsi32_si64(color[0].v); mov edx, DWORD PTR [eax] pxor mm0, mm0 movd mm1, edx ; 261 : mm0 = _mm_unpacklo_pi8(mm0, mm7); ; 262 : mm1 = _mm_cvtsi32_si64(color[1].v); mov edx, DWORD PTR [eax+4] movq mm2, mm0 punpcklbw mm1, mm2 movd mm2, edx ; 263 : mm1 = _mm_unpacklo_pi8(mm1, mm7); ; 264 : ; 265 : mm2 = mm0; ; 266 : mm0 = _mm_sub_pi16(mm0, mm1); // mm0=dir ; 267 : ; 268 : mm2 = _mm_madd_pi16(mm2, mm0); // dp0 ; 269 : mm1 = _mm_madd_pi16(mm1, mm0); // dp1 ; 270 : mm3 = mm2; ; 271 : mm2 = _mm_unpacklo_pi32(mm2, mm1); ; 272 : mm3 = _mm_unpackhi_pi32(mm3, mm1); ; 273 : mm2 = _mm_add_pi32(mm2, mm3); // mm2=stop01 ; 274 : ; 275 : mm1 = _mm_cvtsi32_si64(color[2].v); mov edx, DWORD PTR [eax+8] ; 276 : mm1 = _mm_unpacklo_pi8(mm1, mm7); ; 277 : mm3 = _mm_cvtsi32_si64(color[3].v); mov eax, DWORD PTR [eax+12] movq mm3, mm0 punpcklbw mm2, mm3 movq mm3, mm1 movq mm4, mm2 psubw mm1, mm4 movq mm4, mm1 pmaddwd mm3, mm4 pmaddwd mm2, mm4 movq mm4, mm3 movq mm5, mm2 punpckldq mm4, mm5 punpckhdq mm3, mm2 
paddd mm4, mm3 movq mm3, mm0 movd mm2, edx punpcklbw mm2, mm3 ; 278 : mm3 = _mm_unpacklo_pi8(mm3, mm7); ; 279 : ; 280 : mm1 = _mm_madd_pi16(mm1, mm0); // dp0 movq mm3, mm1 pmaddwd mm2, mm3 movd mm3, eax movq mm5, mm0 punpcklbw mm3, mm5 ; 281 : mm3 = _mm_madd_pi16(mm3, mm0); // dp1 movq mm5, mm1 pmaddwd mm3, mm5 ; 282 : mm4 = mm1; ; 283 : mm1 = _mm_unpacklo_pi32(mm1, mm3); movq mm5, mm2 movq mm6, mm3 punpckldq mm5, mm6 ; 284 : mm4 = _mm_unpackhi_pi32(mm4, mm3); punpckhdq mm2, mm3 ; 285 : mm1 = _mm_add_pi32(mm1, mm4); // mm2=stop23 paddd mm5, mm2 movq mm2, mm5 ; 286 : mm4 = mm1; ; 287 : ; 288 : mm2 = _mm_add_pi32(mm2, mm1); movq mm3, mm4 movq mm4, mm2 paddd mm3, mm4 ; 289 : mm4 = _mm_unpackhi_pi32(mm4, mm4); ; 290 : mm2 = _mm_srai_pi32(mm2, 1); // mm2=c3c0p psrad mm3, 1 punpckhdq mm4, mm5 ; 291 : mm4 = _mm_add_pi32(mm4, mm1); paddd mm4, mm2 movq mm2, mm4 ; 292 : mm1 = mm2; ; 293 : mm4 = _mm_srai_pi32(mm4, 1); // mm4=halfp psrad mm2, 1 ; 294 : ; 295 : mm1 = _mm_unpackhi_pi32(mm1, mm1); // mm1=c0c movq mm4, mm3 movq mm5, mm3 punpckhdq mm4, mm5 ; 296 : mm2 = _mm_unpacklo_pi32(mm2, mm2); // mm2=c3c ; 297 : mm4 = _mm_unpacklo_pi32(mm4, mm4); // mm4=halfc ; 298 : c0c = mm1; movq mm5, mm4 movq mm4, mm3 punpckldq mm4, mm3 ; 299 : c3c = mm2; movq MMWORD PTR _c3c$[esp+16], mm4 movq mm3, mm2 punpckldq mm3, mm2 ; 300 : halfc = mm4; movq MMWORD PTR _halfc$[esp+16], mm3 ; 301 : ; 302 : mm1 = _mm_setzero_si64(); movq mm4, mm0 ; 303 : for(sInt i=14;i>=0;i-=2) mov eax, 14 ; 0000000eH npad 3 $LL3@MatchColor: sub eax, 2 ; 304 : { ; 305 : mm1 = _mm_slli_pi32(mm1, 4); ; 306 : ; 307 : mm2 = _mm_cvtsi32_si64(block[i+0].v); mov edx, DWORD PTR [ecx+eax*4+8] movd mm2, edx ; 308 : mm2 = _mm_unpacklo_pi8(mm2, mm7); ; 309 : mm3 = _mm_cvtsi32_si64(block[i+1].v); mov edx, DWORD PTR [ecx+eax*4+12] movq mm3, mm0 punpcklbw mm2, mm3 ; 310 : mm3 = _mm_unpacklo_pi8(mm3, mm7); ; 311 : ; 312 : mm2 = _mm_madd_pi16(mm2, mm0); // dp0 movq mm3, mm1 pmaddwd mm2, mm3 movq mm6, mm0 movd mm3, edx punpcklbw mm3, 
mm6 ; 313 : mm3 = _mm_madd_pi16(mm3, mm0); // dp1 movq mm6, mm1 pmaddwd mm3, mm6 ; 314 : mm4 = mm2; ; 315 : mm2 = _mm_unpacklo_pi32(mm2, mm3); movq mm6, mm2 movq mm7, mm3 punpckldq mm6, mm7 ; 316 : mm4 = _mm_unpackhi_pi32(mm4, mm3); punpckhdq mm2, mm3 ; 317 : mm2 = _mm_add_pi32(mm2, mm4); // mm2=dots paddd mm6, mm2 movq mm2, mm6 movq mm3, mm4 pslld mm3, 4 ; 318 : ; 319 : mm3 = c0c; ; 320 : mm4 = halfc; ; 321 : mm5 = c3c; ; 322 : mm3 = _mm_cmpgt_pi32(mm3, mm2); movq mm4, mm5 pcmpgtd mm4, mm6 ; 323 : mm4 = _mm_cmpgt_pi32(mm4, mm2); ; 324 : mm5 = _mm_cmpgt_pi32(mm5, mm2); ; 325 : mm3 = _mm_and_si64(mm3, maskc0); movq mm6, MMWORD PTR ?maskc0@?1??MatchColorsBlockIntrin2@@YAXPBTPixel@@0AAI@Z@4T__m64@@B pand mm4, mm6 ; 326 : mm4 = _mm_and_si64(mm4, maskh); ; 327 : mm5 = _mm_and_si64(mm5, maskc3); ; 328 : mm1 = _mm_add_pi32(mm1, mm3); paddd mm3, mm4 movq mm4, MMWORD PTR _halfc$[esp+16] movq mm6, mm2 pcmpgtd mm4, mm6 movq mm6, MMWORD PTR ?maskh@?1??MatchColorsBlockIntrin2@@YAXPBTPixel@@0AAI@Z@4T__m64@@B pand mm4, mm6 ; 329 : mm1 = _mm_add_pi32(mm1, mm4); paddd mm3, mm4 movq mm4, MMWORD PTR _c3c$[esp+16] pcmpgtd mm4, mm2 movq mm2, mm4 movq mm4, MMWORD PTR ?maskc3@?1??MatchColorsBlockIntrin2@@YAXPBTPixel@@0AAI@Z@4T__m64@@B pand mm2, mm4 ; 330 : mm1 = _mm_add_pi32(mm1, mm5); paddd mm3, mm2 movq mm4, mm3 jns $LL3@MatchColor ; 331 : } ; 332 : ; 333 : mm0 = mm1; ; 334 : mm0 = _mm_unpackhi_pi32(mm0, mm0); ; 335 : mm0 = _mm_or_si64(mm0, mm1); ; 336 : blockMask = _mm_cvtsi64_si32(mm0); mov ecx, DWORD PTR _blockMask$[ebp] movq mm0, mm4 movq mm1, mm4 punpckhdq mm0, mm1 por mm0, mm1 movd eax, mm0 mov DWORD PTR [ecx], eax ; 337 : _mm_empty(); emms ; 338 : } mov esp, ebp pop ebp ret 0 ?MatchColorsBlockIntrin2@@YAXPBTPixel@@0AAI@Z ENDP ; MatchColorsBlockIntrin2 _TEXT ENDS