Changeset 5633 for pjproject/trunk/third_party/yuv/source/row_gcc.cc
Timestamp:
- Jul 28, 2017 2:51:44 AM (7 years ago)

File:
- 1 edited

Legend:
- Unmodified
- Added
- Removed
pjproject/trunk/third_party/yuv/source/row_gcc.cc
r5358 → r5633

The leading "// VERSION 2" marker comment is dropped (the "Copyright 2011 The LibYuv Project Authors" header stays), and the conversion constants are reflowed to clang-format style; the coefficient values themselves are unchanged:

// Constants for ARGB
static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                        13, 65, 33, 0, 13, 65, 33, 0};

// JPeg full range.
static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                         15, 75, 38, 0, 15, 75, 38, 0};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                        112, -74, -38, 0, 112, -74, -38, 0};

static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                         127, -84, -43, 0, 127, -84, -43, 0};

static vec8 kARGBToV = {
    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                         -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                        0, 33, 65, 13, 0, 33, 65, 13};

static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                        0, -38, -74, 112, 0, -38, -74, 112};

static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                        0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                        33, 65, 13, 0, 33, 65, 13, 0};

static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                        -38, -74, 112, 0, -38, -74, 112, 0};

static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                        112, -94, -18, 0, 112, -94, -18, 0};
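These weight tables feed pmaddubsw in the row functions further down. As a hypothetical scalar reference (not code from this changeset; it assumes libyuv's little-endian B,G,R,A byte order for ARGB), the SSSE3 Y path per pixel amounts to:

    #include <stdint.h>

    // BT.601 studio-swing luma from the kARGBToY weights (coefficients
    // scaled by 128) plus the kAddY16 offset; sketch only.
    static uint8_t ArgbPixelToY(const uint8_t* argb) {
      uint8_t b = argb[0], g = argb[1], r = argb[2];
      return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
    }

The BGRA/ABGR/RGBA tables are the same weights rotated to match each format's byte order.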
// Constants for RGBA.
static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                        0, 13, 65, 33, 0, 13, 65, 33};

static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                        0, 112, -74, -38, 0, 112, -74, -38};

static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                        0, -18, -94, 112, 0, -18, -94, 112};

static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                        16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 7 bit fixed point 0.5.
static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                          128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                            0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

…

// Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u,
                                        6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
                                      8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
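All of these tables drive pshufb: each output byte copies src[mask[i]] when mask[i] is below 128, and becomes zero when the high bit is set. A minimal scalar sketch of that semantic (hypothetical helper, not from this file; the real RGB24-to-ARGB row separately forces the alpha lanes to 0xff afterwards):

    #include <stdint.h>

    // pshufb semantics for one 16-byte vector.
    static void ShuffleBytes16(const uint8_t src[16], const uint8_t mask[16],
                               uint8_t dst[16]) {
      for (int i = 0; i < 16; ++i)
        dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i]];
    }

With kShuffleMaskRGB24ToARGB, bytes {0,1,2}, {3,4,5}, ... (four packed RGB24 pixels) land in the B,G,R lanes of four ARGB dwords.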
// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8,  10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4,  4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9,  11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5,  5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};
// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3

…

A long run of whitespace-only hunks follows: through the intervening SSE2/SSSE3 row converters (grayscale-to-ARGB and the RGB24/RAW/RGB565/ARGB1555/ARGB4444 conversions, up to ARGBToRGB565Row_SSE2), clang-format re-pads the inline-asm "1:" loop-label string literals so the "\n" columns line up; no instruction changes.

ARGBToRGB565DitherRow_SSE2 is then reflowed to one parameter per line with a re-indented asm body:

void ARGBToRGB565DitherRow_SSE2(const uint8* src,
                                uint8* dst,
                                const uint32 dither4,
                                int width) {
  asm volatile(
      "movd %3,%%xmm6 \n"
      "punpcklbw %%xmm6,%%xmm6 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "punpcklwd %%xmm6,%%xmm6 \n"
      "punpckhwd %%xmm7,%%xmm7 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psrld $0x1b,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1a,%%xmm4 \n"
      "pslld $0x5,%%xmm4 \n"
554 "pcmpeqb %%xmm5,%%xmm5 \n" 555 "pslld $0xb,%%xmm5 \n" 556 557 LABELALIGN 558 "1: \n" 559 "movdqu (%0),%%xmm0 \n" 560 "paddusb %%xmm6,%%xmm0 \n" 561 "movdqa %%xmm0,%%xmm1 \n" 562 "movdqa %%xmm0,%%xmm2 \n" 563 "pslld $0x8,%%xmm0 \n" 564 "psrld $0x3,%%xmm1 \n" 565 "psrld $0x5,%%xmm2 \n" 566 "psrad $0x10,%%xmm0 \n" 567 "pand %%xmm3,%%xmm1 \n" 568 "pand %%xmm4,%%xmm2 \n" 569 "pand %%xmm5,%%xmm0 \n" 570 "por %%xmm2,%%xmm1 \n" 571 "por %%xmm1,%%xmm0 \n" 572 "packssdw %%xmm0,%%xmm0 \n" 573 "lea 0x10(%0),%0 \n" 574 "movq %%xmm0,(%1) \n" 575 "lea 0x8(%1),%1 \n" 576 "sub $0x4,%2 \n" 577 "jg 1b \n" 578 : "+r"(src), // %0 579 "+r"(dst), // %1 580 "+r"(width) // %2 581 : "m"(dither4) // %3 582 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", 583 "xmm7"); 616 584 } 617 585 618 586 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 619 void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst, 620 const uint32 dither4, int width) { 621 asm volatile ( 622 "vbroadcastss %3,%%xmm6 \n" 623 "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" 624 "vpermq $0xd8,%%ymm6,%%ymm6 \n" 625 "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" 626 "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" 627 "vpsrld $0x1b,%%ymm3,%%ymm3 \n" 628 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" 629 "vpsrld $0x1a,%%ymm4,%%ymm4 \n" 630 "vpslld $0x5,%%ymm4,%%ymm4 \n" 631 "vpslld $0xb,%%ymm3,%%ymm5 \n" 632 633 LABELALIGN 634 "1: \n" 635 "vmovdqu (%0),%%ymm0 \n" 636 "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" 637 "vpsrld $0x5,%%ymm0,%%ymm2 \n" 638 "vpsrld $0x3,%%ymm0,%%ymm1 \n" 639 "vpsrld $0x8,%%ymm0,%%ymm0 \n" 640 "vpand %%ymm4,%%ymm2,%%ymm2 \n" 641 "vpand %%ymm3,%%ymm1,%%ymm1 \n" 642 "vpand %%ymm5,%%ymm0,%%ymm0 \n" 643 "vpor %%ymm2,%%ymm1,%%ymm1 \n" 644 "vpor %%ymm1,%%ymm0,%%ymm0 \n" 645 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" 646 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 647 "lea 0x20(%0),%0 \n" 648 "vmovdqu %%xmm0,(%1) \n" 649 "lea 0x10(%1),%1 \n" 650 "sub $0x8,%2 \n" 651 "jg 1b \n" 652 "vzeroupper \n" 653 : "+r"(src), // %0 654 "+r"(dst), // %1 655 "+r"(width) // %2 656 : "m"(dither4) // %3 657 : "memory", "cc", 658 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 659 ); 587 void ARGBToRGB565DitherRow_AVX2(const uint8* src, 588 uint8* dst, 589 const uint32 dither4, 590 int width) { 591 asm volatile( 592 "vbroadcastss %3,%%xmm6 \n" 593 "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" 594 "vpermq $0xd8,%%ymm6,%%ymm6 \n" 595 "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" 596 "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" 597 "vpsrld $0x1b,%%ymm3,%%ymm3 \n" 598 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" 599 "vpsrld $0x1a,%%ymm4,%%ymm4 \n" 600 "vpslld $0x5,%%ymm4,%%ymm4 \n" 601 "vpslld $0xb,%%ymm3,%%ymm5 \n" 602 603 LABELALIGN 604 "1: \n" 605 "vmovdqu (%0),%%ymm0 \n" 606 "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" 607 "vpsrld $0x5,%%ymm0,%%ymm2 \n" 608 "vpsrld $0x3,%%ymm0,%%ymm1 \n" 609 "vpsrld $0x8,%%ymm0,%%ymm0 \n" 610 "vpand %%ymm4,%%ymm2,%%ymm2 \n" 611 "vpand %%ymm3,%%ymm1,%%ymm1 \n" 612 "vpand %%ymm5,%%ymm0,%%ymm0 \n" 613 "vpor %%ymm2,%%ymm1,%%ymm1 \n" 614 "vpor %%ymm1,%%ymm0,%%ymm0 \n" 615 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" 616 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 617 "lea 0x20(%0),%0 \n" 618 "vmovdqu %%xmm0,(%1) \n" 619 "lea 0x10(%1),%1 \n" 620 "sub $0x8,%2 \n" 621 "jg 1b \n" 622 "vzeroupper \n" 623 : "+r"(src), // %0 624 "+r"(dst), // %1 625 "+r"(width) // %2 626 : "m"(dither4) // %3 627 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", 628 "xmm7"); 660 629 } 661 630 #endif // HAS_ARGBTORGB565DITHERROW_AVX2 662 663 631 664 632 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) { … … 672 
640 "pcmpeqb %%xmm7,%%xmm7 \n" 673 641 "pslld $0xf,%%xmm7 \n" 674 LABELALIGN 675 "1: \n" 642 643 LABELALIGN 644 "1: \n" 676 645 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 677 646 "movdqa %%xmm0,%%xmm1 \n" … … 709 678 "movdqa %%xmm4,%%xmm3 \n" 710 679 "psrlw $0x8,%%xmm3 \n" 711 LABELALIGN 712 "1: \n" 680 681 LABELALIGN 682 "1: \n" 713 683 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 714 684 "movdqa %%xmm0,%%xmm1 \n" … … 738 708 "movdqa %3,%%xmm4 \n" 739 709 "movdqa %4,%%xmm5 \n" 740 LABELALIGN 741 "1: \n" 710 711 LABELALIGN 712 "1: \n" 742 713 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 743 714 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" … … 776 747 "movdqa %3,%%xmm4 \n" 777 748 "movdqa %4,%%xmm5 \n" 778 LABELALIGN 779 "1: \n" 749 750 LABELALIGN 751 "1: \n" 780 752 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 781 753 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" … … 810 782 #ifdef HAS_ARGBTOYROW_AVX2 811 783 // vpermd for vphaddw + vpackuswb vpermd. 812 static const lvec32 kPermdARGBToY_AVX = { 813 0, 4, 1, 5, 2, 6, 3, 7 814 }; 784 static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; 815 785 816 786 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. … … 820 790 "vbroadcastf128 %4,%%ymm5 \n" 821 791 "vmovdqu %5,%%ymm6 \n" 822 LABELALIGN 823 "1: \n" 792 793 LABELALIGN 794 "1: \n" 824 795 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 825 796 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" … … 861 832 "vbroadcastf128 %4,%%ymm5 \n" 862 833 "vmovdqu %5,%%ymm6 \n" 863 LABELALIGN 864 "1: \n" 834 835 LABELALIGN 836 "1: \n" 865 837 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 866 838 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" … … 897 869 898 870 #ifdef HAS_ARGBTOUVROW_SSSE3 899 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 900 uint8* dst_u, uint8* dst_v, int width) { 871 void ARGBToUVRow_SSSE3(const uint8* src_argb0, 872 int src_stride_argb, 873 uint8* dst_u, 874 uint8* dst_v, 875 int width) { 901 876 asm volatile ( 902 877 "movdqa %5,%%xmm3 \n" … … 904 879 "movdqa %7,%%xmm5 \n" 905 880 "sub %1,%2 \n" 906 LABELALIGN 907 "1: \n" 881 882 LABELALIGN 883 "1: \n" 908 884 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 909 885 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 … … 962 938 // vpshufb for vphaddw + vpackuswb packed to shorts. 
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
void ARGBToUVRow_AVX2(const uint8* src_argb0,
                      int src_stride_argb,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
…

The re-wrapped body of ARGBToUVRow_AVX2 is functionally identical: per iteration it loads 32 ARGB pixels, averages them against the next row and adjacent columns with vpavgb/vshufps, weights with kARGBToV/kARGBToU via vpmaddubsw + vphaddw, shifts with vpsraw $0x8, packs, un-mutates with vpermq and kShufARGBToUV_AVX, biases with kAddUV128, and stores the U and V halves with vextractf128.
"+r"(dst_u), // %11013 "+r"(dst_v), // %21014 "+rm"(width) // %31015 : "r"((intptr_t)(src_stride_argb)), // %41016 "m"(kAddUV128), // %51017 "m"(kARGBToV), // %61018 "m"(kARGBToU), // %71019 "m"(kShufARGBToUV_AVX) // %81020 : "memory", "cc", NACL_R141021 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"1022 );1023 }1024 #endif // HAS_ARGBTOUVROW_AVX21025 1026 #ifdef HAS_ARGBTOUVJROW_AVX21027 void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,1028 uint8* dst_u, uint8* dst_v, int width) {1029 asm volatile (1030 "vbroadcastf128 %5,%%ymm5 \n"1031 "vbroadcastf128 %6,%%ymm6 \n"1032 "vbroadcastf128 %7,%%ymm7 \n"1033 "sub %1,%2 \n"1034 LABELALIGN1035 "1: \n"1036 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"1037 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"1038 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"1039 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"1040 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm01041 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)1042 VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)1043 VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)1044 "lea " MEMLEA(0x80,0) ",%0 \n"1045 "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"1046 "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"1047 "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"1048 "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"1049 "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"1050 "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"1051 1052 "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"1053 "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"1054 "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"1055 "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"1056 "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"1057 "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"1058 1041 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" 1059 1042 "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" … … 1086 1069 1087 1070 #ifdef HAS_ARGBTOUVJROW_SSSE3 1088 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1089 uint8* dst_u, uint8* dst_v, int width) { 1071 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, 1072 int src_stride_argb, 1073 uint8* dst_u, 1074 uint8* dst_v, 1075 int width) { 1090 1076 asm volatile ( 1091 1077 "movdqa %5,%%xmm3 \n" … … 1093 1079 "movdqa %7,%%xmm5 \n" 1094 1080 "sub %1,%2 \n" 1095 LABELALIGN 1096 "1: \n" 1081 1082 LABELALIGN 1083 "1: \n" 1097 1084 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1098 1085 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 … … 1150 1137 1151 1138 #ifdef HAS_ARGBTOUV444ROW_SSSE3 1152 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1139 void ARGBToUV444Row_SSSE3(const uint8* src_argb, 1140 uint8* dst_u, 1141 uint8* dst_v, 1153 1142 int width) { 1154 1143 asm volatile ( … … 1157 1146 "movdqa %6,%%xmm5 \n" 1158 1147 "sub %1,%2 \n" 1159 LABELALIGN 1160 "1: \n" 1148 1149 LABELALIGN 1150 "1: \n" 1161 1151 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1162 1152 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" … … 1210 1200 "movdqa %4,%%xmm5 \n" 1211 1201 "movdqa %3,%%xmm4 \n" 1212 LABELALIGN 1213 "1: \n" 1202 1203 LABELALIGN 1204 "1: \n" 1214 1205 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1215 1206 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" … … 1240 1231 } 1241 1232 1242 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, 1243 uint8* dst_u, uint8* dst_v, int width) { 1233 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, 1234 int src_stride_bgra, 1235 uint8* dst_u, 1236 uint8* dst_v, 1237 int width) { 1244 1238 asm volatile ( 1245 1239 "movdqa %5,%%xmm3 \n" … … 1247 1241 "movdqa %7,%%xmm5 \n" 1248 1242 "sub %1,%2 \n" 1249 LABELALIGN 1250 "1: \n" 1243 1244 LABELALIGN 1245 "1: \n" 1251 1246 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1252 1247 
…

ABGRToYRow_SSSE3, ABGRToUVRow_SSSE3, RGBAToYRow_SSSE3 and RGBAToUVRow_SSSE3 receive the identical treatment, e.g.:

void ABGRToUVRow_SSSE3(const uint8* src_abgr0,
                       int src_stride_abgr,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
…

The READ* helper macros for the SSSE3 YUV-to-RGB converters are re-aligned as well; only the "\n" padding and the line-continuation backslash column move:

// Read 8 UV from 444
#define READYUV444 \
  "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
…

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
  "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
…
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
  "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
…
  "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"

The I411 support starts being deleted here: the READYUV411_TEMP macro — along with its comments noting that a 4-byte read is an msan violation and that pinsrw fails under drmemory — is removed with no replacement:

-// Read 2 UV from 411, upsample to 8 UV.
-#define READYUV411_TEMP \
-  "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \
-  "movd %[temp],%%xmm0 \n" \
-  MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \
-  "movd %[temp],%%xmm1 \n" \
-  "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
-  "punpcklbw %%xmm1,%%xmm0 \n" \
-  "punpcklwd %%xmm0,%%xmm0 \n" \
-  "punpckldq %%xmm0,%%xmm0 \n" \
-  "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
-  "punpcklbw %%xmm4,%%xmm4 \n" \
-  "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

READNV12, READNV21 and READYUY2 are re-aligned in place:

// Read 4 UV from NV12, upsample to 8 UV
#define READNV12 \
  "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
  "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
  "punpcklwd %%xmm0,%%xmm0 \n" \
…

// Read 4 VU from NV21, upsample to 8 UV
#define READNV21 \
  "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
  "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
  "pshufb %[kShuffleNV21], %%xmm0 \n" \
…

// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
#define READYUY2 \
  "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
  "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
  "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
…
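READYUY2 here and READUYVY just below pull 4:2:2 packed data apart; the byte layout the shuffle tables encode is simply (scalar sketch, hypothetical helper):

    #include <stdint.h>

    // One macropixel: YUY2 = Y0 U Y1 V, UYVY = U Y0 V Y1.
    static void Yuy2Unpack(const uint8_t yuy2[4], uint8_t* y0, uint8_t* y1,
                           uint8_t* u, uint8_t* v) {
      *y0 = yuy2[0]; *u = yuy2[1]; *y1 = yuy2[2]; *v = yuy2[3];
    }

which is why kShuffleYUY2Y picks even bytes and kShuffleUYVYY odd bytes.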
// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
#define READUYVY \
  "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
  "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
  "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
…

The YUVTORGB_SETUP and YUVTORGB macros — both the x86_64 variant, which caches the yuvconstants rows in xmm8–xmm14, and the 32-bit variant, which reads them from memory — are re-aligned with no instruction changes. The x86_64 form, per 8 pixels:

#define YUVTORGB(yuvconstants) \
  "movdqa %%xmm0,%%xmm1 \n" \
  "movdqa %%xmm0,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm3 \n" \
  "movdqa %%xmm11,%%xmm0 \n" \
  "pmaddubsw %%xmm8,%%xmm1 \n" \
  "psubw %%xmm1,%%xmm0 \n" \
  "movdqa %%xmm12,%%xmm1 \n" \
  "pmaddubsw %%xmm9,%%xmm2 \n" \
  "psubw %%xmm2,%%xmm1 \n" \
  "movdqa %%xmm13,%%xmm2 \n" \
  "pmaddubsw %%xmm10,%%xmm3 \n" \
  "psubw %%xmm3,%%xmm2 \n" \
  "pmulhuw %%xmm14,%%xmm4 \n" \
  "paddsw %%xmm4,%%xmm0 \n" \
  "paddsw %%xmm4,%%xmm1 \n" \
  "paddsw %%xmm4,%%xmm2 \n" \
  "psraw $0x6,%%xmm0 \n" \
  "psraw $0x6,%%xmm1 \n" \
  "psraw $0x6,%%xmm2 \n" \
  "packuswb %%xmm0,%%xmm0 \n" \
  "packuswb %%xmm1,%%xmm1 \n" \
  "packuswb %%xmm2,%%xmm2 \n"
#define YUVTORGB_REGS \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

// Store 8 ARGB values.
#define STOREARGB \
  "punpcklbw %%xmm1,%%xmm0 \n" \
  "punpcklbw %%xmm5,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm1 \n" \
…
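The 6-bit fixed-point arithmetic in YUVTORGB approximates the usual matrixed conversion. As a floating-point reference (a sketch assuming BT.601 studio-swing coefficients; the actual weights and biases come from the yuvconstants struct, so other matrices plug into the same macro):

    #include <stdint.h>

    static uint8_t Clamp255(float x) {
      return (uint8_t)(x < 0.f ? 0.f : (x > 255.f ? 255.f : x));
    }

    // Reference math the macro approximates per pixel.
    static void YuvPixelRef(uint8_t y, uint8_t u, uint8_t v,
                            uint8_t* b, uint8_t* g, uint8_t* r) {
      float y1 = 1.164f * (y - 16);
      *b = Clamp255(y1 + 2.018f * (u - 128));
      *g = Clamp255(y1 - 0.391f * (u - 128) - 0.813f * (v - 128));
      *r = Clamp255(y1 + 1.596f * (v - 128));
    }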
// Store 8 RGBA values.
#define STORERGBA \
  "pcmpeqb %%xmm5,%%xmm5 \n" \
  "punpcklbw %%xmm2,%%xmm1 \n" \
  "punpcklbw %%xmm0,%%xmm5 \n" \
…

The SSSE3 converters I444ToARGBRow, I422ToRGB24Row, I422ToARGBRow and I422AlphaToARGBRow get the blank-line/label re-padding, and several (the alpha and NV12/NV21 variants) gain "// clang-format off" / "// clang-format on" guards around their asm blocks. Two substantive edits sit in this stretch:

- The preprocessor test around the [width] operand constraint is relaxed from "#if defined(__i386__) && defined(__pic__)" to plain "#if defined(__i386__)" (in I422ToRGB24Row_SSSE3 and I422AlphaToARGBRow_SSSE3 here, and again in I422AlphaToARGBRow_AVX2 further down).

- I411ToARGBRow_SSSE3 is deleted wholesale together with its HAS_I411TOARGBROW_SSSE3 guard, its "int temp" scratch operand and its READYUV411_TEMP loop; nothing replaces it.

NV12ToARGBRow_SSSE3 and NV21ToARGBRow_SSSE3 keep their READNV12/READNV21 + YUVTORGB + STOREARGB structure unchanged inside the new clang-format guards.
…

YUY2ToARGBRow_SSSE3, UYVYToARGBRow_SSSE3 and I422ToRGBARow_SSSE3 follow the same pattern (clang-format guards plus label re-padding, no instruction changes). The AVX2 read helpers READYUV444_AVX2, READYUV422_AVX2 and READYUVA422_AVX2 are re-aligned, and the I411 path is removed at the AVX2 level too:

-// Read 4 UV from 411, upsample to 16 UV.
-#define READYUV411_AVX2 \
-  "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
-  MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \
-  "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
-  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
-  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
-  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
-  "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \
-  "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
-  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
-  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
-  "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"

READNV12_AVX2 is re-aligned in place:

// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 \
  "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
  "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
…
// Read 8 VU from NV21, upsample to 16 UV.
#define READNV21_AVX2 \
  "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
  "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
…

READYUY2_AVX2 and READUYVY_AVX2 are re-aligned the same way. On x86_64, YUVTORGB_SETUP_AVX2 still caches the yuvconstants rows in ymm8–ymm14, and YUVTORGB_AVX2 keeps its sequence unchanged apart from indentation and blank lines:

#define YUVTORGB_AVX2(yuvconstants) \
  "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
  "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
  "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
  "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
  "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
  "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
  "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
  "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
  "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
  "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
  "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
  "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
  "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
  "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"

#define YUVTORGB_REGS_AVX2 \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else  // Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_SETUP_AVX2(yuvconstants)
#define YUVTORGB_AVX2(yuvconstants) \
  "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
  "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
  "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
…

STOREARGB_AVX2 and I444ToARGBRow_AVX2 are re-aligned with no instruction changes. The HAS_I411TOARGBROW_AVX2 section is deleted: I411ToARGBRow_AVX2 ("4 UV values upsampled to 16 UV") is gone, and the diff re-anchors on the unchanged I422ToARGBRow_AVX2, which keeps its existing loop:

#if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
…
#endif  // HAS_I422TOARGBROW_AVX2

…
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    const uint8* a_buf,
                                    uint8* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
…

Here too the asm block gains "// clang-format off/on" guards and the [width] constraint test drops the "&& defined(__pic__)" clause. I422ToRGBARow_AVX2, NV12ToARGBRow_AVX2, NV21ToARGBRow_AVX2, YUY2ToARGBRow_AVX2 and UYVYToARGBRow_AVX2 receive the same clang-format guards and re-padding, with no instruction changes. I400ToARGBRow_SSE2 follows, its setup ("pcmpeqb %%xmm4", "pslld $0x18") intact and its loop comment
"Step 1: Scale Y contribution to 8 G values.  G = (y - 16) * 1.164" unchanged; I400ToARGBRow_AVX2 keeps the 16-pixel version of the same step. The mirror section's constants are then reflowed:

#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                               7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

…

#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
void MirrorUVRow_SSSE3(const uint8* src,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
…

MirrorRow_SSSE3, MirrorRow_AVX2, MirrorUVRow_SSSE3 and ARGBMirrorRow_SSE2 are otherwise unchanged apart from the blank-line/label re-padding.
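kShuffleMirror just reverses the 16 bytes of each vector (processed from the far end of the row inward), so the whole row is equivalent to this scalar sketch (hypothetical reference, not from this file):

    #include <stdint.h>

    // Byte-reversed copy of a row, as MirrorRow produces.
    static void MirrorRowRef(const uint8_t* src, uint8_t* dst, int width) {
      for (int i = 0; i < width; ++i)
        dst[i] = src[width - 1 - i];
    }

kShuffleMirrorUV does the same per channel, reversing U and V bytes into separate halves.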
#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
…

SplitUVRow_AVX2 and SplitUVRow_SSE2 are reflowed to one parameter per line and their asm bodies re-indented; the mask/shift/pack sequence that de-interleaves U and V is untouched:

void SplitUVRow_SSE2(const uint8* src_uv,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  asm volatile (
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movdqu " MEMACCESS(0) ",%%xmm0 \n"
      "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
      "lea " MEMLEA(0x20,0) ",%0 \n"
2667 "movdqa %%xmm0,%%xmm2 \n" 2668 "movdqa %%xmm1,%%xmm3 \n" 2669 "pand %%xmm5,%%xmm0 \n" 2670 "pand %%xmm5,%%xmm1 \n" 2671 "packuswb %%xmm1,%%xmm0 \n" 2672 "psrlw $0x8,%%xmm2 \n" 2673 "psrlw $0x8,%%xmm3 \n" 2674 "packuswb %%xmm3,%%xmm2 \n" 2675 "movdqu %%xmm0," MEMACCESS(1) " \n" 2676 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) 2677 "lea " MEMLEA(0x10,1) ",%1 \n" 2678 "sub $0x10,%3 \n" 2679 "jg 1b \n" 2726 2680 : "+r"(src_uv), // %0 2727 2681 "+r"(dst_u), // %1 … … 2736 2690 2737 2691 #ifdef HAS_MERGEUVROW_AVX2 2738 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 2692 void MergeUVRow_AVX2(const uint8* src_u, 2693 const uint8* src_v, 2694 uint8* dst_uv, 2739 2695 int width) { 2740 2696 asm volatile ( 2741 "sub %0,%1 \n" 2742 LABELALIGN 2743 "1: \n" 2744 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 2745 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 2746 "lea " MEMLEA(0x20,0) ",%0 \n" 2747 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" 2748 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" 2749 "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" 2697 "sub %0,%1 \n" 2698 2699 LABELALIGN 2700 "1: \n" 2701 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 2702 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 2703 "lea " MEMLEA(0x20,0) ",%0 \n" 2704 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" 2705 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" 2706 "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" 2750 2707 "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n" 2751 2708 "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n" 2752 2709 "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n" 2753 "lea " MEMLEA(0x40,2) ",%2 2754 "sub $0x20,%3 2755 "jg 1b 2756 "vzeroupper 2710 "lea " MEMLEA(0x40,2) ",%2 \n" 2711 "sub $0x20,%3 \n" 2712 "jg 1b \n" 2713 "vzeroupper \n" 2757 2714 : "+r"(src_u), // %0 2758 2715 "+r"(src_v), // %1 … … 2767 2724 2768 2725 #ifdef HAS_MERGEUVROW_SSE2 2769 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 2726 void MergeUVRow_SSE2(const uint8* src_u, 2727 const uint8* src_v, 2728 uint8* dst_uv, 2770 2729 int width) { 2771 2730 asm volatile ( 2772 "sub %0,%1 \n" 2773 LABELALIGN 2774 "1: \n" 2775 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2776 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 2777 "lea " MEMLEA(0x10,0) ",%0 \n" 2778 "movdqa %%xmm0,%%xmm2 \n" 2779 "punpcklbw %%xmm1,%%xmm0 \n" 2780 "punpckhbw %%xmm1,%%xmm2 \n" 2781 "movdqu %%xmm0," MEMACCESS(2) " \n" 2782 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" 2783 "lea " MEMLEA(0x20,2) ",%2 \n" 2784 "sub $0x10,%3 \n" 2785 "jg 1b \n" 2731 "sub %0,%1 \n" 2732 2733 LABELALIGN 2734 "1: \n" 2735 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2736 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 2737 "lea " MEMLEA(0x10,0) ",%0 \n" 2738 "movdqa %%xmm0,%%xmm2 \n" 2739 "punpcklbw %%xmm1,%%xmm0 \n" 2740 "punpckhbw %%xmm1,%%xmm2 \n" 2741 "movdqu %%xmm0," MEMACCESS(2) " \n" 2742 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" 2743 "lea " MEMLEA(0x20,2) ",%2 \n" 2744 "sub $0x10,%3 \n" 2745 "jg 1b \n" 2786 2746 : "+r"(src_u), // %0 2787 2747 "+r"(src_v), // %1 … … 2802 2762 "test $0xf,%1 \n" 2803 2763 "jne 2f \n" 2804 LABELALIGN 2805 "1: \n" 2764 2765 LABELALIGN 2766 "1: \n" 2806 2767 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 2807 2768 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" … … 2813 2774 "jg 1b \n" 2814 2775 "jmp 9f \n" 2776 2815 2777 LABELALIGN 2816 2778 "2: \n" … … 2838 2800 asm volatile ( 2839 2801 LABELALIGN 2840 "1:\n"2802 "1: \n" 2841 2803 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 2842 2804 "vmovdqu " MEMACCESS2(0x20,0) 
",%%ymm1 \n" … … 2861 2823 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { 2862 2824 size_t width_tmp = (size_t)(width); 2863 asm volatile ( 2864 "rep movsb " MEMMOVESTRING(0,1) " \n" 2865 : "+S"(src), // %0 2866 "+D"(dst), // %1 2867 "+c"(width_tmp) // %2 2868 : 2869 : "memory", "cc" 2870 ); 2825 asm volatile("rep movsb " MEMMOVESTRING(0, 1) " \n" 2826 : "+S"(src), // %0 2827 "+D"(dst), // %1 2828 "+c"(width_tmp) // %2 2829 : 2830 : "memory", "cc"); 2871 2831 } 2872 2832 #endif // HAS_COPYROW_ERMS … … 2880 2840 "pcmpeqb %%xmm1,%%xmm1 \n" 2881 2841 "psrld $0x8,%%xmm1 \n" 2882 LABELALIGN 2883 "1: \n" 2842 2843 LABELALIGN 2844 "1: \n" 2884 2845 "movdqu " MEMACCESS(0) ",%%xmm2 \n" 2885 2846 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" … … 2914 2875 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" 2915 2876 "vpsrld $0x8,%%ymm0,%%ymm0 \n" 2916 LABELALIGN 2917 "1: \n" 2877 2878 LABELALIGN 2879 "1: \n" 2918 2880 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" 2919 2881 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" … … 2940 2902 // width in pixels 2941 2903 void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { 2942 asm volatile (2943 LABELALIGN 2944 "1:\n"2904 asm volatile ( 2905 LABELALIGN 2906 "1: \n" 2945 2907 "movdqu " MEMACCESS(0) ", %%xmm0 \n" 2946 2908 "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n" … … 2964 2926 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 2965 2927 2928 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 2929 static const uvec8 kShuffleAlphaShort_AVX2 = { 2930 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, 2931 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; 2932 2933 void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) { 2934 asm volatile ( 2935 "vmovdqa %3,%%ymm4 \n" 2936 "vbroadcastf128 %4,%%ymm5 \n" 2937 2938 LABELALIGN 2939 "1: \n" 2940 "vmovdqu " MEMACCESS(0) ", %%ymm0 \n" 2941 "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n" 2942 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 2943 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" 2944 "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n" 2945 "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n" 2946 "lea " MEMLEA(0x80, 0) ", %0 \n" 2947 "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates 2948 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" 2949 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" 2950 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates 2951 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. 2952 "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. 2953 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 2954 "lea " MEMLEA(0x20,1) ",%1 \n" 2955 "sub $0x20, %2 \n" 2956 "jg 1b \n" 2957 "vzeroupper \n" 2958 : "+r"(src_argb), // %0 2959 "+r"(dst_a), // %1 2960 "+rm"(width) // %2 2961 : "m"(kPermdARGBToY_AVX), // %3 2962 "m"(kShuffleAlphaShort_AVX2) // %4 2963 : "memory", "cc" 2964 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2965 ); 2966 } 2967 #endif // HAS_ARGBEXTRACTALPHAROW_AVX2 2968 2966 2969 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 2967 2970 // width in pixels … … 2972 2975 "pcmpeqb %%xmm1,%%xmm1 \n" 2973 2976 "psrld $0x8,%%xmm1 \n" 2974 LABELALIGN 2975 "1: \n" 2977 2978 LABELALIGN 2979 "1: \n" 2976 2980 "movq " MEMACCESS(0) ",%%xmm2 \n" 2977 2981 "lea " MEMLEA(0x8,0) ",%0 \n" … … 3008 3012 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" 3009 3013 "vpsrld $0x8,%%ymm0,%%ymm0 \n" 3010 LABELALIGN 3011 "1: \n" 3014 3015 LABELALIGN 3016 "1: \n" 3012 3017 "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" 3013 3018 "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" … … 3037 3042 size_t width_tmp = (size_t)(width >> 2); 3038 3043 const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. 
3039 asm volatile ( 3040 "rep stosl " MEMSTORESTRING(eax,0) " \n" 3041 : "+D"(dst), // %0 3042 "+c"(width_tmp) // %1 3043 : "a"(v32) // %2 3044 : "memory", "cc"); 3044 asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n" 3045 : "+D"(dst), // %0 3046 "+c"(width_tmp) // %1 3047 : "a"(v32) // %2 3048 : "memory", "cc"); 3045 3049 } 3046 3050 3047 3051 void SetRow_ERMS(uint8* dst, uint8 v8, int width) { 3048 3052 size_t width_tmp = (size_t)(width); 3049 asm volatile ( 3050 "rep stosb " MEMSTORESTRING(al,0) " \n" 3051 : "+D"(dst), // %0 3052 "+c"(width_tmp) // %1 3053 : "a"(v8) // %2 3054 : "memory", "cc"); 3053 asm volatile("rep stosb " MEMSTORESTRING(al, 0) " \n" 3054 : "+D"(dst), // %0 3055 "+c"(width_tmp) // %1 3056 : "a"(v8) // %2 3057 : "memory", "cc"); 3055 3058 } 3056 3059 3057 3060 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { 3058 3061 size_t width_tmp = (size_t)(width); 3059 asm volatile ( 3060 "rep stosl " MEMSTORESTRING(eax,0) " \n" 3061 : "+D"(dst_argb), // %0 3062 "+c"(width_tmp) // %1 3063 : "a"(v32) // %2 3064 : "memory", "cc"); 3062 asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n" 3063 : "+D"(dst_argb), // %0 3064 "+c"(width_tmp) // %1 3065 : "a"(v32) // %2 3066 : "memory", "cc"); 3065 3067 } 3066 3068 #endif // HAS_SETROW_X86 … … 3071 3073 "pcmpeqb %%xmm5,%%xmm5 \n" 3072 3074 "psrlw $0x8,%%xmm5 \n" 3073 LABELALIGN 3074 "1: \n" 3075 3076 LABELALIGN 3077 "1: \n" 3075 3078 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3076 3079 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" … … 3092 3095 } 3093 3096 3094 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, 3095 uint8* dst_u, uint8* dst_v, int width) { 3097 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, 3098 int stride_yuy2, 3099 uint8* dst_u, 3100 uint8* dst_v, 3101 int width) { 3096 3102 asm volatile ( 3097 3103 "pcmpeqb %%xmm5,%%xmm5 \n" 3098 3104 "psrlw $0x8,%%xmm5 \n" 3099 3105 "sub %1,%2 \n" 3100 LABELALIGN 3101 "1: \n" 3106 3107 LABELALIGN 3108 "1: \n" 3102 3109 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3103 3110 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" … … 3131 3138 3132 3139 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 3133 uint8* dst_u, uint8* dst_v, int width) { 3140 uint8* dst_u, 3141 uint8* dst_v, 3142 int width) { 3134 3143 asm volatile ( 3135 3144 "pcmpeqb %%xmm5,%%xmm5 \n" 3136 3145 "psrlw $0x8,%%xmm5 \n" 3137 3146 "sub %1,%2 \n" 3138 LABELALIGN 3139 "1: \n" 3147 3148 LABELALIGN 3149 "1: \n" 3140 3150 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3141 3151 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" … … 3167 3177 asm volatile ( 3168 3178 LABELALIGN 3169 "1:\n"3179 "1: \n" 3170 3180 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3171 3181 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" … … 3187 3197 } 3188 3198 3189 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, 3190 uint8* dst_u, uint8* dst_v, int width) { 3199 void UYVYToUVRow_SSE2(const uint8* src_uyvy, 3200 int stride_uyvy, 3201 uint8* dst_u, 3202 uint8* dst_v, 3203 int width) { 3191 3204 asm volatile ( 3192 3205 "pcmpeqb %%xmm5,%%xmm5 \n" 3193 3206 "psrlw $0x8,%%xmm5 \n" 3194 3207 "sub %1,%2 \n" 3195 LABELALIGN 3196 "1: \n" 3208 3209 LABELALIGN 3210 "1: \n" 3197 3211 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3198 3212 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" … … 3226 3240 3227 3241 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, 3228 uint8* dst_u, uint8* dst_v, int width) { 3242 uint8* dst_u, 3243 uint8* dst_v, 3244 int width) { 3229 3245 asm volatile ( 3230 3246 "pcmpeqb %%xmm5,%%xmm5 \n" 3231 3247 "psrlw $0x8,%%xmm5 \n" 3232 3248 "sub %1,%2 \n" 3233 LABELALIGN 
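// Layout note with a scalar sketch (illustration only): YUY2 packs two
// pixels as [Y0 U Y1 V] and UYVY as [U Y0 V Y1]. The pand with the 0x00ff
// mask in xmm5 keeps the even bytes and psrlw $0x8 keeps the odd bytes,
// which is why the same mask extracts luma from YUY2 but chroma from UYVY.
#include <stdint.h>

static void YUY2ToYRow_C_sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                                int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[x] = src_yuy2[2 * x];  // luma sits in the even bytes of YUY2
  }
}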
3234 "1: \n" 3249 3250 LABELALIGN 3251 "1: \n" 3235 3252 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3236 3253 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" … … 3265 3282 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3266 3283 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3267 LABELALIGN 3268 "1: \n" 3284 3285 LABELALIGN 3286 "1: \n" 3269 3287 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3270 3288 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" … … 3288 3306 } 3289 3307 3290 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, 3291 uint8* dst_u, uint8* dst_v, int width) { 3308 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, 3309 int stride_yuy2, 3310 uint8* dst_u, 3311 uint8* dst_v, 3312 int width) { 3292 3313 asm volatile ( 3293 3314 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3294 3315 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3295 3316 "sub %1,%2 \n" 3296 LABELALIGN 3297 "1: \n" 3317 3318 LABELALIGN 3319 "1: \n" 3298 3320 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3299 3321 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" … … 3328 3350 3329 3351 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, 3330 uint8* dst_u, uint8* dst_v, int width) { 3352 uint8* dst_u, 3353 uint8* dst_v, 3354 int width) { 3331 3355 asm volatile ( 3332 3356 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3333 3357 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3334 3358 "sub %1,%2 \n" 3335 LABELALIGN 3336 "1: \n" 3359 3360 LABELALIGN 3361 "1: \n" 3337 3362 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3338 3363 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" … … 3367 3392 asm volatile ( 3368 3393 LABELALIGN 3369 "1:\n"3394 "1: \n" 3370 3395 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3371 3396 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" … … 3388 3413 ); 3389 3414 } 3390 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, 3391 uint8* dst_u, uint8* dst_v, int width) { 3415 void UYVYToUVRow_AVX2(const uint8* src_uyvy, 3416 int stride_uyvy, 3417 uint8* dst_u, 3418 uint8* dst_v, 3419 int width) { 3392 3420 asm volatile ( 3393 3421 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" … … 3396 3424 3397 3425 LABELALIGN 3398 "1:\n"3426 "1: \n" 3399 3427 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3400 3428 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" … … 3429 3457 3430 3458 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, 3431 uint8* dst_u, uint8* dst_v, int width) { 3459 uint8* dst_u, 3460 uint8* dst_v, 3461 int width) { 3432 3462 asm volatile ( 3433 3463 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3434 3464 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3435 3465 "sub %1,%2 \n" 3436 LABELALIGN 3437 "1: \n" 3466 3467 LABELALIGN 3468 "1: \n" 3438 3469 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3439 3470 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" … … 3468 3499 #ifdef HAS_ARGBBLENDROW_SSSE3 3469 3500 // Shuffle table for isolating alpha. 
3470 static uvec8 kShuffleAlpha = { 3471 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 3472 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 3473 }; 3501 static uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 3502 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; 3474 3503 3475 3504 // Blend 8 pixels at a time 3476 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, 3477 uint8* dst_argb, int width) { 3505 void ARGBBlendRow_SSSE3(const uint8* src_argb0, 3506 const uint8* src_argb1, 3507 uint8* dst_argb, 3508 int width) { 3478 3509 asm volatile ( 3479 3510 "pcmpeqb %%xmm7,%%xmm7 \n" … … 3560 3591 // signed version of math 3561 3592 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 3562 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, 3563 const uint8* alpha, uint8* dst, int width) { 3564 asm volatile ( 3565 "pcmpeqb %%xmm5,%%xmm5 \n" 3566 "psllw $0x8,%%xmm5 \n" 3567 "mov $0x80808080,%%eax \n" 3568 "movd %%eax,%%xmm6 \n" 3569 "pshufd $0x0,%%xmm6,%%xmm6 \n" 3570 "mov $0x807f807f,%%eax \n" 3571 "movd %%eax,%%xmm7 \n" 3572 "pshufd $0x0,%%xmm7,%%xmm7 \n" 3573 "sub %2,%0 \n" 3574 "sub %2,%1 \n" 3575 "sub %2,%3 \n" 3576 3577 // 8 pixel loop. 3578 LABELALIGN 3579 "1: \n" 3580 "movq (%2),%%xmm0 \n" 3581 "punpcklbw %%xmm0,%%xmm0 \n" 3582 "pxor %%xmm5,%%xmm0 \n" 3583 "movq (%0,%2,1),%%xmm1 \n" 3584 "movq (%1,%2,1),%%xmm2 \n" 3585 "punpcklbw %%xmm2,%%xmm1 \n" 3586 "psubb %%xmm6,%%xmm1 \n" 3587 "pmaddubsw %%xmm1,%%xmm0 \n" 3588 "paddw %%xmm7,%%xmm0 \n" 3589 "psrlw $0x8,%%xmm0 \n" 3590 "packuswb %%xmm0,%%xmm0 \n" 3591 "movq %%xmm0,(%3,%2,1) \n" 3592 "lea 0x8(%2),%2 \n" 3593 "sub $0x8,%4 \n" 3594 "jg 1b \n" 3595 : "+r"(src0), // %0 3596 "+r"(src1), // %1 3597 "+r"(alpha), // %2 3598 "+r"(dst), // %3 3599 "+rm"(width) // %4 3600 :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" 3601 ); 3593 void BlendPlaneRow_SSSE3(const uint8* src0, 3594 const uint8* src1, 3595 const uint8* alpha, 3596 uint8* dst, 3597 int width) { 3598 asm volatile( 3599 "pcmpeqb %%xmm5,%%xmm5 \n" 3600 "psllw $0x8,%%xmm5 \n" 3601 "mov $0x80808080,%%eax \n" 3602 "movd %%eax,%%xmm6 \n" 3603 "pshufd $0x0,%%xmm6,%%xmm6 \n" 3604 "mov $0x807f807f,%%eax \n" 3605 "movd %%eax,%%xmm7 \n" 3606 "pshufd $0x0,%%xmm7,%%xmm7 \n" 3607 "sub %2,%0 \n" 3608 "sub %2,%1 \n" 3609 "sub %2,%3 \n" 3610 3611 // 8 pixel loop. 
3612 LABELALIGN 3613 "1: \n" 3614 "movq (%2),%%xmm0 \n" 3615 "punpcklbw %%xmm0,%%xmm0 \n" 3616 "pxor %%xmm5,%%xmm0 \n" 3617 "movq (%0,%2,1),%%xmm1 \n" 3618 "movq (%1,%2,1),%%xmm2 \n" 3619 "punpcklbw %%xmm2,%%xmm1 \n" 3620 "psubb %%xmm6,%%xmm1 \n" 3621 "pmaddubsw %%xmm1,%%xmm0 \n" 3622 "paddw %%xmm7,%%xmm0 \n" 3623 "psrlw $0x8,%%xmm0 \n" 3624 "packuswb %%xmm0,%%xmm0 \n" 3625 "movq %%xmm0,(%3,%2,1) \n" 3626 "lea 0x8(%2),%2 \n" 3627 "sub $0x8,%4 \n" 3628 "jg 1b \n" 3629 : "+r"(src0), // %0 3630 "+r"(src1), // %1 3631 "+r"(alpha), // %2 3632 "+r"(dst), // %3 3633 "+rm"(width) // %4 3634 ::"memory", 3635 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); 3602 3636 } 3603 3637 #endif // HAS_BLENDPLANEROW_SSSE3 … … 3609 3643 // signed version of math 3610 3644 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 3611 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, 3612 const uint8* alpha, uint8* dst, int width) { 3613 asm volatile ( 3614 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3615 "vpsllw $0x8,%%ymm5,%%ymm5 \n" 3616 "mov $0x80808080,%%eax \n" 3617 "vmovd %%eax,%%xmm6 \n" 3618 "vbroadcastss %%xmm6,%%ymm6 \n" 3619 "mov $0x807f807f,%%eax \n" 3620 "vmovd %%eax,%%xmm7 \n" 3621 "vbroadcastss %%xmm7,%%ymm7 \n" 3622 "sub %2,%0 \n" 3623 "sub %2,%1 \n" 3624 "sub %2,%3 \n" 3625 3626 // 32 pixel loop. 3627 LABELALIGN 3628 "1: \n" 3629 "vmovdqu (%2),%%ymm0 \n" 3630 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" 3631 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" 3632 "vpxor %%ymm5,%%ymm3,%%ymm3 \n" 3633 "vpxor %%ymm5,%%ymm0,%%ymm0 \n" 3634 "vmovdqu (%0,%2,1),%%ymm1 \n" 3635 "vmovdqu (%1,%2,1),%%ymm2 \n" 3636 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" 3637 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" 3638 "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" 3639 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" 3640 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" 3641 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" 3642 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" 3643 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" 3644 "vpsrlw $0x8,%%ymm3,%%ymm3 \n" 3645 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" 3646 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" 3647 "vmovdqu %%ymm0,(%3,%2,1) \n" 3648 "lea 0x20(%2),%2 \n" 3649 "sub $0x20,%4 \n" 3650 "jg 1b \n" 3651 "vzeroupper \n" 3652 : "+r"(src0), // %0 3653 "+r"(src1), // %1 3654 "+r"(alpha), // %2 3655 "+r"(dst), // %3 3656 "+rm"(width) // %4 3657 :: "memory", "cc", "eax", 3658 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 3659 ); 3645 void BlendPlaneRow_AVX2(const uint8* src0, 3646 const uint8* src1, 3647 const uint8* alpha, 3648 uint8* dst, 3649 int width) { 3650 asm volatile( 3651 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3652 "vpsllw $0x8,%%ymm5,%%ymm5 \n" 3653 "mov $0x80808080,%%eax \n" 3654 "vmovd %%eax,%%xmm6 \n" 3655 "vbroadcastss %%xmm6,%%ymm6 \n" 3656 "mov $0x807f807f,%%eax \n" 3657 "vmovd %%eax,%%xmm7 \n" 3658 "vbroadcastss %%xmm7,%%ymm7 \n" 3659 "sub %2,%0 \n" 3660 "sub %2,%1 \n" 3661 "sub %2,%3 \n" 3662 3663 // 32 pixel loop. 
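// A scalar sketch of the blend formula quoted above (illustration, not part
// of the changeset). The signed variant exists because pmaddubsw multiplies
// one unsigned and one signed operand, so the sources are biased by 128 and
// the +32768+127 term removes the bias and rounds in a single add; expanding
// it shows it is exactly the unsigned form below.
#include <stdint.h>

static uint8_t BlendPlanePixel_sketch(uint8_t a2, uint8_t b2, uint8_t c2) {
  // dst = (A2*C2 + B2*(255 - C2) + 255) / 256
  return (uint8_t)((a2 * c2 + b2 * (255 - c2) + 255) >> 8);
}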
3664 LABELALIGN 3665 "1: \n" 3666 "vmovdqu (%2),%%ymm0 \n" 3667 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" 3668 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" 3669 "vpxor %%ymm5,%%ymm3,%%ymm3 \n" 3670 "vpxor %%ymm5,%%ymm0,%%ymm0 \n" 3671 "vmovdqu (%0,%2,1),%%ymm1 \n" 3672 "vmovdqu (%1,%2,1),%%ymm2 \n" 3673 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" 3674 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" 3675 "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" 3676 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" 3677 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" 3678 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" 3679 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" 3680 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" 3681 "vpsrlw $0x8,%%ymm3,%%ymm3 \n" 3682 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" 3683 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" 3684 "vmovdqu %%ymm0,(%3,%2,1) \n" 3685 "lea 0x20(%2),%2 \n" 3686 "sub $0x20,%4 \n" 3687 "jg 1b \n" 3688 "vzeroupper \n" 3689 : "+r"(src0), // %0 3690 "+r"(src1), // %1 3691 "+r"(alpha), // %2 3692 "+r"(dst), // %3 3693 "+rm"(width) // %4 3694 ::"memory", 3695 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", 3696 "xmm7"); 3660 3697 } 3661 3698 #endif // HAS_BLENDPLANEROW_AVX2 … … 3663 3700 #ifdef HAS_ARGBATTENUATEROW_SSSE3 3664 3701 // Shuffle table duplicating alpha 3665 static uvec8 kShuffleAlpha0 = { 3666 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u 3667 }; 3668 static uvec8 kShuffleAlpha1 = { 3669 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 3670 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u 3671 }; 3702 static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 3703 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; 3704 static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 3705 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; 3672 3706 // Attenuate 4 pixels at a time. 3673 3707 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { … … 3680 3714 // 4 pixel loop. 3681 3715 LABELALIGN 3682 "1:\n"3716 "1: \n" 3683 3717 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3684 3718 "pshufb %%xmm4,%%xmm0 \n" … … 3715 3749 #ifdef HAS_ARGBATTENUATEROW_AVX2 3716 3750 // Shuffle table duplicating alpha. 3717 static const uvec8 kShuffleAlpha_AVX2 = { 3718 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u3719 };3751 static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, 3752 128u, 128u, 14u, 15u, 14u, 15u, 3753 14u, 15u, 128u, 128u}; 3720 3754 // Attenuate 8 pixels at a time. 3721 3755 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { … … 3728 3762 // 8 pixel loop. 3729 3763 LABELALIGN 3730 "1:\n"3764 "1: \n" 3731 3765 "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" 3732 3766 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" … … 3758 3792 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 3759 3793 // Unattenuate 4 pixels at a time. 3760 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, 3794 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, 3795 uint8* dst_argb, 3761 3796 int width) { 3762 3797 uintptr_t alpha; … … 3764 3799 // 4 pixel loop. 3765 3800 LABELALIGN 3766 "1:\n"3801 "1: \n" 3767 3802 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3768 3803 "movzb " MEMACCESS2(0x03,0) ",%3 \n" … … 3805 3840 // Shuffle table duplicating alpha. 3806 3841 static const uvec8 kUnattenShuffleAlpha_AVX2 = { 3807 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u 3808 }; 3842 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; 3809 3843 // Unattenuate 8 pixels at a time. 
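// Hedged scalar sketch of unattenuation (illustration only): premultiplied
// channels are scaled back up by 255/alpha and clamped. The SIMD loops avoid
// the per-pixel divide by looking up a precomputed fixed-point reciprocal of
// alpha and multiplying instead.
#include <stdint.h>

static uint8_t Unattenuate_sketch(uint8_t color, uint8_t alpha) {
  if (alpha == 0) {
    return color;  // fully transparent: nothing to recover
  }
  unsigned v = ((unsigned)color * 255u) / alpha;
  return (uint8_t)(v > 255u ? 255u : v);  // clamp to byte range
}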
3810 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, 3844 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, 3845 uint8* dst_argb, 3811 3846 int width) { 3812 3847 uintptr_t alpha; … … 3817 3852 // 8 pixel loop. 3818 3853 LABELALIGN 3819 "1:\n"3854 "1: \n" 3820 3855 // replace VPGATHER 3821 3856 "movzb " MEMACCESS2(0x03,0) ",%3 \n" … … 3880 3915 // 8 pixel loop. 3881 3916 LABELALIGN 3882 "1:\n"3917 "1: \n" 3883 3918 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3884 3919 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" … … 3923 3958 // r = (r * 50 + g * 98 + b * 24) >> 7 3924 3959 // Constant for ARGB color to sepia tone 3925 static vec8 kARGBToSepiaB = { 3926 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 3927 }; 3928 3929 static vec8 kARGBToSepiaG = { 3930 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 3931 }; 3932 3933 static vec8 kARGBToSepiaR = { 3934 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 3935 }; 3960 static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, 3961 17, 68, 35, 0, 17, 68, 35, 0}; 3962 3963 static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, 3964 22, 88, 45, 0, 22, 88, 45, 0}; 3965 3966 static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, 3967 24, 98, 50, 0, 24, 98, 50, 0}; 3936 3968 3937 3969 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. … … 3944 3976 // 8 pixel loop. 3945 3977 LABELALIGN 3946 "1:\n"3978 "1: \n" 3947 3979 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3948 3980 "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" … … 3996 4028 // Tranform 8 ARGB pixels (32 bytes) with color matrix. 3997 4029 // Same as Sepia except matrix is provided. 3998 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 3999 const int8* matrix_argb, int width) { 4030 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, 4031 uint8* dst_argb, 4032 const int8* matrix_argb, 4033 int width) { 4000 4034 asm volatile ( 4001 4035 "movdqu " MEMACCESS(3) ",%%xmm5 \n" … … 4007 4041 // 8 pixel loop. 4008 4042 LABELALIGN 4009 "1:\n"4043 "1: \n" 4010 4044 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4011 4045 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" … … 4059 4093 #ifdef HAS_ARGBQUANTIZEROW_SSE2 4060 4094 // Quantize 4 ARGB pixels (16 bytes). 4061 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, 4062 int interval_offset, int width) { 4095 void ARGBQuantizeRow_SSE2(uint8* dst_argb, 4096 int scale, 4097 int interval_size, 4098 int interval_offset, 4099 int width) { 4063 4100 asm volatile ( 4064 4101 "movd %2,%%xmm2 \n" … … 4077 4114 // 4 pixel loop. 4078 4115 LABELALIGN 4079 "1:\n"4116 "1: \n" 4080 4117 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4081 4118 "punpcklbw %%xmm5,%%xmm0 \n" … … 4109 4146 #ifdef HAS_ARGBSHADEROW_SSE2 4110 4147 // Shade 4 pixels at a time by specified value. 4111 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, 4148 void ARGBShadeRow_SSE2(const uint8* src_argb, 4149 uint8* dst_argb, 4150 int width, 4112 4151 uint32 value) { 4113 4152 asm volatile ( … … 4118 4157 // 4 pixel loop. 4119 4158 LABELALIGN 4120 "1:\n"4159 "1: \n" 4121 4160 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4122 4161 "lea " MEMLEA(0x10,0) ",%0 \n" … … 4145 4184 #ifdef HAS_ARGBMULTIPLYROW_SSE2 4146 4185 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 
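// A hedged scalar sketch of the multiply blend (illustration only): the
// result is approximately src0 * src1 / 255 per channel. The SSE2 loop below
// gets there without a divide: punpcklbw of a register with itself turns
// each byte b into the 16-bit value b*257, and pmulhuw keeps the high half
// of the product, so (b * 257 * v) >> 16 ~= b * v / 255.
#include <stdint.h>

static uint8_t MultiplyPixel_sketch(uint8_t b, uint8_t v) {
  return (uint8_t)(((uint32_t)b * 257u * (uint32_t)v) >> 16);
}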
4147 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4148 uint8* dst_argb, int width) { 4149 asm volatile ( 4150 "pxor %%xmm5,%%xmm5 \n" 4186 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, 4187 const uint8* src_argb1, 4188 uint8* dst_argb, 4189 int width) { 4190 asm volatile ( 4191 "pxor %%xmm5,%%xmm5 \n" 4151 4192 4152 4193 // 4 pixel loop. 4153 4194 LABELALIGN 4154 "1:\n"4195 "1: \n" 4155 4196 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4156 4197 "lea " MEMLEA(0x10,0) ",%0 \n" … … 4183 4224 #ifdef HAS_ARGBMULTIPLYROW_AVX2 4184 4225 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 4185 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4186 uint8* dst_argb, int width) { 4226 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, 4227 const uint8* src_argb1, 4228 uint8* dst_argb, 4229 int width) { 4187 4230 asm volatile ( 4188 4231 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" … … 4190 4233 // 4 pixel loop. 4191 4234 LABELALIGN 4192 "1:\n"4235 "1: \n" 4193 4236 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" 4194 4237 "lea " MEMLEA(0x20,0) ",%0 \n" … … 4222 4265 #ifdef HAS_ARGBADDROW_SSE2 4223 4266 // Add 2 rows of ARGB pixels together, 4 pixels at a time. 4224 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4225 uint8* dst_argb, int width) { 4267 void ARGBAddRow_SSE2(const uint8* src_argb0, 4268 const uint8* src_argb1, 4269 uint8* dst_argb, 4270 int width) { 4226 4271 asm volatile ( 4227 4272 // 4 pixel loop. 4228 4273 LABELALIGN 4229 "1:\n"4274 "1: \n" 4230 4275 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4231 4276 "lea " MEMLEA(0x10,0) ",%0 \n" … … 4250 4295 #ifdef HAS_ARGBADDROW_AVX2 4251 4296 // Add 2 rows of ARGB pixels together, 4 pixels at a time. 4252 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4253 uint8* dst_argb, int width) { 4297 void ARGBAddRow_AVX2(const uint8* src_argb0, 4298 const uint8* src_argb1, 4299 uint8* dst_argb, 4300 int width) { 4254 4301 asm volatile ( 4255 4302 // 4 pixel loop. 4256 4303 LABELALIGN 4257 "1:\n"4304 "1: \n" 4258 4305 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 4259 4306 "lea " MEMLEA(0x20,0) ",%0 \n" … … 4278 4325 #ifdef HAS_ARGBSUBTRACTROW_SSE2 4279 4326 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. 4280 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4281 uint8* dst_argb, int width) { 4327 void ARGBSubtractRow_SSE2(const uint8* src_argb0, 4328 const uint8* src_argb1, 4329 uint8* dst_argb, 4330 int width) { 4282 4331 asm volatile ( 4283 4332 // 4 pixel loop. 4284 4333 LABELALIGN 4285 "1:\n"4334 "1: \n" 4286 4335 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4287 4336 "lea " MEMLEA(0x10,0) ",%0 \n" … … 4306 4355 #ifdef HAS_ARGBSUBTRACTROW_AVX2 4307 4356 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. 4308 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4309 uint8* dst_argb, int width) { 4357 void ARGBSubtractRow_AVX2(const uint8* src_argb0, 4358 const uint8* src_argb1, 4359 uint8* dst_argb, 4360 int width) { 4310 4361 asm volatile ( 4311 4362 // 4 pixel loop. 
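// The add and subtract blends rely on the saturating byte instructions
// (paddusb / psubusb and their VEX forms), so no per-channel clamping
// branches are needed. Scalar sketch (illustration only):
#include <stdint.h>

static uint8_t AddSat_sketch(uint8_t a, uint8_t b) {
  unsigned s = (unsigned)a + (unsigned)b;
  return (uint8_t)(s > 255u ? 255u : s);  // clamps high, like paddusb
}

static uint8_t SubSat_sketch(uint8_t a, uint8_t b) {
  return (uint8_t)(a > b ? a - b : 0);    // clamps at zero, like psubusb
}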
4312 4363 LABELALIGN 4313 "1:\n"4364 "1: \n" 4314 4365 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 4315 4366 "lea " MEMLEA(0x20,0) ",%0 \n" … … 4319 4370 "lea " MEMLEA(0x20,2) ",%2 \n" 4320 4371 "sub $0x8,%3 \n" 4321 "jg 1b\n"4372 "jg 1b \n" 4322 4373 "vzeroupper \n" 4323 4374 : "+r"(src_argb0), // %0 … … 4337 4388 // -2 0 2 4338 4389 // -1 0 1 4339 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, 4340 const uint8* src_y2, uint8* dst_sobelx, int width) { 4390 void SobelXRow_SSE2(const uint8* src_y0, 4391 const uint8* src_y1, 4392 const uint8* src_y2, 4393 uint8* dst_sobelx, 4394 int width) { 4341 4395 asm volatile ( 4342 4396 "sub %0,%1 \n" … … 4347 4401 // 8 pixel loop. 4348 4402 LABELALIGN 4349 "1:\n"4403 "1: \n" 4350 4404 "movq " MEMACCESS(0) ",%%xmm0 \n" 4351 4405 "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" … … 4391 4445 // 0 0 0 4392 4446 // 1 2 1 4393 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, 4394 uint8* dst_sobely, int width) { 4447 void SobelYRow_SSE2(const uint8* src_y0, 4448 const uint8* src_y1, 4449 uint8* dst_sobely, 4450 int width) { 4395 4451 asm volatile ( 4396 4452 "sub %0,%1 \n" … … 4400 4456 // 8 pixel loop. 4401 4457 LABELALIGN 4402 "1:\n"4458 "1: \n" 4403 4459 "movq " MEMACCESS(0) ",%%xmm0 \n" 4404 4460 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 … … 4444 4500 // G = Sobel 4445 4501 // B = Sobel 4446 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 4447 uint8* dst_argb, int width) { 4502 void SobelRow_SSE2(const uint8* src_sobelx, 4503 const uint8* src_sobely, 4504 uint8* dst_argb, 4505 int width) { 4448 4506 asm volatile ( 4449 4507 "sub %0,%1 \n" … … 4453 4511 // 8 pixel loop. 4454 4512 LABELALIGN 4455 "1:\n"4513 "1: \n" 4456 4514 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4457 4515 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 … … 4491 4549 #ifdef HAS_SOBELTOPLANEROW_SSE2 4492 4550 // Adds Sobel X and Sobel Y and stores Sobel into a plane. 4493 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 4494 uint8* dst_y, int width) { 4551 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, 4552 const uint8* src_sobely, 4553 uint8* dst_y, 4554 int width) { 4495 4555 asm volatile ( 4496 4556 "sub %0,%1 \n" … … 4500 4560 // 8 pixel loop. 4501 4561 LABELALIGN 4502 "1:\n"4562 "1: \n" 4503 4563 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4504 4564 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 … … 4526 4586 // G = Sobel 4527 4587 // B = Sobel Y 4528 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 4529 uint8* dst_argb, int width) { 4588 void SobelXYRow_SSE2(const uint8* src_sobelx, 4589 const uint8* src_sobely, 4590 uint8* dst_argb, 4591 int width) { 4530 4592 asm volatile ( 4531 4593 "sub %0,%1 \n" … … 4534 4596 // 8 pixel loop. 4535 4597 LABELALIGN 4536 "1:\n"4598 "1: \n" 4537 4599 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4538 4600 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 … … 4573 4635 // Creates a table of cumulative sums where each value is a sum of all values 4574 4636 // above and to the left of the value, inclusive of the value. 4575 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, 4576 const int32* previous_cumsum, int width) { 4637 void ComputeCumulativeSumRow_SSE2(const uint8* row, 4638 int32* cumsum, 4639 const int32* previous_cumsum, 4640 int width) { 4577 4641 asm volatile ( 4578 4642 "pxor %%xmm0,%%xmm0 \n" … … 4583 4647 "jne 49f \n" 4584 4648 4585 // 4 pixel loop \n"4586 LABELALIGN 4587 "40:\n"4649 // 4 pixel loop. 
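// A hedged scalar sketch of the recurrence described in the comment above
// (illustration only, shown per channel; the SSE2 loop that follows keeps a
// four-lane B,G,R,A running sum in xmm0):
#include <stdint.h>

static void CumulativeSumRow_sketch(const uint8_t* row, int32_t* cumsum,
                                    const int32_t* previous_cumsum,
                                    int width) {
  int32_t sum = 0;
  for (int x = 0; x < width; ++x) {
    sum += row[x];                         // everything to the left, inclusive
    cumsum[x] = sum + previous_cumsum[x];  // plus everything above
  }
}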
4650 LABELALIGN 4651 "40: \n" 4588 4652 "movdqu " MEMACCESS(0) ",%%xmm2 \n" 4589 4653 "lea " MEMLEA(0x10,0) ",%0 \n" … … 4618 4682 "jge 40b \n" 4619 4683 4620 "49:\n"4684 "49: \n" 4621 4685 "add $0x3,%3 \n" 4622 4686 "jl 19f \n" 4623 4687 4624 // 1 pixel loop \n"4625 LABELALIGN 4626 "10:\n"4688 // 1 pixel loop. 4689 LABELALIGN 4690 "10: \n" 4627 4691 "movd " MEMACCESS(0) ",%%xmm2 \n" 4628 4692 "lea " MEMLEA(0x4,0) ",%0 \n" … … 4638 4702 "jge 10b \n" 4639 4703 4640 "19:\n"4704 "19: \n" 4641 4705 : "+r"(row), // %0 4642 4706 "+r"(cumsum), // %1 … … 4651 4715 4652 4716 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 4653 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, 4654 int width, int area, uint8* dst, 4717 void CumulativeSumToAverageRow_SSE2(const int32* topleft, 4718 const int32* botleft, 4719 int width, 4720 int area, 4721 uint8* dst, 4655 4722 int count) { 4656 4723 asm volatile ( … … 4673 4740 "packssdw %%xmm5,%%xmm5 \n" 4674 4741 4675 // 4 pixel small loop \n"4742 // 4 pixel small loop. 4676 4743 LABELALIGN 4677 4744 "4: \n" … … 4784 4851 // Copy ARGB pixels from source image with slope to a row of destination. 4785 4852 LIBYUV_API 4786 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, 4787 uint8* dst_argb, const float* src_dudv, int width) { 4853 void ARGBAffineRow_SSE2(const uint8* src_argb, 4854 int src_argb_stride, 4855 uint8* dst_argb, 4856 const float* src_dudv, 4857 int width) { 4788 4858 intptr_t src_argb_stride_temp = src_argb_stride; 4789 4859 intptr_t temp; … … 4869 4939 #ifdef HAS_INTERPOLATEROW_SSSE3 4870 4940 // Bilinear filter 16x2 -> 16x1 4871 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 4872 ptrdiff_t src_stride, int dst_width, 4941 void InterpolateRow_SSSE3(uint8* dst_ptr, 4942 const uint8* src_ptr, 4943 ptrdiff_t src_stride, 4944 int dst_width, 4873 4945 int source_y_fraction) { 4874 4946 asm volatile ( … … 4892 4964 // General purpose row blend. 4893 4965 LABELALIGN 4894 "1:\n"4966 "1: \n" 4895 4967 "movdqu " MEMACCESS(1) ",%%xmm0 \n" 4896 4968 MEMOPREG(movdqu,0x00,1,4,1,xmm2) … … 4950 5022 #ifdef HAS_INTERPOLATEROW_AVX2 4951 5023 // Bilinear filter 32x2 -> 32x1 4952 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, 4953 ptrdiff_t src_stride, int dst_width, 5024 void InterpolateRow_AVX2(uint8* dst_ptr, 5025 const uint8* src_ptr, 5026 ptrdiff_t src_stride, 5027 int dst_width, 4954 5028 int source_y_fraction) { 4955 5029 asm volatile ( … … 4973 5047 // General purpose row blend. 4974 5048 LABELALIGN 4975 "1:\n"5049 "1: \n" 4976 5050 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" 4977 5051 MEMOPREG(vmovdqu,0x00,1,4,1,ymm2) … … 5026 5100 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 5027 5101 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 5028 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 5029 const uint8* shuffler, int width) { 5102 void ARGBShuffleRow_SSSE3(const uint8* src_argb, 5103 uint8* dst_argb, 5104 const uint8* shuffler, 5105 int width) { 5030 5106 asm volatile ( 5031 5107 "movdqu " MEMACCESS(3) ",%%xmm5 \n" 5032 5108 LABELALIGN 5033 "1:\n"5109 "1: \n" 5034 5110 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 5035 5111 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" … … 5054 5130 #ifdef HAS_ARGBSHUFFLEROW_AVX2 5055 5131 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
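// Hedged scalar sketch of the channel shuffle (illustration only): each
// output byte takes the input byte named by the shuffler mask, which is all
// pshufb does here. A mask of {2,1,0,3} repeated per pixel, for example,
// would swap the B and R channels.
#include <stdint.h>

static void ShufflePixel_sketch(const uint8_t* src_pixel, uint8_t* dst_pixel,
                                const uint8_t* shuffler) {
  for (int i = 0; i < 4; ++i) {
    dst_pixel[i] = src_pixel[shuffler[i] & 3];  // simplified to one pixel
  }
}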
5056 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, 5057 const uint8* shuffler, int width) { 5132 void ARGBShuffleRow_AVX2(const uint8* src_argb, 5133 uint8* dst_argb, 5134 const uint8* shuffler, 5135 int width) { 5058 5136 asm volatile ( 5059 5137 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" 5060 5138 LABELALIGN 5061 "1:\n"5139 "1: \n" 5062 5140 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 5063 5141 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" … … 5083 5161 #ifdef HAS_ARGBSHUFFLEROW_SSE2 5084 5162 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 5085 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, 5086 const uint8* shuffler, int width) { 5163 void ARGBShuffleRow_SSE2(const uint8* src_argb, 5164 uint8* dst_argb, 5165 const uint8* shuffler, 5166 int width) { 5087 5167 uintptr_t pixel_temp; 5088 5168 asm volatile ( … … 5099 5179 5100 5180 LABELALIGN 5101 "1:\n"5181 "1: \n" 5102 5182 "movzb " MEMACCESS(4) ",%2 \n" 5103 5183 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 … … 5205 5285 const uint8* src_u, 5206 5286 const uint8* src_v, 5207 uint8* dst_frame, int width) { 5208 asm volatile ( 5287 uint8* dst_frame, 5288 int width) { 5289 asm volatile ( 5209 5290 "sub %1,%2 \n" 5210 5291 LABELALIGN 5211 "1:\n"5292 "1: \n" 5212 5293 "movq " MEMACCESS(1) ",%%xmm2 \n" 5213 5294 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 … … 5240 5321 const uint8* src_u, 5241 5322 const uint8* src_v, 5242 uint8* dst_frame, int width) { 5243 asm volatile ( 5323 uint8* dst_frame, 5324 int width) { 5325 asm volatile ( 5244 5326 "sub %1,%2 \n" 5245 5327 LABELALIGN 5246 "1:\n"5328 "1: \n" 5247 5329 "movq " MEMACCESS(1) ",%%xmm2 \n" 5248 5330 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 … … 5273 5355 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 5274 5356 void ARGBPolynomialRow_SSE2(const uint8* src_argb, 5275 uint8* dst_argb, const float* poly, 5357 uint8* dst_argb, 5358 const float* poly, 5276 5359 int width) { 5277 5360 asm volatile ( … … 5280 5363 // 2 pixel loop. 5281 5364 LABELALIGN 5282 "1:\n"5365 "1: \n" 5283 5366 "movq " MEMACCESS(0) ",%%xmm0 \n" 5284 5367 "lea " MEMLEA(0x8,0) ",%0 \n" … … 5329 5412 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 5330 5413 void ARGBPolynomialRow_AVX2(const uint8* src_argb, 5331 uint8* dst_argb, const float* poly, 5414 uint8* dst_argb, 5415 const float* poly, 5332 5416 int width) { 5333 5417 asm volatile ( … … 5339 5423 // 2 pixel loop. 5340 5424 LABELALIGN 5341 "1:\n"5425 "1: \n" 5342 5426 "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels 5343 5427 "lea " MEMLEA(0x8,0) ",%0 \n" … … 5367 5451 #endif // HAS_ARGBPOLYNOMIALROW_AVX2 5368 5452 5453 #ifdef HAS_HALFFLOATROW_SSE2 5454 static float kScaleBias = 1.9259299444e-34f; 5455 void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) { 5456 asm volatile ( 5457 "pshufd $0x0,%3,%%xmm4 \n" 5458 "pxor %%xmm5,%%xmm5 \n" 5459 "sub %0,%1 \n" 5460 5461 // 16 pixel loop. 
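// Why kScaleBias works (explanatory note, not from the changeset):
// 1.9259299444e-34f is 2^-112, the difference between the float32 exponent
// bias (127) and the float16 bias (15). Multiplying by scale * 2^-112
// rebiases the exponent, so the psrld $0xd below only has to drop the 13
// extra mantissa bits and the low 16 bits are already a half-float pattern.
// Scalar sketch (inputs are non-negative, so the sign bit never matters):
#include <stdint.h>
#include <string.h>

static uint16_t FloatToHalf_sketch(uint16_t value, float scale) {
  float f = (float)value * scale * 1.9259299444e-34f;  // rebias by 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // reinterpret the float's bit pattern
  return (uint16_t)(bits >> 13);    // align mantissa and exponent fields
}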
5462 LABELALIGN 5463 "1: \n" 5464 "movdqu " MEMACCESS(0) ",%%xmm2 \n" // 8 shorts 5465 "add $0x10,%0 \n" 5466 "movdqa %%xmm2,%%xmm3 \n" 5467 "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 5468 "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats 5469 "punpckhwd %%xmm5,%%xmm3 \n" 5470 "cvtdq2ps %%xmm3,%%xmm3 \n" 5471 "mulps %%xmm4,%%xmm2 \n" 5472 "mulps %%xmm4,%%xmm3 \n" 5473 "psrld $0xd,%%xmm2 \n" 5474 "psrld $0xd,%%xmm3 \n" 5475 "packssdw %%xmm3,%%xmm2 \n" 5476 MEMOPMEM(movdqu,xmm2,-0x10,0,1,1) 5477 "sub $0x8,%2 \n" 5478 "jg 1b \n" 5479 : "+r"(src), // %0 5480 "+r"(dst), // %1 5481 "+r"(width) // %2 5482 : "x"(scale * kScaleBias) // %3 5483 : "memory", "cc", 5484 "xmm2", "xmm3", "xmm4", "xmm5" 5485 ); 5486 } 5487 #endif // HAS_HALFFLOATROW_SSE2 5488 5489 #ifdef HAS_HALFFLOATROW_AVX2 5490 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { 5491 asm volatile ( 5492 "vbroadcastss %3, %%ymm4 \n" 5493 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" 5494 "sub %0,%1 \n" 5495 5496 // 16 pixel loop. 5497 LABELALIGN 5498 "1: \n" 5499 "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts 5500 "add $0x20,%0 \n" 5501 "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates 5502 "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" 5503 "vcvtdq2ps %%ymm3,%%ymm3 \n" 5504 "vcvtdq2ps %%ymm2,%%ymm2 \n" 5505 "vmulps %%ymm3,%%ymm4,%%ymm3 \n" 5506 "vmulps %%ymm2,%%ymm4,%%ymm2 \n" 5507 "vpsrld $0xd,%%ymm3,%%ymm3 \n" 5508 "vpsrld $0xd,%%ymm2,%%ymm2 \n" 5509 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates 5510 MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1) 5511 "sub $0x10,%2 \n" 5512 "jg 1b \n" 5513 5514 "vzeroupper \n" 5515 : "+r"(src), // %0 5516 "+r"(dst), // %1 5517 "+r"(width) // %2 5518 : "x"(scale * kScaleBias) // %3 5519 : "memory", "cc", 5520 "xmm2", "xmm3", "xmm4", "xmm5" 5521 ); 5522 } 5523 #endif // HAS_HALFFLOATROW_AVX2 5524 5525 #ifdef HAS_HALFFLOATROW_F16C 5526 void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { 5527 asm volatile ( 5528 "vbroadcastss %3, %%ymm4 \n" 5529 "sub %0,%1 \n" 5530 5531 // 16 pixel loop. 5532 LABELALIGN 5533 "1: \n" 5534 "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints 5535 "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" 5536 "vcvtdq2ps %%ymm2,%%ymm2 \n" 5537 "vcvtdq2ps %%ymm3,%%ymm3 \n" 5538 "vmulps %%ymm2,%%ymm4,%%ymm2 \n" 5539 "vmulps %%ymm3,%%ymm4,%%ymm3 \n" 5540 "vcvtps2ph $3, %%ymm2, %%xmm2 \n" 5541 "vcvtps2ph $3, %%ymm3, %%xmm3 \n" 5542 MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1) 5543 MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1) 5544 "add $0x20,%0 \n" 5545 "sub $0x10,%2 \n" 5546 "jg 1b \n" 5547 "vzeroupper \n" 5548 : "+r"(src), // %0 5549 "+r"(dst), // %1 5550 "+r"(width) // %2 5551 : "x"(scale) // %3 5552 : "memory", "cc", 5553 "xmm2", "xmm3", "xmm4" 5554 ); 5555 } 5556 #endif // HAS_HALFFLOATROW_F16C 5557 5558 #ifdef HAS_HALFFLOATROW_F16C 5559 void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) { 5560 asm volatile ( 5561 "sub %0,%1 \n" 5562 // 16 pixel loop. 
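// With F16C the conversion collapses into a single vcvtps2ph; the immediate
// $3 selects round-toward-zero. A hedged intrinsics sketch of one 8-element
// step (assumes AVX2 + F16C, e.g. gcc -mavx2 -mf16c; illustration only,
// names are not from the changeset):
#include <immintrin.h>
#include <stdint.h>

static void HalfFloat8_sketch(const uint16_t* src, uint16_t* dst,
                              float scale) {
  __m256 vscale = _mm256_set1_ps(scale);
  __m128i in = _mm_loadu_si128((const __m128i*)src);  // 8 uint16 values
  __m256 f = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(in));
  f = _mm256_mul_ps(f, vscale);
  _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(f, 3));  // 8 half floats
}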
5563 LABELALIGN 5564 "1: \n" 5565 "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints 5566 "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" 5567 "vcvtdq2ps %%ymm2,%%ymm2 \n" 5568 "vcvtdq2ps %%ymm3,%%ymm3 \n" 5569 "vcvtps2ph $3, %%ymm2, %%xmm2 \n" 5570 "vcvtps2ph $3, %%ymm3, %%xmm3 \n" 5571 MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1) 5572 MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1) 5573 "add $0x20,%0 \n" 5574 "sub $0x10,%2 \n" 5575 "jg 1b \n" 5576 "vzeroupper \n" 5577 : "+r"(src), // %0 5578 "+r"(dst), // %1 5579 "+r"(width) // %2 5580 : 5581 : "memory", "cc", 5582 "xmm2", "xmm3" 5583 ); 5584 } 5585 #endif // HAS_HALFFLOATROW_F16C 5586 5369 5587 #ifdef HAS_ARGBCOLORTABLEROW_X86 5370 5588 // Tranform ARGB pixels with color table. 5371 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, 5589 void ARGBColorTableRow_X86(uint8* dst_argb, 5590 const uint8* table_argb, 5372 5591 int width) { 5373 5592 uintptr_t pixel_temp; … … 5375 5594 // 1 pixel loop. 5376 5595 LABELALIGN 5377 "1:\n"5596 "1: \n" 5378 5597 "movzb " MEMACCESS(0) ",%1 \n" 5379 5598 "lea " MEMLEA(0x4,0) ",%0 \n" … … 5406 5625 // 1 pixel loop. 5407 5626 LABELALIGN 5408 "1:\n"5627 "1: \n" 5409 5628 "movzb " MEMACCESS(0) ",%1 \n" 5410 5629 "lea " MEMLEA(0x4,0) ",%0 \n" … … 5429 5648 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 5430 5649 // Tranform RGB pixels with luma table. 5431 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 5650 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, 5651 uint8* dst_argb, 5432 5652 int width, 5433 const uint8* luma, uint32 lumacoeff) { 5653 const uint8* luma, 5654 uint32 lumacoeff) { 5434 5655 uintptr_t pixel_temp; 5435 5656 uintptr_t table_temp; … … 5443 5664 // 4 pixel loop. 5444 5665 LABELALIGN 5445 "1:\n"5666 "1: \n" 5446 5667 "movdqu " MEMACCESS(2) ",%%xmm0 \n" 5447 5668 "pmaddubsw %%xmm3,%%xmm0 \n"
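// Hedged scalar sketch of the table transform above (illustration only):
// each channel value indexes a 1024-byte table laid out with a stride of 4,
// i.e. one 256-entry sub-table per channel, matching the
// (table + value*4 + channel) addressing in the assembly.
#include <stdint.h>

static void ColorTablePixel_sketch(uint8_t* dst_argb,
                                   const uint8_t* table_argb) {
  for (int c = 0; c < 4; ++c) {
    dst_argb[c] = table_argb[dst_argb[c] * 4 + c];  // per-channel lookup
  }
}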