Changeset 5699
- Timestamp: Nov 21, 2017 9:25:11 AM
- Location: pjproject/trunk/third_party
- Files: 39 edited
pjproject/trunk/third_party/build/yuv/Notes.txt
--- r5633
+++ r5699
  Notes:

- * Source code for libyuv from https://chromium.googlesource.com/libyuv/libyuv/ dated 27 July 2017.
+ * Source code for libyuv from https://chromium.googlesource.com/libyuv/libyuv/ dated 17 November 2017.

- * All code is compilable, except for compare_win.cc
-   - Use older version (https://chromium.googlesource.com/libyuv/libyuv/+/baf6a3c1bd385e7ffe6b7634560e71fb49e4f589%5E%21/)
-     Since there's a compiler error on:
-       pmulld xmm0,xmm6
-   - On VS2015, error C2024: 'alignas' attribute applies to variables, data members and tag types only
-       __declspec(naked) __declspec(align(16))
-     Change to:
-       __declspec(naked)
-
- * Added these lines to file include/libyuv/basic_types.h:
-   --
-   #if _MSC_VER==1400
-   # include <stdint.h>   // for uint8_t
-   #endif
-   ...
-   #if defined(_MSC_VER)
-   # pragma warning(disable:4996)   // This function or variable may be unsafe.
-   --
pjproject/trunk/third_party/yuv/include/libyuv/basic_types.h
--- r5633
+++ r5699

  #if defined(_MSC_VER) && (_MSC_VER < 1600)
- #if _MSC_VER==1400
- # include <stdint.h>   // for uint8_t
- #endif
  #include <sys/types.h>  // for uintptr_t on x86
  #else
  #include <stdint.h>  // for uintptr_t
- #endif
-
- #if defined(_MSC_VER)
- # pragma warning(disable:4996)  // This function or variable may be unsafe.
  #endif

pjproject/trunk/third_party/yuv/include/libyuv/compare_row.h
r5633 r5699 20 20 21 21 #if defined(__pnacl__) || defined(__CLR_VER) || \ 22 (defined(__i386__) && !defined(__SSE 2__))22 (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) 23 23 #define LIBYUV_DISABLE_X86 24 24 #endif … … 43 43 #endif // __clang__ 44 44 45 // The following are available for Visual C: 45 46 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ 46 47 (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) … … 53 54 #define HAS_HASHDJB2_SSE41 54 55 #define HAS_SUMSQUAREERROR_SSE2 55 #define HAS_HAMMINGDISTANCE_ X8656 #define HAS_HAMMINGDISTANCE_SSE42 56 57 #endif 57 58 … … 63 64 #endif 64 65 66 // The following are available for GCC and clangcl 64 bit: 67 #if !defined(LIBYUV_DISABLE_X86) && \ 68 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) 69 #define HAS_HAMMINGDISTANCE_SSSE3 70 #endif 71 72 // The following are available for GCC and clangcl 64 bit: 73 #if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \ 74 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) 75 #define HAS_HAMMINGDISTANCE_AVX2 76 #endif 77 65 78 // The following are available for Neon: 66 79 #if !defined(LIBYUV_DISABLE_NEON) && \ … … 70 83 #endif 71 84 85 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 86 #define HAS_HAMMINGDISTANCE_MSA 87 #define HAS_SUMSQUAREERROR_MSA 88 #endif 89 72 90 uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count); 73 uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count); 91 uint32 HammingDistance_SSE42(const uint8* src_a, const uint8* src_b, int count); 92 uint32 HammingDistance_SSSE3(const uint8* src_a, const uint8* src_b, int count); 93 uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count); 74 94 uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count); 95 uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count); 75 96 76 97 uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); … … 78 99 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); 79 100 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); 101 uint32 SumSquareError_MSA(const uint8* src_a, const uint8* src_b, int count); 80 102 81 103 uint32 HashDjb2_C(const uint8* src, int count, uint32 seed); -
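Note: the compare_row.h hunk above renames HammingDistance_X86 to HammingDistance_SSE42 and adds SSSE3, AVX2 and MSA variants. All of them compute the same quantity as HammingDistance_C: the number of differing bits between two byte buffers. A minimal scalar sketch of that computation (an illustration using the GCC/Clang popcount builtin, not the library's tuned code):

  #include <stdint.h>

  // Count of bits that differ between two byte buffers; the SIMD
  // variants declared above accelerate exactly this computation.
  static uint32_t HammingDistanceSketch(const uint8_t* a, const uint8_t* b, int count) {
    uint32_t diff = 0;
    for (int i = 0; i < count; ++i) {
      diff += (uint32_t)__builtin_popcount(a[i] ^ b[i]);  // bits set in the XOR
    }
    return diff;
  }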
pjproject/trunk/third_party/yuv/include/libyuv/convert_from.h
r5633 r5699 179 179 LIBYUV_API 180 180 int I420ToRAW(const uint8* src_y, 181 int src_stride_y, 182 const uint8* src_u, 183 int src_stride_u, 184 const uint8* src_v, 185 int src_stride_v, 186 uint8* dst_frame, 187 int dst_stride_frame, 188 int width, 189 int height); 190 191 LIBYUV_API 192 int H420ToRGB24(const uint8* src_y, 193 int src_stride_y, 194 const uint8* src_u, 195 int src_stride_u, 196 const uint8* src_v, 197 int src_stride_v, 198 uint8* dst_frame, 199 int dst_stride_frame, 200 int width, 201 int height); 202 203 LIBYUV_API 204 int H420ToRAW(const uint8* src_y, 181 205 int src_stride_y, 182 206 const uint8* src_u, -
pjproject/trunk/third_party/yuv/include/libyuv/cpu_id.h
r5633 r5699 37 37 static const int kCpuHasERMS = 0x800; 38 38 static const int kCpuHasFMA3 = 0x1000; 39 static const int kCpuHasAVX3 = 0x2000; 40 static const int kCpuHasF16C = 0x4000; 41 42 // 0x8000 reserved for future X86 flags. 39 static const int kCpuHasF16C = 0x2000; 40 static const int kCpuHasGFNI = 0x4000; 41 static const int kCpuHasAVX512BW = 0x8000; 42 static const int kCpuHasAVX512VL = 0x10000; 43 static const int kCpuHasAVX512VBMI = 0x20000; 44 static const int kCpuHasAVX512VBMI2 = 0x40000; 45 static const int kCpuHasAVX512VBITALG = 0x80000; 46 static const int kCpuHasAVX512VPOPCNTDQ = 0x100000; 43 47 44 48 // These flags are only valid on MIPS processors. 45 static const int kCpuHasMIPS = 0x 10000;46 static const int kCpuHasDSPR2 = 0x 20000;47 static const int kCpuHasMSA = 0x 40000;49 static const int kCpuHasMIPS = 0x200000; 50 static const int kCpuHasDSPR2 = 0x400000; 51 static const int kCpuHasMSA = 0x800000; 48 52 49 53 // Optional init function. TestCpuFlag does an auto-init. -
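Note: cpu_id.h drops the single kCpuHasAVX3 bit in favour of separate AVX-512 sub-feature flags (BW, VL, VBMI, VBMI2, VBITALG, VPOPCNTDQ) plus GFNI, and renumbers the MIPS flags to make room. Call sites keep the existing TestCpuFlag pattern; a hedged dispatch sketch (the branch bodies are placeholders, not libyuv functions):

  #include "libyuv/cpu_id.h"

  void PickPath() {
    // Each kCpuHas* constant is a single bit in the cached cpu_info word.
    if (libyuv::TestCpuFlag(libyuv::kCpuHasAVX512BW) &&
        libyuv::TestCpuFlag(libyuv::kCpuHasAVX512VL)) {
      // dispatch to an AVX-512 code path (placeholder)
    } else if (libyuv::TestCpuFlag(libyuv::kCpuHasAVX2)) {
      // dispatch to an AVX2 code path (placeholder)
    } else {
      // scalar C fallback
    }
  }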
pjproject/trunk/third_party/yuv/include/libyuv/planar_functions.h
r5633 r5699 69 69 int width, 70 70 int height); 71 72 // Split interleaved RGB plane into separate R, G and B planes. 73 LIBYUV_API 74 void SplitRGBPlane(const uint8* src_rgb, 75 int src_stride_rgb, 76 uint8* dst_r, 77 int dst_stride_r, 78 uint8* dst_g, 79 int dst_stride_g, 80 uint8* dst_b, 81 int dst_stride_b, 82 int width, 83 int height); 84 85 // Merge separate R, G and B planes into one interleaved RGB plane. 86 LIBYUV_API 87 void MergeRGBPlane(const uint8* src_r, 88 int src_stride_r, 89 const uint8* src_g, 90 int src_stride_g, 91 const uint8* src_b, 92 int src_stride_b, 93 uint8* dst_rgb, 94 int dst_stride_rgb, 95 int width, 96 int height); 71 97 72 98 // Copy I400. Supports inverting. … … 721 747 722 748 #if defined(__pnacl__) || defined(__CLR_VER) || \ 723 (defined(__i386__) && !defined(__SSE 2__))749 (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) 724 750 #define LIBYUV_DISABLE_X86 725 751 #endif -
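Note: SplitRGBPlane and MergeRGBPlane are new public planar helpers; their prototypes above spell out the full contract (independent stride per plane, negative height inverts the image). A hedged round-trip usage sketch, with buffer names and sizes chosen purely for illustration:

  #include "libyuv/planar_functions.h"

  void SplitAndMerge(const uint8* rgb, int width, int height,
                     uint8* r, uint8* g, uint8* b, uint8* rgb_out) {
    // Packed RGB uses 3 bytes per pixel, each single plane 1 byte per pixel.
    libyuv::SplitRGBPlane(rgb, width * 3,
                          r, width, g, width, b, width,
                          width, height);
    libyuv::MergeRGBPlane(r, width, g, width, b, width,
                          rgb_out, width * 3,
                          width, height);
  }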
pjproject/trunk/third_party/yuv/include/libyuv/rotate_row.h
r5633 r5699 20 20 21 21 #if defined(__pnacl__) || defined(__CLR_VER) || \ 22 (defined(__i386__) && !defined(__SSE 2__))22 (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) 23 23 #define LIBYUV_DISABLE_X86 24 24 #endif … … 30 30 #endif 31 31 // The following are available for Visual C and clangcl 32 bit: 32 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) 32 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 33 33 #define HAS_TRANSPOSEWX8_SSSE3 34 34 #define HAS_TRANSPOSEUVWX8_SSE2 -
pjproject/trunk/third_party/yuv/include/libyuv/row.h
r5633 r5699 32 32 33 33 #if defined(__pnacl__) || defined(__CLR_VER) || \ 34 (defined(__i386__) && !defined(__SSE 2__))34 (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) 35 35 #define LIBYUV_DISABLE_X86 36 36 #endif … … 265 265 #endif 266 266 267 // The following are available for gcc/clang x86 platforms: 268 // TODO(fbarchard): Port to Visual C 269 #if !defined(LIBYUV_DISABLE_X86) && \ 270 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) 271 #define HAS_MERGERGBROW_SSSE3 272 #define HAS_SPLITRGBROW_SSSE3 273 #endif 274 275 // The following are available for AVX2 gcc/clang x86 platforms: 276 // TODO(fbarchard): Port to Visual C 277 #if !defined(LIBYUV_DISABLE_X86) && \ 278 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ 279 (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) 280 #define HAS_MERGEUVROW_16_AVX2 281 #define HAS_MULTIPLYROW_16_AVX2 282 #endif 283 267 284 // The following are available on Neon platforms: 268 285 #if !defined(LIBYUV_DISABLE_NEON) && \ … … 324 341 #define HAS_RGBATOYROW_NEON 325 342 #define HAS_SETROW_NEON 343 #define HAS_SPLITRGBROW_NEON 326 344 #define HAS_SPLITUVROW_NEON 327 345 #define HAS_UYVYTOARGBROW_NEON … … 353 371 #define HAS_SOBELXYROW_NEON 354 372 #define HAS_SOBELYROW_NEON 373 #endif 374 375 // The following are available on AArch64 platforms: 376 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 377 #define HAS_SCALESUMSAMPLES_NEON 355 378 #endif 356 379 … … 386 409 387 410 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 411 #define HAS_ABGRTOUVROW_MSA 412 #define HAS_ABGRTOYROW_MSA 413 #define HAS_ARGB1555TOARGBROW_MSA 414 #define HAS_ARGB1555TOUVROW_MSA 415 #define HAS_ARGB1555TOYROW_MSA 416 #define HAS_ARGB4444TOARGBROW_MSA 417 #define HAS_ARGBADDROW_MSA 418 #define HAS_ARGBATTENUATEROW_MSA 419 #define HAS_ARGBBLENDROW_MSA 420 #define HAS_ARGBCOLORMATRIXROW_MSA 421 #define HAS_ARGBEXTRACTALPHAROW_MSA 422 #define HAS_ARGBGRAYROW_MSA 388 423 #define HAS_ARGBMIRRORROW_MSA 424 #define HAS_ARGBMULTIPLYROW_MSA 425 #define HAS_ARGBQUANTIZEROW_MSA 426 #define HAS_ARGBSEPIAROW_MSA 427 #define HAS_ARGBSETROW_MSA 428 #define HAS_ARGBSHADEROW_MSA 429 #define HAS_ARGBSHUFFLEROW_MSA 430 #define HAS_ARGBSUBTRACTROW_MSA 431 #define HAS_ARGBTOARGB1555ROW_MSA 432 #define HAS_ARGBTOARGB4444ROW_MSA 433 #define HAS_ARGBTORAWROW_MSA 434 #define HAS_ARGBTORGB24ROW_MSA 435 #define HAS_ARGBTORGB565DITHERROW_MSA 436 #define HAS_ARGBTORGB565ROW_MSA 437 #define HAS_ARGBTOUV444ROW_MSA 438 #define HAS_ARGBTOUVJROW_MSA 439 #define HAS_ARGBTOUVROW_MSA 440 #define HAS_ARGBTOYJROW_MSA 441 #define HAS_ARGBTOYROW_MSA 442 #define HAS_BGRATOUVROW_MSA 443 #define HAS_BGRATOYROW_MSA 444 #define HAS_HALFFLOATROW_MSA 445 #define HAS_I400TOARGBROW_MSA 446 #define HAS_I422ALPHATOARGBROW_MSA 447 #define HAS_I422TOARGBROW_MSA 448 #define HAS_I422TORGB24ROW_MSA 449 #define HAS_I422TORGBAROW_MSA 389 450 #define HAS_I422TOUYVYROW_MSA 390 451 #define HAS_I422TOYUY2ROW_MSA 452 #define HAS_I444TOARGBROW_MSA 453 #define HAS_INTERPOLATEROW_MSA 454 #define HAS_J400TOARGBROW_MSA 455 #define HAS_MERGEUVROW_MSA 391 456 #define HAS_MIRRORROW_MSA 457 #define HAS_MIRRORUVROW_MSA 458 #define HAS_NV12TOARGBROW_MSA 459 #define HAS_NV12TORGB565ROW_MSA 460 #define HAS_NV21TOARGBROW_MSA 461 #define HAS_RAWTOARGBROW_MSA 462 #define HAS_RAWTORGB24ROW_MSA 463 #define HAS_RAWTOUVROW_MSA 464 #define HAS_RAWTOYROW_MSA 465 #define HAS_RGB24TOARGBROW_MSA 466 #define HAS_RGB24TOUVROW_MSA 467 #define HAS_RGB24TOYROW_MSA 468 #define 
HAS_RGB565TOARGBROW_MSA 469 #define HAS_RGB565TOUVROW_MSA 470 #define HAS_RGB565TOYROW_MSA 471 #define HAS_RGBATOUVROW_MSA 472 #define HAS_RGBATOYROW_MSA 473 #define HAS_SETROW_MSA 474 #define HAS_SOBELROW_MSA 475 #define HAS_SOBELTOPLANEROW_MSA 476 #define HAS_SOBELXROW_MSA 477 #define HAS_SOBELXYROW_MSA 478 #define HAS_SOBELYROW_MSA 479 #define HAS_SPLITUVROW_MSA 480 #define HAS_UYVYTOARGBROW_MSA 392 481 #define HAS_UYVYTOUVROW_MSA 393 482 #define HAS_UYVYTOYROW_MSA 483 #define HAS_YUY2TOARGBROW_MSA 394 484 #define HAS_YUY2TOUV422ROW_MSA 395 485 #define HAS_YUY2TOUVROW_MSA 396 486 #define HAS_YUY2TOYROW_MSA 397 #define HAS_ARGB4444TOARGBROW_MSA398 #define HAS_ARGBTOYROW_MSA399 #define HAS_ARGBTOUVROW_MSA400 #define HAS_I422TOARGBROW_MSA401 #define HAS_I422TORGBAROW_MSA402 #define HAS_I422ALPHATOARGBROW_MSA403 #define HAS_I422TORGB24ROW_MSA404 #define HAS_ARGBTORGB24ROW_MSA405 #define HAS_ARGBTORAWROW_MSA406 #define HAS_ARGBTORGB565ROW_MSA407 #define HAS_ARGBTOARGB1555ROW_MSA408 #define HAS_ARGBTOARGB4444ROW_MSA409 #define HAS_ARGBTOUV444ROW_MSA410 #define HAS_ARGBMULTIPLYROW_MSA411 #define HAS_ARGBADDROW_MSA412 #define HAS_ARGBSUBTRACTROW_MSA413 #define HAS_ARGBATTENUATEROW_MSA414 #define HAS_ARGBTORGB565DITHERROW_MSA415 #define HAS_ARGBSHUFFLEROW_MSA416 #define HAS_ARGBSHADEROW_MSA417 #define HAS_ARGBGRAYROW_MSA418 #define HAS_ARGBSEPIAROW_MSA419 #define HAS_ARGB1555TOARGBROW_MSA420 #define HAS_RGB565TOARGBROW_MSA421 #define HAS_RGB24TOARGBROW_MSA422 #define HAS_RAWTOARGBROW_MSA423 #define HAS_ARGB1555TOYROW_MSA424 #define HAS_RGB565TOYROW_MSA425 #define HAS_RGB24TOYROW_MSA426 #define HAS_RAWTOYROW_MSA427 #define HAS_ARGB1555TOUVROW_MSA428 #define HAS_RGB565TOUVROW_MSA429 #define HAS_RGB24TOUVROW_MSA430 #define HAS_RAWTOUVROW_MSA431 #define HAS_NV12TOARGBROW_MSA432 #define HAS_NV12TORGB565ROW_MSA433 #define HAS_NV21TOARGBROW_MSA434 #define HAS_SOBELROW_MSA435 #define HAS_SOBELTOPLANEROW_MSA436 #define HAS_SOBELXYROW_MSA437 #define HAS_ARGBTOYJROW_MSA438 #define HAS_BGRATOYROW_MSA439 #define HAS_ABGRTOYROW_MSA440 #define HAS_RGBATOYROW_MSA441 #define HAS_ARGBTOUVJROW_MSA442 #define HAS_BGRATOUVROW_MSA443 #define HAS_ABGRTOUVROW_MSA444 #define HAS_RGBATOUVROW_MSA445 #define HAS_I444TOARGBROW_MSA446 #define HAS_I400TOARGBROW_MSA447 #define HAS_J400TOARGBROW_MSA448 #define HAS_YUY2TOARGBROW_MSA449 #define HAS_UYVYTOARGBROW_MSA450 #define HAS_INTERPOLATEROW_MSA451 #define HAS_ARGBSETROW_MSA452 #define HAS_RAWTORGB24ROW_MSA453 #define HAS_MERGEUVROW_MSA454 487 #endif 455 488 … … 1346 1379 uint8* dst_v, 1347 1380 int width); 1381 void MirrorUVRow_MSA(const uint8* src_uv, 1382 uint8* dst_u, 1383 uint8* dst_v, 1384 int width); 1348 1385 void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); 1349 1386 … … 1375 1412 uint8* dst_v, 1376 1413 int width); 1414 void SplitUVRow_MSA(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); 1377 1415 void SplitUVRow_Any_SSE2(const uint8* src_uv, 1378 1416 uint8* dst_u, … … 1391 1429 uint8* dst_v, 1392 1430 int width); 1431 void SplitUVRow_Any_MSA(const uint8* src_uv, 1432 uint8* dst_u, 1433 uint8* dst_v, 1434 int width); 1393 1435 1394 1436 void MergeUVRow_C(const uint8* src_u, … … 1429 1471 int width); 1430 1472 1473 void SplitRGBRow_C(const uint8* src_rgb, 1474 uint8* dst_r, 1475 uint8* dst_g, 1476 uint8* dst_b, 1477 int width); 1478 void SplitRGBRow_SSSE3(const uint8* src_rgb, 1479 uint8* dst_r, 1480 uint8* dst_g, 1481 uint8* dst_b, 1482 int width); 1483 void SplitRGBRow_NEON(const uint8* src_rgb, 1484 uint8* dst_r, 1485 
uint8* dst_g, 1486 uint8* dst_b, 1487 int width); 1488 void SplitRGBRow_Any_SSSE3(const uint8* src_rgb, 1489 uint8* dst_r, 1490 uint8* dst_g, 1491 uint8* dst_b, 1492 int width); 1493 void SplitRGBRow_Any_NEON(const uint8* src_rgb, 1494 uint8* dst_r, 1495 uint8* dst_g, 1496 uint8* dst_b, 1497 int width); 1498 1499 void MergeRGBRow_C(const uint8* src_r, 1500 const uint8* src_g, 1501 const uint8* src_b, 1502 uint8* dst_rgb, 1503 int width); 1504 void MergeRGBRow_SSSE3(const uint8* src_r, 1505 const uint8* src_g, 1506 const uint8* src_b, 1507 uint8* dst_rgb, 1508 int width); 1509 void MergeRGBRow_NEON(const uint8* src_r, 1510 const uint8* src_g, 1511 const uint8* src_b, 1512 uint8* dst_rgb, 1513 int width); 1514 void MergeRGBRow_Any_SSSE3(const uint8* src_r, 1515 const uint8* src_g, 1516 const uint8* src_b, 1517 uint8* dst_rgb, 1518 int width); 1519 void MergeRGBRow_Any_NEON(const uint8* src_r, 1520 const uint8* src_g, 1521 const uint8* src_b, 1522 uint8* dst_rgb, 1523 int width); 1524 1525 void MergeUVRow_16_C(const uint16* src_u, 1526 const uint16* src_v, 1527 uint16* dst_uv, 1528 int scale, /* 64 for 10 bit */ 1529 int width); 1530 void MergeUVRow_16_AVX2(const uint16* src_u, 1531 const uint16* src_v, 1532 uint16* dst_uv, 1533 int scale, 1534 int width); 1535 1536 void MultiplyRow_16_AVX2(const uint16* src_y, 1537 uint16* dst_y, 1538 int scale, 1539 int width); 1540 void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width); 1541 1431 1542 void CopyRow_SSE2(const uint8* src, uint8* dst, int count); 1432 1543 void CopyRow_AVX(const uint8* src, uint8* dst, int count); … … 1455 1566 void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width); 1456 1567 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width); 1568 void ARGBExtractAlphaRow_MSA(const uint8* src_argb, uint8* dst_a, int width); 1457 1569 void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, 1458 1570 uint8* dst_a, … … 1464 1576 uint8* dst_a, 1465 1577 int width); 1578 void ARGBExtractAlphaRow_Any_MSA(const uint8* src_argb, 1579 uint8* dst_a, 1580 int width); 1466 1581 1467 1582 void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); … … 1476 1591 1477 1592 void SetRow_C(uint8* dst, uint8 v8, int count); 1593 void SetRow_MSA(uint8* dst, uint8 v8, int count); 1478 1594 void SetRow_X86(uint8* dst, uint8 v8, int count); 1479 1595 void SetRow_ERMS(uint8* dst, uint8 v8, int count); … … 2123 2239 uint8* dst_argb, 2124 2240 int width); 2241 void ARGBBlendRow_MSA(const uint8* src_argb, 2242 const uint8* src_argb1, 2243 uint8* dst_argb, 2244 int width); 2125 2245 void ARGBBlendRow_C(const uint8* src_argb, 2126 2246 const uint8* src_argb1, … … 2836 2956 const int8* matrix_argb, 2837 2957 int width); 2958 void ARGBColorMatrixRow_MSA(const uint8* src_argb, 2959 uint8* dst_argb, 2960 const int8* matrix_argb, 2961 int width); 2838 2962 2839 2963 void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); … … 2858 2982 int interval_offset, 2859 2983 int width); 2984 void ARGBQuantizeRow_MSA(uint8* dst_argb, 2985 int scale, 2986 int interval_size, 2987 int interval_offset, 2988 int width); 2860 2989 2861 2990 void ARGBShadeRow_C(const uint8* src_argb, … … 2991 3120 uint8* dst_sobelx, 2992 3121 int width); 3122 void SobelXRow_MSA(const uint8* src_y0, 3123 const uint8* src_y1, 3124 const uint8* src_y2, 3125 uint8* dst_sobelx, 3126 int width); 2993 3127 void SobelYRow_C(const uint8* src_y0, 2994 3128 const uint8* src_y1, … … 3003 3137 uint8* 
dst_sobely, 3004 3138 int width); 3139 void SobelYRow_MSA(const uint8* src_y0, 3140 const uint8* src_y1, 3141 uint8* dst_sobely, 3142 int width); 3005 3143 void SobelRow_C(const uint8* src_sobelx, 3006 3144 const uint8* src_sobely, … … 3133 3271 float scale, 3134 3272 int width); 3273 void HalfFloatRow_MSA(const uint16* src, uint16* dst, float scale, int width); 3274 void HalfFloatRow_Any_MSA(const uint16* src, 3275 uint16* dst, 3276 float scale, 3277 int width); 3135 3278 3136 3279 void ARGBLumaColorTableRow_C(const uint8* src_argb, … … 3145 3288 uint32 lumacoeff); 3146 3289 3290 float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width); 3291 float ScaleMaxSamples_NEON(const float* src, 3292 float* dst, 3293 float scale, 3294 int width); 3295 float ScaleSumSamples_C(const float* src, float* dst, float scale, int width); 3296 float ScaleSumSamples_NEON(const float* src, 3297 float* dst, 3298 float scale, 3299 int width); 3300 void ScaleSamples_C(const float* src, float* dst, float scale, int width); 3301 void ScaleSamples_NEON(const float* src, float* dst, float scale, int width); 3302 3147 3303 #ifdef __cplusplus 3148 3304 } // extern "C" -
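Note: among the new row.h declarations, MergeUVRow_16 carries a scale argument annotated "64 for 10 bit". A plausible scalar reading of that interface (an assumption based on the comment, not the library's actual kernel): each 10-bit sample is multiplied up so it occupies the high bits of a 16-bit lane before U and V are interleaved.

  #include <stdint.h>

  // Hedged sketch of MergeUVRow_16-style interleaving.
  // scale = 64 promotes 10-bit data (max 1023) toward the top of 16 bits.
  static void MergeUVRow16Sketch(const uint16_t* src_u, const uint16_t* src_v,
                                 uint16_t* dst_uv, int scale, int width) {
    for (int x = 0; x < width; ++x) {
      dst_uv[2 * x + 0] = (uint16_t)(src_u[x] * scale);
      dst_uv[2 * x + 1] = (uint16_t)(src_v[x] * scale);
    }
  }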
pjproject/trunk/third_party/yuv/include/libyuv/scale_row.h
r5633 r5699 21 21 22 22 #if defined(__pnacl__) || defined(__CLR_VER) || \ 23 (defined(__i386__) && !defined(__SSE 2__))23 (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) 24 24 #define LIBYUV_DISABLE_X86 25 25 #endif … … 106 106 107 107 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 108 #define HAS_SCALEADDROW_MSA 109 #define HAS_SCALEARGBCOLS_MSA 110 #define HAS_SCALEARGBFILTERCOLS_MSA 108 111 #define HAS_SCALEARGBROWDOWN2_MSA 109 112 #define HAS_SCALEARGBROWDOWNEVEN_MSA 113 #define HAS_SCALEFILTERCOLS_MSA 110 114 #define HAS_SCALEROWDOWN2_MSA 115 #define HAS_SCALEROWDOWN34_MSA 116 #define HAS_SCALEROWDOWN38_MSA 111 117 #define HAS_SCALEROWDOWN4_MSA 112 #define HAS_SCALEROWDOWN38_MSA113 #define HAS_SCALEADDROW_MSA114 118 #endif 115 119 … … 547 551 int x, 548 552 int dx); 553 void ScaleARGBFilterCols_MSA(uint8* dst_argb, 554 const uint8* src_argb, 555 int dst_width, 556 int x, 557 int dx); 558 void ScaleARGBCols_MSA(uint8* dst_argb, 559 const uint8* src_argb, 560 int dst_width, 561 int x, 562 int dx); 563 void ScaleARGBFilterCols_Any_MSA(uint8* dst_argb, 564 const uint8* src_argb, 565 int dst_width, 566 int x, 567 int dx); 568 void ScaleARGBCols_Any_MSA(uint8* dst_argb, 569 const uint8* src_argb, 570 int dst_width, 571 int x, 572 int dx); 549 573 550 574 // ARGB Row functions … … 886 910 int dst_width); 887 911 void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); 912 void ScaleFilterCols_MSA(uint8* dst_ptr, 913 const uint8* src_ptr, 914 int dst_width, 915 int x, 916 int dx); 917 void ScaleRowDown34_MSA(const uint8* src_ptr, 918 ptrdiff_t src_stride, 919 uint8* dst_ptr, 920 int dst_width); 921 void ScaleRowDown34_0_Box_MSA(const uint8* src_ptr, 922 ptrdiff_t src_stride, 923 uint8* dst_ptr, 924 int dst_width); 925 void ScaleRowDown34_1_Box_MSA(const uint8* src_ptr, 926 ptrdiff_t src_stride, 927 uint8* dst_ptr, 928 int dst_width); 929 888 930 void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr, 889 931 ptrdiff_t src_stride, … … 921 963 uint16_t* dst_ptr, 922 964 int src_width); 965 void ScaleFilterCols_Any_MSA(uint8* dst_ptr, 966 const uint8* src_ptr, 967 int dst_width, 968 int x, 969 int dx); 970 void ScaleRowDown34_Any_MSA(const uint8* src_ptr, 971 ptrdiff_t src_stride, 972 uint8* dst_ptr, 973 int dst_width); 974 void ScaleRowDown34_0_Box_Any_MSA(const uint8* src_ptr, 975 ptrdiff_t src_stride, 976 uint8* dst_ptr, 977 int dst_width); 978 void ScaleRowDown34_1_Box_Any_MSA(const uint8* src_ptr, 979 ptrdiff_t src_stride, 980 uint8* dst_ptr, 981 int dst_width); 923 982 924 983 #ifdef __cplusplus -
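Note: the new MSA scale symbols (ScaleFilterCols_MSA, ScaleRowDown34_*_MSA, ...) mirror the existing C and NEON column-filtering kernels, which step across the source row in 16.16 fixed point. A rough scalar sketch of that addressing scheme, assuming the 16.16 x/dx convention (rounding and last-pixel edge handling omitted):

  #include <stdint.h>

  // x and dx are 16.16 fixed point: the integer part selects the source pixel,
  // the fractional part blends it with its right-hand neighbour.
  static void ScaleFilterColsSketch(uint8_t* dst, const uint8_t* src,
                                    int dst_width, int x, int dx) {
    for (int j = 0; j < dst_width; ++j) {
      int xi = x >> 16;       // source pixel index
      int f = x & 0xffff;     // 16-bit blend fraction
      int a = src[xi];
      int b = src[xi + 1];
      dst[j] = (uint8_t)((a * (65536 - f) + b * f) >> 16);
      x += dx;
    }
  }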
pjproject/trunk/third_party/yuv/include/libyuv/version.h
--- r5633
+++ r5699
  #define INCLUDE_LIBYUV_VERSION_H_

- #define LIBYUV_VERSION 1662
+ #define LIBYUV_VERSION 1678

  #endif  // INCLUDE_LIBYUV_VERSION_H_
pjproject/trunk/third_party/yuv/source/compare.cc
r5633 r5699 111 111 } 112 112 113 // NEON version accumulates in 16 bit shorts which overflow at 65536 bytes. 114 // So actual maximum is 1 less loop, which is 64436 - 32 bytes. 115 113 116 LIBYUV_API 114 117 uint64 ComputeHammingDistance(const uint8* src_a, 115 118 const uint8* src_b, 116 119 int count) { 117 const int kBlockSize = 65536; 118 int remainder = count & (kBlockSize - 1) & ~31; 120 const int kBlockSize = 1 << 15; // 32768; 121 const int kSimdSize = 64; 122 // SIMD for multiple of 64, and C for remainder 123 int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1); 119 124 uint64 diff = 0; 120 125 int i; … … 126 131 } 127 132 #endif 128 #if defined(HAS_HAMMINGDISTANCE_X86) 129 if (TestCpuFlag(kCpuHasX86)) { 130 HammingDistance = HammingDistance_X86; 133 #if defined(HAS_HAMMINGDISTANCE_SSSE3) 134 if (TestCpuFlag(kCpuHasSSSE3)) { 135 HammingDistance = HammingDistance_SSSE3; 136 } 137 #endif 138 #if defined(HAS_HAMMINGDISTANCE_SSE42) 139 if (TestCpuFlag(kCpuHasSSE42)) { 140 HammingDistance = HammingDistance_SSE42; 131 141 } 132 142 #endif … … 136 146 } 137 147 #endif 148 #if defined(HAS_HAMMINGDISTANCE_MSA) 149 if (TestCpuFlag(kCpuHasMSA)) { 150 HammingDistance = HammingDistance_MSA; 151 } 152 #endif 138 153 #ifdef _OPENMP 139 154 #pragma omp parallel for reduction(+ : diff) … … 149 164 src_b += remainder; 150 165 } 151 remainder = count & 31;166 remainder = count & (kSimdSize - 1); 152 167 if (remainder) { 153 168 diff += HammingDistance_C(src_a, src_b, remainder); … … 185 200 // Note only used for multiples of 32 so count is not checked. 186 201 SumSquareError = SumSquareError_AVX2; 202 } 203 #endif 204 #if defined(HAS_SUMSQUAREERROR_MSA) 205 if (TestCpuFlag(kCpuHasMSA)) { 206 SumSquareError = SumSquareError_MSA; 187 207 } 188 208 #endif -
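Note: the rewritten ComputeHammingDistance splits the byte count into three pieces: whole kBlockSize = 32768 blocks for the SIMD kernel (half the size at which, per the new comment, the NEON 16-bit accumulators could overflow), a multiple-of-kSimdSize = 64 remainder also handled by SIMD, and a final sub-64-byte tail handled by HammingDistance_C. A worked example of the arithmetic for count = 100000 bytes:

  // remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1)
  //           = 100000 & 32767 & ~63      = 1664 bytes  (SIMD)
  // full blocks: 3 * 32768                = 98304 bytes (SIMD, OpenMP-parallel if enabled)
  // tail: count & (kSimdSize - 1) = 100000 & 63 = 32 bytes (HammingDistance_C)
  // total: 98304 + 1664 + 32 = 100000 bytes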
pjproject/trunk/third_party/yuv/source/compare_common.cc
r5633 r5699 19 19 20 20 #if ORIGINAL_OPT 21 uint32 HammingDistance_C (const uint8* src_a, const uint8* src_b, int count) {21 uint32 HammingDistance_C1(const uint8* src_a, const uint8* src_b, int count) { 22 22 uint32 diff = 0u; 23 23 … … 59 59 src_b += 4; 60 60 } 61 62 for (; i < count; ++i) { 63 uint32 x = *src_a ^ *src_b; 64 uint32 u = x - ((x >> 1) & 0x55); 65 u = ((u >> 2) & 0x33) + (u & 0x33); 66 diff += (u + (u >> 4)) & 0x0f; 67 src_a += 1; 68 src_b += 1; 69 } 70 61 71 return diff; 62 72 } -
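Note: the new scalar tail added to HammingDistance_C counts bits one byte at a time with the classic SWAR masks (0x55, 0x33, 0x0f). Worked through for a single byte x = 0xB7 (binary 10110111, six bits set):

  // x = a[i] ^ b[i]                     = 0xB7
  // u = x - ((x >> 1) & 0x55)           = 0xB7 - 0x51 = 0x66  (each bit pair now holds its 2-bit count)
  // u = ((u >> 2) & 0x33) + (u & 0x33)  = 0x11 + 0x22 = 0x33  (each nibble now holds its 4-bit count)
  // (u + (u >> 4)) & 0x0f               = (0x33 + 0x03) & 0x0f = 6 set bits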
pjproject/trunk/third_party/yuv/source/compare_gcc.cc
r5633 r5699 23 23 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) 24 24 25 uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) { 25 #if defined(__x86_64__) 26 uint32 HammingDistance_SSE42(const uint8* src_a, 27 const uint8* src_b, 28 int count) { 29 uint64 diff = 0u; 30 31 asm volatile( 32 "xor %3,%3 \n" 33 "xor %%r8,%%r8 \n" 34 "xor %%r9,%%r9 \n" 35 "xor %%r10,%%r10 \n" 36 37 // Process 32 bytes per loop. 38 LABELALIGN 39 "1: \n" 40 "mov (%0),%%rcx \n" 41 "mov 0x8(%0),%%rdx \n" 42 "xor (%1),%%rcx \n" 43 "xor 0x8(%1),%%rdx \n" 44 "popcnt %%rcx,%%rcx \n" 45 "popcnt %%rdx,%%rdx \n" 46 "mov 0x10(%0),%%rsi \n" 47 "mov 0x18(%0),%%rdi \n" 48 "xor 0x10(%1),%%rsi \n" 49 "xor 0x18(%1),%%rdi \n" 50 "popcnt %%rsi,%%rsi \n" 51 "popcnt %%rdi,%%rdi \n" 52 "add $0x20,%0 \n" 53 "add $0x20,%1 \n" 54 "add %%rcx,%3 \n" 55 "add %%rdx,%%r8 \n" 56 "add %%rsi,%%r9 \n" 57 "add %%rdi,%%r10 \n" 58 "sub $0x20,%2 \n" 59 "jg 1b \n" 60 61 "add %%r8, %3 \n" 62 "add %%r9, %3 \n" 63 "add %%r10, %3 \n" 64 : "+r"(src_a), // %0 65 "+r"(src_b), // %1 66 "+r"(count), // %2 67 "=r"(diff) // %3 68 : 69 : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); 70 71 return static_cast<uint32>(diff); 72 } 73 #else 74 uint32 HammingDistance_SSE42(const uint8* src_a, 75 const uint8* src_b, 76 int count) { 26 77 uint32 diff = 0u; 27 78 28 int i; 29 for (i = 0; i < count - 7; i += 8) { 30 uint64 x = *((uint64*)src_a) ^ *((uint64*)src_b); 31 src_a += 8; 32 src_b += 8; 33 diff += __builtin_popcountll(x); 34 } 79 asm volatile( 80 // Process 16 bytes per loop. 81 LABELALIGN 82 "1: \n" 83 "mov (%0),%%ecx \n" 84 "mov 0x4(%0),%%edx \n" 85 "xor (%1),%%ecx \n" 86 "xor 0x4(%1),%%edx \n" 87 "popcnt %%ecx,%%ecx \n" 88 "add %%ecx,%3 \n" 89 "popcnt %%edx,%%edx \n" 90 "add %%edx,%3 \n" 91 "mov 0x8(%0),%%ecx \n" 92 "mov 0xc(%0),%%edx \n" 93 "xor 0x8(%1),%%ecx \n" 94 "xor 0xc(%1),%%edx \n" 95 "popcnt %%ecx,%%ecx \n" 96 "add %%ecx,%3 \n" 97 "popcnt %%edx,%%edx \n" 98 "add %%edx,%3 \n" 99 "add $0x10,%0 \n" 100 "add $0x10,%1 \n" 101 "sub $0x10,%2 \n" 102 "jg 1b \n" 103 : "+r"(src_a), // %0 104 "+r"(src_b), // %1 105 "+r"(count), // %2 106 "+r"(diff) // %3 107 : 108 : "memory", "cc", "ecx", "edx"); 109 35 110 return diff; 36 111 } 112 #endif 113 114 static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, 115 15, 15, 15, 15, 15, 15, 15, 15}; 116 static vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; 117 118 uint32 HammingDistance_SSSE3(const uint8* src_a, 119 const uint8* src_b, 120 int count) { 121 uint32 diff = 0u; 122 123 asm volatile( 124 "movdqa %4,%%xmm2 \n" 125 "movdqa %5,%%xmm3 \n" 126 "pxor %%xmm0,%%xmm0 \n" 127 "pxor %%xmm1,%%xmm1 \n" 128 "sub %0,%1 \n" 129 130 LABELALIGN 131 "1: \n" 132 "movdqa (%0),%%xmm4 \n" 133 "movdqa 0x10(%0), %%xmm5 \n" 134 "pxor (%0,%1), %%xmm4 \n" 135 "movdqa %%xmm4,%%xmm6 \n" 136 "pand %%xmm2,%%xmm6 \n" 137 "psrlw $0x4,%%xmm4 \n" 138 "movdqa %%xmm3,%%xmm7 \n" 139 "pshufb %%xmm6,%%xmm7 \n" 140 "pand %%xmm2,%%xmm4 \n" 141 "movdqa %%xmm3,%%xmm6 \n" 142 "pshufb %%xmm4,%%xmm6 \n" 143 "paddb %%xmm7,%%xmm6 \n" 144 "pxor 0x10(%0,%1),%%xmm5 \n" 145 "add $0x20,%0 \n" 146 "movdqa %%xmm5,%%xmm4 \n" 147 "pand %%xmm2,%%xmm5 \n" 148 "psrlw $0x4,%%xmm4 \n" 149 "movdqa %%xmm3,%%xmm7 \n" 150 "pshufb %%xmm5,%%xmm7 \n" 151 "pand %%xmm2,%%xmm4 \n" 152 "movdqa %%xmm3,%%xmm5 \n" 153 "pshufb %%xmm4,%%xmm5 \n" 154 "paddb %%xmm7,%%xmm5 \n" 155 "paddb %%xmm5,%%xmm6 \n" 156 "psadbw %%xmm1,%%xmm6 \n" 157 "paddd %%xmm6,%%xmm0 \n" 158 "sub $0x20,%2 \n" 159 "jg 1b \n" 160 161 
"pshufd $0xaa,%%xmm0,%%xmm1 \n" 162 "paddd %%xmm1,%%xmm0 \n" 163 "movd %%xmm0, %3 \n" 164 : "+r"(src_a), // %0 165 "+r"(src_b), // %1 166 "+r"(count), // %2 167 "=r"(diff) // %3 168 : "m"(kNibbleMask), // %4 169 "m"(kBitCount) // %5 170 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", 171 "xmm7"); 172 173 return diff; 174 } 175 176 #ifdef HAS_HAMMINGDISTANCE_AVX2 177 uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) { 178 uint32 diff = 0u; 179 180 asm volatile( 181 "vbroadcastf128 %4,%%ymm2 \n" 182 "vbroadcastf128 %5,%%ymm3 \n" 183 "vpxor %%ymm0,%%ymm0,%%ymm0 \n" 184 "vpxor %%ymm1,%%ymm1,%%ymm1 \n" 185 "sub %0,%1 \n" 186 187 LABELALIGN 188 "1: \n" 189 "vmovdqa (%0),%%ymm4 \n" 190 "vmovdqa 0x20(%0), %%ymm5 \n" 191 "vpxor (%0,%1), %%ymm4, %%ymm4 \n" 192 "vpand %%ymm2,%%ymm4,%%ymm6 \n" 193 "vpsrlw $0x4,%%ymm4,%%ymm4 \n" 194 "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" 195 "vpand %%ymm2,%%ymm4,%%ymm4 \n" 196 "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" 197 "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" 198 "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" 199 "add $0x40,%0 \n" 200 "vpand %%ymm2,%%ymm4,%%ymm5 \n" 201 "vpsrlw $0x4,%%ymm4,%%ymm4 \n" 202 "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" 203 "vpand %%ymm2,%%ymm4,%%ymm4 \n" 204 "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" 205 "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" 206 "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" 207 "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" 208 "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" 209 "sub $0x40,%2 \n" 210 "jg 1b \n" 211 212 "vpermq $0xb1,%%ymm0,%%ymm1 \n" 213 "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" 214 "vpermq $0xaa,%%ymm0,%%ymm1 \n" 215 "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" 216 "vmovd %%xmm0, %3 \n" 217 "vzeroupper \n" 218 : "+r"(src_a), // %0 219 "+r"(src_b), // %1 220 "+r"(count), // %2 221 "=r"(diff) // %3 222 : "m"(kNibbleMask), // %4 223 "m"(kBitCount) // %5 224 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); 225 226 return diff; 227 } 228 #endif // HAS_HAMMINGDISTANCE_AVX2 37 229 38 230 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { -
pjproject/trunk/third_party/yuv/source/compare_neon.cc
r5633 r5699 27 27 uint32 diff; 28 28 29 asm volatile 30 "vmov.u16 q4, #0 \n" // accumulator29 asm volatile( 30 "vmov.u16 q4, #0 \n" // accumulator 31 31 32 "1:\n"33 "vld1.8 {q0, q1}, [%0]! \n"34 "vld1.8 {q2, q3}, [%1]! \n"35 "veor.32 q0, q0, q2 \n"36 "veor.32 q1, q1, q3 \n"37 "vcnt.i8 q0, q0 \n"38 "vcnt.i8 q1, q1 \n"39 "subs %2, %2, #32 \n"40 "vadd.u8 q0, q0, q1 \n" // 16 byte counts41 "vpadal.u8 q4, q0 \n" // 8 shorts42 "bgt 1b \n"32 "1: \n" 33 "vld1.8 {q0, q1}, [%0]! \n" 34 "vld1.8 {q2, q3}, [%1]! \n" 35 "veor.32 q0, q0, q2 \n" 36 "veor.32 q1, q1, q3 \n" 37 "vcnt.i8 q0, q0 \n" 38 "vcnt.i8 q1, q1 \n" 39 "subs %2, %2, #32 \n" 40 "vadd.u8 q0, q0, q1 \n" // 16 byte counts 41 "vpadal.u8 q4, q0 \n" // 8 shorts 42 "bgt 1b \n" 43 43 44 "vpaddl.u16 q0, q4 \n" // 4 ints 45 "vpadd.u32 d0, d0, d1 \n" 46 "vpadd.u32 d0, d0, d0 \n" 47 "vmov.32 %3, d0[0] \n" 48 49 : "+r"(src_a), 50 "+r"(src_b), 51 "+r"(count), 52 "=r"(diff) 53 : 54 : "cc", "q0", "q1", "q2", "q3", "q4"); 44 "vpaddl.u16 q0, q4 \n" // 4 ints 45 "vpadd.u32 d0, d0, d1 \n" 46 "vpadd.u32 d0, d0, d0 \n" 47 "vmov.32 %3, d0[0] \n" 48 49 : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) 50 : 51 : "cc", "q0", "q1", "q2", "q3", "q4"); 55 52 return diff; 56 53 } … … 58 55 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { 59 56 uint32 sse; 60 asm volatile 61 "vmov.u8 q8, #0 \n"62 "vmov.u8 q10, #0 \n"63 "vmov.u8 q9, #0 \n"64 "vmov.u8 q11, #0 \n"57 asm volatile( 58 "vmov.u8 q8, #0 \n" 59 "vmov.u8 q10, #0 \n" 60 "vmov.u8 q9, #0 \n" 61 "vmov.u8 q11, #0 \n" 65 62 66 "1:\n"67 "vld1.8 {q0}, [%0]! \n"68 "vld1.8 {q1}, [%1]! \n"69 "subs %2, %2, #16 \n"70 "vsubl.u8 q2, d0, d2 \n"71 "vsubl.u8 q3, d1, d3 \n"72 "vmlal.s16 q8, d4, d4 \n"73 "vmlal.s16 q9, d6, d6 \n"74 "vmlal.s16 q10, d5, d5 \n"75 "vmlal.s16 q11, d7, d7 \n"76 "bgt 1b \n"63 "1: \n" 64 "vld1.8 {q0}, [%0]! \n" 65 "vld1.8 {q1}, [%1]! \n" 66 "subs %2, %2, #16 \n" 67 "vsubl.u8 q2, d0, d2 \n" 68 "vsubl.u8 q3, d1, d3 \n" 69 "vmlal.s16 q8, d4, d4 \n" 70 "vmlal.s16 q9, d6, d6 \n" 71 "vmlal.s16 q10, d5, d5 \n" 72 "vmlal.s16 q11, d7, d7 \n" 73 "bgt 1b \n" 77 74 78 "vadd.u32 q8, q8, q9 \n" 79 "vadd.u32 q10, q10, q11 \n" 80 "vadd.u32 q11, q8, q10 \n" 81 "vpaddl.u32 q1, q11 \n" 82 "vadd.u64 d0, d2, d3 \n" 83 "vmov.32 %3, d0[0] \n" 84 : "+r"(src_a), 85 "+r"(src_b), 86 "+r"(count), 87 "=r"(sse) 88 : 89 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); 75 "vadd.u32 q8, q8, q9 \n" 76 "vadd.u32 q10, q10, q11 \n" 77 "vadd.u32 q11, q8, q10 \n" 78 "vpaddl.u32 q1, q11 \n" 79 "vadd.u64 d0, d2, d3 \n" 80 "vmov.32 %3, d0[0] \n" 81 : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) 82 : 83 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); 90 84 return sse; 91 85 } -
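Note: the SSSE3/AVX2 kernels added in compare_gcc.cc above count bits without a wide popcount instruction: kBitCount is a 16-entry table of per-nibble popcounts, pshufb/vpshufb looks up both nibbles of every XORed byte, and psadbw sums the byte counts; the NEON code here gets the same effect directly from vcnt.i8. The table trick in scalar form (a sketch of the technique, not the library's code):

  #include <stdint.h>

  // Per-nibble popcount table, same values as kBitCount above.
  static const uint8_t kNibblePopcount[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                              1, 2, 2, 3, 2, 3, 3, 4};

  static uint32_t HammingDistanceTableSketch(const uint8_t* a, const uint8_t* b, int count) {
    uint32_t diff = 0;
    for (int i = 0; i < count; ++i) {
      uint8_t x = (uint8_t)(a[i] ^ b[i]);
      diff += kNibblePopcount[x & 15] + kNibblePopcount[x >> 4];  // low + high nibble
    }
    return diff;
  }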
pjproject/trunk/third_party/yuv/source/compare_neon64.cc
r5633 r5699 25 25 uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) { 26 26 uint32 diff; 27 asm volatile 28 "movi v4.8h, #0 \n"27 asm volatile( 28 "movi v4.8h, #0 \n" 29 29 30 "1:\n"31 "ld1 {v0.16b, v1.16b}, [%0], #32 \n"32 "ld1 {v2.16b, v3.16b}, [%1], #32 \n"33 "eor v0.16b, v0.16b, v2.16b \n"34 "eor v1.16b, v1.16b, v3.16b \n"35 "cnt v0.16b, v0.16b \n"36 "cnt v1.16b, v1.16b \n"37 "subs %w2, %w2, #32 \n"38 "add v0.16b, v0.16b, v1.16b \n"39 "uadalp v4.8h, v0.16b \n"40 "b.gt 1b \n"30 "1: \n" 31 "ld1 {v0.16b, v1.16b}, [%0], #32 \n" 32 "ld1 {v2.16b, v3.16b}, [%1], #32 \n" 33 "eor v0.16b, v0.16b, v2.16b \n" 34 "eor v1.16b, v1.16b, v3.16b \n" 35 "cnt v0.16b, v0.16b \n" 36 "cnt v1.16b, v1.16b \n" 37 "subs %w2, %w2, #32 \n" 38 "add v0.16b, v0.16b, v1.16b \n" 39 "uadalp v4.8h, v0.16b \n" 40 "b.gt 1b \n" 41 41 42 "uaddlv s4, v4.8h \n" 43 "fmov %w3, s4 \n" 44 : "+r"(src_a), 45 "+r"(src_b), 46 "+r"(count), 47 "=r"(diff) 48 : 49 : "cc", "v0", "v1", "v2", "v3", "v4"); 42 "uaddlv s4, v4.8h \n" 43 "fmov %w3, s4 \n" 44 : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) 45 : 46 : "cc", "v0", "v1", "v2", "v3", "v4"); 50 47 return diff; 51 48 } … … 53 50 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { 54 51 uint32 sse; 55 asm volatile 56 "eor v16.16b, v16.16b, v16.16b \n"57 "eor v18.16b, v18.16b, v18.16b \n"58 "eor v17.16b, v17.16b, v17.16b \n"59 "eor v19.16b, v19.16b, v19.16b \n"52 asm volatile( 53 "eor v16.16b, v16.16b, v16.16b \n" 54 "eor v18.16b, v18.16b, v18.16b \n" 55 "eor v17.16b, v17.16b, v17.16b \n" 56 "eor v19.16b, v19.16b, v19.16b \n" 60 57 61 "1:\n"62 "ld1 {v0.16b}, [%0], #16 \n"63 "ld1 {v1.16b}, [%1], #16 \n"64 "subs %w2, %w2, #16 \n"65 "usubl v2.8h, v0.8b, v1.8b \n"66 "usubl2 v3.8h, v0.16b, v1.16b \n"67 "smlal v16.4s, v2.4h, v2.4h \n"68 "smlal v17.4s, v3.4h, v3.4h \n"69 "smlal2 v18.4s, v2.8h, v2.8h \n"70 "smlal2 v19.4s, v3.8h, v3.8h \n"71 "b.gt 1b \n"58 "1: \n" 59 "ld1 {v0.16b}, [%0], #16 \n" 60 "ld1 {v1.16b}, [%1], #16 \n" 61 "subs %w2, %w2, #16 \n" 62 "usubl v2.8h, v0.8b, v1.8b \n" 63 "usubl2 v3.8h, v0.16b, v1.16b \n" 64 "smlal v16.4s, v2.4h, v2.4h \n" 65 "smlal v17.4s, v3.4h, v3.4h \n" 66 "smlal2 v18.4s, v2.8h, v2.8h \n" 67 "smlal2 v19.4s, v3.8h, v3.8h \n" 68 "b.gt 1b \n" 72 69 73 "add v16.4s, v16.4s, v17.4s \n" 74 "add v18.4s, v18.4s, v19.4s \n" 75 "add v19.4s, v16.4s, v18.4s \n" 76 "addv s0, v19.4s \n" 77 "fmov %w3, s0 \n" 78 : "+r"(src_a), 79 "+r"(src_b), 80 "+r"(count), 81 "=r"(sse) 82 : 83 : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); 70 "add v16.4s, v16.4s, v17.4s \n" 71 "add v18.4s, v18.4s, v19.4s \n" 72 "add v19.4s, v16.4s, v18.4s \n" 73 "addv s0, v19.4s \n" 74 "fmov %w3, s0 \n" 75 : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) 76 : 77 : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); 84 78 return sse; 85 79 } -
pjproject/trunk/third_party/yuv/source/compare_win.cc
r5358 r5699 10 10 11 11 #include "libyuv/basic_types.h" 12 13 #include "libyuv/compare_row.h" 12 14 #include "libyuv/row.h" 15 16 #if defined(_MSC_VER) 17 #include <intrin.h> // For __popcnt 18 #endif 13 19 14 20 #ifdef __cplusplus … … 17 23 #endif 18 24 25 // This module is for 32 bit Visual C x86 and clangcl 19 26 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 20 #if (_MSC_VER >= 1900) 21 __declspec(naked) 22 #else 23 __declspec(naked) __declspec(align(16)) 24 #endif 25 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { 26 __asm { 27 mov eax, [esp + 4] // src_a 28 mov edx, [esp + 8] // src_b 29 mov ecx, [esp + 12] // count 27 28 uint32 HammingDistance_SSE42(const uint8* src_a, 29 const uint8* src_b, 30 int count) { 31 uint32 diff = 0u; 32 33 int i; 34 for (i = 0; i < count - 3; i += 4) { 35 uint32 x = *((uint32*)src_a) ^ *((uint32*)src_b); 36 src_a += 4; 37 src_b += 4; 38 diff += __popcnt(x); 39 } 40 return diff; 41 } 42 43 __declspec(naked) uint32 44 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { 45 __asm { 46 mov eax, [esp + 4] // src_a 47 mov edx, [esp + 8] // src_b 48 mov ecx, [esp + 12] // count 30 49 pxor xmm0, xmm0 31 50 pxor xmm5, xmm5 32 51 33 align 4 34 wloop: 35 movdqa xmm1, [eax] 52 wloop: 53 movdqu xmm1, [eax] 36 54 lea eax, [eax + 16] 37 movdq axmm2, [edx]55 movdqu xmm2, [edx] 38 56 lea edx, [edx + 16] 39 sub ecx, 1640 57 movdqa xmm3, xmm1 // abs trick 41 58 psubusb xmm1, xmm2 … … 49 66 paddd xmm0, xmm1 50 67 paddd xmm0, xmm2 68 sub ecx, 16 51 69 jg wloop 52 70 … … 63 81 #if _MSC_VER >= 1700 64 82 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. 65 #pragma warning(disable: 4752) 66 #if (_MSC_VER >= 1900) 67 __declspec(naked) 68 #else 69 __declspec(naked) __declspec(align(16)) 70 #endif 71 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { 72 __asm { 73 mov eax, [esp + 4] // src_a 74 mov edx, [esp + 8] // src_b 75 mov ecx, [esp + 12] // count 83 #pragma warning(disable : 4752) 84 __declspec(naked) uint32 85 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { 86 __asm { 87 mov eax, [esp + 4] // src_a 88 mov edx, [esp + 8] // src_b 89 mov ecx, [esp + 12] // count 76 90 vpxor ymm0, ymm0, ymm0 // sum 77 91 vpxor ymm5, ymm5, ymm5 // constant 0 for unpck 78 92 sub edx, eax 79 93 80 align 481 94 wloop: 82 95 vmovdqu ymm1, [eax] 83 96 vmovdqu ymm2, [eax + edx] 84 97 lea eax, [eax + 32] 85 sub ecx, 3286 98 vpsubusb ymm3, ymm1, ymm2 // abs difference trick 87 99 vpsubusb ymm2, ymm2, ymm1 … … 93 105 vpaddd ymm0, ymm0, ymm1 94 106 vpaddd ymm0, ymm0, ymm2 107 sub ecx, 32 95 108 jg wloop 96 109 … … 108 121 #endif // _MSC_VER >= 1700 109 122 110 #define HAS_HASHDJB2_SSE41 111 static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 112 static uvec32 kHashMul0 = { 113 0x0c3525e1, // 33 ^ 15 114 0xa3476dc1, // 33 ^ 14 115 0x3b4039a1, // 33 ^ 13 116 0x4f5f0981, // 33 ^ 12 117 }; 118 static uvec32 kHashMul1 = { 119 0x30f35d61, // 33 ^ 11 120 0x855cb541, // 33 ^ 10 121 0x040a9121, // 33 ^ 9 122 0x747c7101, // 33 ^ 8 123 }; 124 static uvec32 kHashMul2 = { 125 0xec41d4e1, // 33 ^ 7 126 0x4cfa3cc1, // 33 ^ 6 127 0x025528a1, // 33 ^ 5 128 0x00121881, // 33 ^ 4 129 }; 130 static uvec32 kHashMul3 = { 131 0x00008c61, // 33 ^ 3 132 0x00000441, // 33 ^ 2 133 0x00000021, // 33 ^ 1 134 0x00000001, // 33 ^ 0 135 }; 136 137 // 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 138 // 44: 66 0F 38 40 DD pmulld xmm3,xmm5 139 // 59: 66 0F 38 40 E5 pmulld 
xmm4,xmm5 140 // 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 141 // 83: 66 0F 38 40 CD pmulld xmm1,xmm5 142 #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ 143 _asm _emit 0x40 _asm _emit reg 144 145 #if (_MSC_VER >= 1900) 146 __declspec(naked) 147 #else 148 __declspec(naked) __declspec(align(16)) 149 #endif 150 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { 151 __asm { 152 mov eax, [esp + 4] // src 153 mov ecx, [esp + 8] // count 123 uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 124 uvec32 kHashMul0 = { 125 0x0c3525e1, // 33 ^ 15 126 0xa3476dc1, // 33 ^ 14 127 0x3b4039a1, // 33 ^ 13 128 0x4f5f0981, // 33 ^ 12 129 }; 130 uvec32 kHashMul1 = { 131 0x30f35d61, // 33 ^ 11 132 0x855cb541, // 33 ^ 10 133 0x040a9121, // 33 ^ 9 134 0x747c7101, // 33 ^ 8 135 }; 136 uvec32 kHashMul2 = { 137 0xec41d4e1, // 33 ^ 7 138 0x4cfa3cc1, // 33 ^ 6 139 0x025528a1, // 33 ^ 5 140 0x00121881, // 33 ^ 4 141 }; 142 uvec32 kHashMul3 = { 143 0x00008c61, // 33 ^ 3 144 0x00000441, // 33 ^ 2 145 0x00000021, // 33 ^ 1 146 0x00000001, // 33 ^ 0 147 }; 148 149 __declspec(naked) uint32 150 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { 151 __asm { 152 mov eax, [esp + 4] // src 153 mov ecx, [esp + 8] // count 154 154 movd xmm0, [esp + 12] // seed 155 155 156 pxor xmm7, xmm7 // constant 0 for unpck 157 movdqa xmm6, kHash16x33 158 159 align 4 160 wloop: 161 movdqu xmm1, [eax] // src[0-15] 156 pxor xmm7, xmm7 // constant 0 for unpck 157 movdqa xmm6, xmmword ptr kHash16x33 158 159 wloop: 160 movdqu xmm1, [eax] // src[0-15] 162 161 lea eax, [eax + 16] 163 pmulld (0xc6) // pmulld xmm0,xmm6hash *= 33 ^ 16164 movdqa xmm5, kHashMul0162 pmulld xmm0, xmm6 // hash *= 33 ^ 16 163 movdqa xmm5, xmmword ptr kHashMul0 165 164 movdqa xmm2, xmm1 166 punpcklbw xmm2, xmm7 165 punpcklbw xmm2, xmm7 // src[0-7] 167 166 movdqa xmm3, xmm2 168 punpcklwd xmm3, xmm7 169 pmulld (0xdd) // pmulldxmm3, xmm5170 movdqa xmm5, kHashMul1167 punpcklwd xmm3, xmm7 // src[0-3] 168 pmulld xmm3, xmm5 169 movdqa xmm5, xmmword ptr kHashMul1 171 170 movdqa xmm4, xmm2 172 punpckhwd xmm4, xmm7 173 pmulld (0xe5) // pmulldxmm4, xmm5174 movdqa xmm5, kHashMul2175 punpckhbw xmm1, xmm7 171 punpckhwd xmm4, xmm7 // src[4-7] 172 pmulld xmm4, xmm5 173 movdqa xmm5, xmmword ptr kHashMul2 174 punpckhbw xmm1, xmm7 // src[8-15] 176 175 movdqa xmm2, xmm1 177 punpcklwd xmm2, xmm7 178 pmulld (0xd5) // pmulldxmm2, xmm5179 movdqa xmm5, kHashMul3180 punpckhwd xmm1, xmm7 181 pmulld (0xcd) // pmulldxmm1, xmm5182 paddd xmm3, xmm4 176 punpcklwd xmm2, xmm7 // src[8-11] 177 pmulld xmm2, xmm5 178 movdqa xmm5, xmmword ptr kHashMul3 179 punpckhwd xmm1, xmm7 // src[12-15] 180 pmulld xmm1, xmm5 181 paddd xmm3, xmm4 // add 16 results 183 182 paddd xmm1, xmm2 184 sub ecx, 16185 183 paddd xmm1, xmm3 186 184 … … 190 188 paddd xmm1, xmm2 191 189 paddd xmm0, xmm1 192 jg wloop 193 194 movd eax, xmm0 // return hash 190 sub ecx, 16 191 jg wloop 192 193 movd eax, xmm0 // return hash 195 194 ret 196 195 } … … 199 198 // Visual C 2012 required for AVX2. 
200 199 #if _MSC_VER >= 1700 201 #if (_MSC_VER >= 1900) 202 __declspec(naked) 203 #else 204 __declspec(naked) __declspec(align(16)) 205 #endif 206 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { 207 __asm { 208 mov eax, [esp + 4] // src 209 mov ecx, [esp + 8] // count 210 movd xmm0, [esp + 12] // seed 211 movdqa xmm6, kHash16x33 212 213 align 4 214 wloop: 215 vpmovzxbd xmm3, dword ptr [eax] // src[0-3] 216 pmulld xmm0, xmm6 // hash *= 33 ^ 16 217 vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7] 218 pmulld xmm3, kHashMul0 219 vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11] 220 pmulld xmm4, kHashMul1 221 vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15] 222 pmulld xmm2, kHashMul2 200 __declspec(naked) uint32 201 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { 202 __asm { 203 mov eax, [esp + 4] // src 204 mov ecx, [esp + 8] // count 205 vmovd xmm0, [esp + 12] // seed 206 207 wloop: 208 vpmovzxbd xmm3, [eax] // src[0-3] 209 vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16 210 vpmovzxbd xmm4, [eax + 4] // src[4-7] 211 vpmulld xmm3, xmm3, xmmword ptr kHashMul0 212 vpmovzxbd xmm2, [eax + 8] // src[8-11] 213 vpmulld xmm4, xmm4, xmmword ptr kHashMul1 214 vpmovzxbd xmm1, [eax + 12] // src[12-15] 215 vpmulld xmm2, xmm2, xmmword ptr kHashMul2 223 216 lea eax, [eax + 16] 224 pmulld xmm1, kHashMul3 225 paddd xmm3, xmm4 // add 16 results 226 paddd xmm1, xmm2 217 vpmulld xmm1, xmm1, xmmword ptr kHashMul3 218 vpaddd xmm3, xmm3, xmm4 // add 16 results 219 vpaddd xmm1, xmm1, xmm2 220 vpaddd xmm1, xmm1, xmm3 221 vpshufd xmm2, xmm1, 0x0e // upper 2 dwords 222 vpaddd xmm1, xmm1,xmm2 223 vpshufd xmm2, xmm1, 0x01 224 vpaddd xmm1, xmm1, xmm2 225 vpaddd xmm0, xmm0, xmm1 227 226 sub ecx, 16 228 paddd xmm1, xmm3 229 pshufd xmm2, xmm1, 0x0e // upper 2 dwords 230 paddd xmm1, xmm2 231 pshufd xmm2, xmm1, 0x01 232 paddd xmm1, xmm2 233 paddd xmm0, xmm1 234 jg wloop 235 236 movd eax, xmm0 // return hash 227 jg wloop 228 229 vmovd eax, xmm0 // return hash 230 vzeroupper 237 231 ret 238 232 } … … 240 234 #endif // _MSC_VER >= 1700 241 235 242 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)236 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) 243 237 244 238 #ifdef __cplusplus -
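Note: compare_win.cc can now be built as shipped: the hand-emitted pmulld byte sequences are replaced by real mnemonics, the hash constants become plain globals referenced via xmmword ptr, and a __popcnt-based HammingDistance_SSE42 is added for MSVC. The kHashMul tables hold powers of 33 because the underlying hash is the djb2-style recurrence hash = hash * 33 + byte (how the scalar HashDjb2_C is defined); processing 16 bytes per iteration therefore multiplies the running hash by 33^16 (kHash16x33) and each byte by 33 raised to its distance from the end. A scalar sketch of the reference recurrence, for orientation only:

  #include <stdint.h>

  // Reference djb2-style hash; libyuv conventionally seeds with 5381.
  static uint32_t HashDjb2Sketch(const uint8_t* src, int count, uint32_t seed) {
    uint32_t hash = seed;
    for (int i = 0; i < count; ++i) {
      hash = hash * 33u + src[i];  // the SSE4.1/AVX2 paths do 16 of these steps at once
    }
    return hash;
  }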
pjproject/trunk/third_party/yuv/source/convert_from.cc
r5633 r5699 658 658 } 659 659 660 // Convert H420 to RGB24. 661 LIBYUV_API 662 int H420ToRGB24(const uint8* src_y, 663 int src_stride_y, 664 const uint8* src_u, 665 int src_stride_u, 666 const uint8* src_v, 667 int src_stride_v, 668 uint8* dst_rgb24, 669 int dst_stride_rgb24, 670 int width, 671 int height) { 672 return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, 673 src_stride_v, dst_rgb24, dst_stride_rgb24, 674 &kYuvH709Constants, width, height); 675 } 676 677 // Convert H420 to RAW. 678 LIBYUV_API 679 int H420ToRAW(const uint8* src_y, 680 int src_stride_y, 681 const uint8* src_u, 682 int src_stride_u, 683 const uint8* src_v, 684 int src_stride_v, 685 uint8* dst_raw, 686 int dst_stride_raw, 687 int width, 688 int height) { 689 return I420ToRGB24Matrix(src_y, src_stride_y, src_v, 690 src_stride_v, // Swap U and V 691 src_u, src_stride_u, dst_raw, dst_stride_raw, 692 &kYvuH709Constants, // Use Yvu matrix 693 width, height); 694 } 695 660 696 // Convert I420 to ARGB1555. 661 697 LIBYUV_API … … 1076 1112 I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); 1077 1113 ARGBToRGB565DitherRow(row_argb, dst_rgb565, 1078 *(uint32*)(dither4x4 + ((y & 3) << 2)), 1079 width); // NOLINT1114 *(uint32*)(dither4x4 + ((y & 3) << 2)), // NOLINT 1115 width); // NOLINT 1080 1116 dst_rgb565 += dst_stride_rgb565; 1081 1117 src_y += src_stride_y; -
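Note: H420ToRGB24 and H420ToRAW are thin wrappers over I420ToRGB24Matrix: the H420 variants select the BT.709 coefficient table (kYuvH709Constants), and the RAW variant additionally swaps the U and V planes and uses the YVU table so the same kernel emits the reversed byte order. A hedged usage sketch (plane pointers, strides and names are illustrative, supplied by the caller):

  #include "libyuv/convert_from.h"

  int ConvertH420(const uint8* y, int y_stride,
                  const uint8* u, const uint8* v, int uv_stride,
                  uint8* rgb24, int width, int height) {
    // BT.709 YUV -> packed RGB24, 3 bytes per output pixel.
    return libyuv::H420ToRGB24(y, y_stride, u, uv_stride, v, uv_stride,
                               rgb24, width * 3, width, height);
  }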
pjproject/trunk/third_party/yuv/source/cpu_id.cc
r5633 r5699 125 125 int xcr0 = 0; 126 126 #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) 127 xcr0 = _xgetbv(0); // VS2010 SP1 required.127 xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT 128 128 #elif defined(__i386__) || defined(__x86_64__) 129 129 asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); … … 243 243 // Detect AVX512bw 244 244 if ((GetXCR0() & 0xe0) == 0xe0) { 245 cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0; 246 } 247 } 248 245 cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; 246 cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; 247 cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; 248 cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; 249 cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; 250 cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; 251 cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; 252 } 253 } 254 255 // TODO(fbarchard): Consider moving these to gtest 249 256 // Environment variable overrides for testing. 250 257 if (TestEnv("LIBYUV_DISABLE_X86")) { … … 275 282 cpu_info &= ~kCpuHasFMA3; 276 283 } 277 if (TestEnv("LIBYUV_DISABLE_AVX3")) {278 cpu_info &= ~kCpuHasAVX3;279 }280 284 if (TestEnv("LIBYUV_DISABLE_F16C")) { 281 285 cpu_info &= ~kCpuHasF16C; 286 } 287 if (TestEnv("LIBYUV_DISABLE_AVX512BW")) { 288 cpu_info &= ~kCpuHasAVX512BW; 282 289 } 283 290 -
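Note: cpu_id.cc now decodes the individual AVX-512 sub-feature bits from cpuid leaf 7, but only when xgetbv reports the OS saves ZMM state (the (xcr0 & 0xe0) == 0xe0 test), and each new flag gets a matching LIBYUV_DISABLE_* environment override for testing. A hedged example of using the override on a POSIX system; the assumption is that it runs before the first TestCpuFlag call, since the detected flags are cached:

  #include <stdlib.h>
  #include "libyuv/cpu_id.h"

  void ForceNoAvx512ForTesting() {
    // Honoured when the CPU flags are first initialised.
    setenv("LIBYUV_DISABLE_AVX512BW", "1", 1);
    // Subsequent TestCpuFlag(libyuv::kCpuHasAVX512BW) calls then return 0.
  }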
pjproject/trunk/third_party/yuv/source/mjpeg_decoder.cc
--- r5633
+++ r5699
  #ifdef HAVE_JPEG
  #include <assert.h>
-
- #ifdef __cplusplus
- #include <new>
- #endif

  #if !defined(__pnacl__) && !defined(__CLR_VER) && \
pjproject/trunk/third_party/yuv/source/mjpeg_validate.cc
--- r5633
+++ r5699
  while (it < end) {
    // TODO(fbarchard): scan for 0xd9 instead.
-   it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
+   it = (const uint8*)(memchr(it, 0xff, end - it));
    if (it == NULL) {
      break;
pjproject/trunk/third_party/yuv/source/planar_functions.cc
r5633 r5699 322 322 } 323 323 #endif 324 #if defined(HAS_SPLITUVROW_MSA) 325 if (TestCpuFlag(kCpuHasMSA)) { 326 SplitUVRow = SplitUVRow_Any_MSA; 327 if (IS_ALIGNED(width, 32)) { 328 SplitUVRow = SplitUVRow_MSA; 329 } 330 } 331 #endif 324 332 325 333 for (y = 0; y < height; ++y) { … … 397 405 src_v += src_stride_v; 398 406 dst_uv += dst_stride_uv; 407 } 408 } 409 410 // Support function for NV12 etc RGB channels. 411 // Width and height are plane sizes (typically half pixel width). 412 LIBYUV_API 413 void SplitRGBPlane(const uint8* src_rgb, 414 int src_stride_rgb, 415 uint8* dst_r, 416 int dst_stride_r, 417 uint8* dst_g, 418 int dst_stride_g, 419 uint8* dst_b, 420 int dst_stride_b, 421 int width, 422 int height) { 423 int y; 424 void (*SplitRGBRow)(const uint8* src_rgb, uint8* dst_r, uint8* dst_g, 425 uint8* dst_b, int width) = SplitRGBRow_C; 426 // Negative height means invert the image. 427 if (height < 0) { 428 height = -height; 429 dst_r = dst_r + (height - 1) * dst_stride_r; 430 dst_g = dst_g + (height - 1) * dst_stride_g; 431 dst_b = dst_b + (height - 1) * dst_stride_b; 432 dst_stride_r = -dst_stride_r; 433 dst_stride_g = -dst_stride_g; 434 dst_stride_b = -dst_stride_b; 435 } 436 // Coalesce rows. 437 if (src_stride_rgb == width * 3 && dst_stride_r == width && 438 dst_stride_g == width && dst_stride_b == width) { 439 width *= height; 440 height = 1; 441 src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0; 442 } 443 #if defined(HAS_SPLITRGBROW_SSSE3) 444 if (TestCpuFlag(kCpuHasSSSE3)) { 445 SplitRGBRow = SplitRGBRow_Any_SSSE3; 446 if (IS_ALIGNED(width, 16)) { 447 SplitRGBRow = SplitRGBRow_SSSE3; 448 } 449 } 450 #endif 451 #if defined(HAS_SPLITRGBROW_NEON) 452 if (TestCpuFlag(kCpuHasNEON)) { 453 SplitRGBRow = SplitRGBRow_Any_NEON; 454 if (IS_ALIGNED(width, 16)) { 455 SplitRGBRow = SplitRGBRow_NEON; 456 } 457 } 458 #endif 459 460 for (y = 0; y < height; ++y) { 461 // Copy a row of RGB. 462 SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width); 463 dst_r += dst_stride_r; 464 dst_g += dst_stride_g; 465 dst_b += dst_stride_b; 466 src_rgb += src_stride_rgb; 467 } 468 } 469 470 LIBYUV_API 471 void MergeRGBPlane(const uint8* src_r, 472 int src_stride_r, 473 const uint8* src_g, 474 int src_stride_g, 475 const uint8* src_b, 476 int src_stride_b, 477 uint8* dst_rgb, 478 int dst_stride_rgb, 479 int width, 480 int height) { 481 int y; 482 void (*MergeRGBRow)(const uint8* src_r, const uint8* src_g, 483 const uint8* src_b, uint8* dst_rgb, int width) = 484 MergeRGBRow_C; 485 // Coalesce rows. 486 // Negative height means invert the image. 487 if (height < 0) { 488 height = -height; 489 dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; 490 dst_stride_rgb = -dst_stride_rgb; 491 } 492 // Coalesce rows. 493 if (src_stride_r == width && src_stride_g == width && src_stride_b == width && 494 dst_stride_rgb == width * 3) { 495 width *= height; 496 height = 1; 497 src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0; 498 } 499 #if defined(HAS_MERGERGBROW_SSSE3) 500 if (TestCpuFlag(kCpuHasSSSE3)) { 501 MergeRGBRow = MergeRGBRow_Any_SSSE3; 502 if (IS_ALIGNED(width, 16)) { 503 MergeRGBRow = MergeRGBRow_SSSE3; 504 } 505 } 506 #endif 507 #if defined(HAS_MERGERGBROW_NEON) 508 if (TestCpuFlag(kCpuHasNEON)) { 509 MergeRGBRow = MergeRGBRow_Any_NEON; 510 if (IS_ALIGNED(width, 16)) { 511 MergeRGBRow = MergeRGBRow_NEON; 512 } 513 } 514 #endif 515 516 for (y = 0; y < height; ++y) { 517 // Merge a row of U and V into a row of RGB. 
518 MergeRGBRow(src_r, src_g, src_b, dst_rgb, width); 519 src_r += src_stride_r; 520 src_g += src_stride_g; 521 src_b += src_stride_b; 522 dst_rgb += dst_stride_rgb; 399 523 } 400 524 } … … 845 969 if (TestCpuFlag(kCpuHasNEON)) { 846 970 ARGBBlendRow = ARGBBlendRow_NEON; 971 } 972 #endif 973 #if defined(HAS_ARGBBLENDROW_MSA) 974 if (TestCpuFlag(kCpuHasMSA)) { 975 ARGBBlendRow = ARGBBlendRow_MSA; 847 976 } 848 977 #endif … … 1575 1704 } 1576 1705 #endif 1706 #if defined(HAS_SETROW_MSA) 1707 if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) { 1708 SetRow = SetRow_MSA; 1709 } 1710 #endif 1577 1711 1578 1712 // Set plane … … 1975 2109 } 1976 2110 #endif 2111 #if defined(HAS_ARGBCOLORMATRIXROW_MSA) 2112 if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { 2113 ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; 2114 } 2115 #endif 1977 2116 for (y = 0; y < height; ++y) { 1978 2117 ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); … … 2133 2272 if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { 2134 2273 ARGBQuantizeRow = ARGBQuantizeRow_NEON; 2274 } 2275 #endif 2276 #if defined(HAS_ARGBQUANTIZEROW_MSA) 2277 if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { 2278 ARGBQuantizeRow = ARGBQuantizeRow_MSA; 2135 2279 } 2136 2280 #endif … … 2620 2764 } 2621 2765 #endif 2766 #if defined(HAS_SOBELYROW_MSA) 2767 if (TestCpuFlag(kCpuHasMSA)) { 2768 SobelYRow = SobelYRow_MSA; 2769 } 2770 #endif 2622 2771 #if defined(HAS_SOBELXROW_SSE2) 2623 2772 if (TestCpuFlag(kCpuHasSSE2)) { … … 2628 2777 if (TestCpuFlag(kCpuHasNEON)) { 2629 2778 SobelXRow = SobelXRow_NEON; 2779 } 2780 #endif 2781 #if defined(HAS_SOBELXROW_MSA) 2782 if (TestCpuFlag(kCpuHasMSA)) { 2783 SobelXRow = SobelXRow_MSA; 2630 2784 } 2631 2785 #endif … … 2904 3058 } 2905 3059 #endif 3060 #if defined(HAS_HALFFLOATROW_MSA) 3061 if (TestCpuFlag(kCpuHasMSA)) { 3062 HalfFloatRow = HalfFloatRow_Any_MSA; 3063 if (IS_ALIGNED(width, 32)) { 3064 HalfFloatRow = HalfFloatRow_MSA; 3065 } 3066 } 3067 #endif 2906 3068 2907 3069 for (y = 0; y < height; ++y) { … … 3047 3209 ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON 3048 3210 : ARGBExtractAlphaRow_Any_NEON; 3211 } 3212 #endif 3213 #if defined(HAS_ARGBEXTRACTALPHAROW_MSA) 3214 if (TestCpuFlag(kCpuHasMSA)) { 3215 ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA 3216 : ARGBExtractAlphaRow_Any_MSA; 3049 3217 } 3050 3218 #endif … … 3158 3326 if (IS_ALIGNED(width, 16)) { 3159 3327 SplitUVRow = SplitUVRow_NEON; 3328 } 3329 } 3330 #endif 3331 #if defined(HAS_SPLITUVROW_MSA) 3332 if (TestCpuFlag(kCpuHasMSA)) { 3333 SplitUVRow = SplitUVRow_Any_MSA; 3334 if (IS_ALIGNED(width, 32)) { 3335 SplitUVRow = SplitUVRow_MSA; 3160 3336 } 3161 3337 } … … 3269 3445 } 3270 3446 #endif 3447 #if defined(HAS_SPLITUVROW_MSA) 3448 if (TestCpuFlag(kCpuHasMSA)) { 3449 SplitUVRow = SplitUVRow_Any_MSA; 3450 if (IS_ALIGNED(width, 32)) { 3451 SplitUVRow = SplitUVRow_MSA; 3452 } 3453 } 3454 #endif 3271 3455 #if defined(HAS_INTERPOLATEROW_SSSE3) 3272 3456 if (TestCpuFlag(kCpuHasSSSE3)) { -
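Note: the SplitRGBPlane/MergeRGBPlane implementations above follow the same pattern as the existing UV helpers: handle negative height by walking the destination bottom-up, coalesce rows into one long row when every stride equals the width so the row kernel runs once, then pick the widest row function the CPU supports (SSSE3 or NEON) and fall back to SplitRGBRow_C/MergeRGBRow_C. A minimal sketch of what the C row kernel has to do, assuming the packed R,G,B byte layout:

  #include <stdint.h>

  // De-interleave one row of packed RGB into three planes; the merge
  // direction writes dst_rgb[3*x+0..2] from the three planes instead.
  static void SplitRGBRowSketch(const uint8_t* src_rgb, uint8_t* dst_r,
                                uint8_t* dst_g, uint8_t* dst_b, int width) {
    for (int x = 0; x < width; ++x) {
      dst_r[x] = src_rgb[3 * x + 0];
      dst_g[x] = src_rgb[3 * x + 1];
      dst_b[x] = src_rgb[3 * x + 2];
    }
  }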
pjproject/trunk/third_party/yuv/source/rotate.cc
r5633 r5699 360 360 IS_ALIGNED(src_stride, 4)) { 361 361 MirrorUVRow = MirrorUVRow_DSPR2; 362 } 363 #endif 364 #if defined(HAS_MIRRORUVROW_MSA) 365 if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { 366 MirrorUVRow = MirrorUVRow_MSA; 362 367 } 363 368 #endif -
pjproject/trunk/third_party/yuv/source/rotate_neon64.cc
r5633 r5699 31 31 int width) { 32 32 const uint8* src_temp; 33 asm volatile 34 // loops are on blocks of 8. loop will stop when35 // counter gets to or below 0. starting the counter36 // at w-8 allow for this37 "sub %w3, %w3, #8 \n"38 39 // handle 8x8 blocks. this should be the majority of the plane40 "1: \n"33 asm volatile( 34 // loops are on blocks of 8. loop will stop when 35 // counter gets to or below 0. starting the counter 36 // at w-8 allow for this 37 "sub %w3, %w3, #8 \n" 38 39 // handle 8x8 blocks. this should be the majority of the plane 40 "1: \n" 41 41 "mov %0, %1 \n" 42 42 … … 93 93 "b.ge 1b \n" 94 94 95 // add 8 back to counter. if the result is 0 there are 96 // no residuals. 97 "adds %w3, %w3, #8 \n" 98 "b.eq 4f \n" 99 100 // some residual, so between 1 and 7 lines left to transpose 101 "cmp %w3, #2 \n" 102 "b.lt 3f \n" 103 104 "cmp %w3, #4 \n" 105 "b.lt 2f \n" 106 107 // 4x8 block 108 "mov %0, %1 \n" 109 "ld1 {v0.s}[0], [%0], %5 \n" 110 "ld1 {v0.s}[1], [%0], %5 \n" 111 "ld1 {v0.s}[2], [%0], %5 \n" 112 "ld1 {v0.s}[3], [%0], %5 \n" 113 "ld1 {v1.s}[0], [%0], %5 \n" 114 "ld1 {v1.s}[1], [%0], %5 \n" 115 "ld1 {v1.s}[2], [%0], %5 \n" 116 "ld1 {v1.s}[3], [%0] \n" 117 118 "mov %0, %2 \n" 119 120 "ld1 {v2.16b}, [%4] \n" 121 122 "tbl v3.16b, {v0.16b}, v2.16b \n" 123 "tbl v0.16b, {v1.16b}, v2.16b \n" 124 125 // TODO(frkoenig): Rework shuffle above to 126 // write out with 4 instead of 8 writes. 127 "st1 {v3.s}[0], [%0], %6 \n" 128 "st1 {v3.s}[1], [%0], %6 \n" 129 "st1 {v3.s}[2], [%0], %6 \n" 130 "st1 {v3.s}[3], [%0] \n" 131 132 "add %0, %2, #4 \n" 133 "st1 {v0.s}[0], [%0], %6 \n" 134 "st1 {v0.s}[1], [%0], %6 \n" 135 "st1 {v0.s}[2], [%0], %6 \n" 136 "st1 {v0.s}[3], [%0] \n" 137 138 "add %1, %1, #4 \n" // src += 4 139 "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride 140 "subs %w3, %w3, #4 \n" // w -= 4 141 "b.eq 4f \n" 142 143 // some residual, check to see if it includes a 2x8 block, 144 // or less 145 "cmp %w3, #2 \n" 146 "b.lt 3f \n" 147 148 // 2x8 block 149 "2: \n" 150 "mov %0, %1 \n" 151 "ld1 {v0.h}[0], [%0], %5 \n" 152 "ld1 {v1.h}[0], [%0], %5 \n" 153 "ld1 {v0.h}[1], [%0], %5 \n" 154 "ld1 {v1.h}[1], [%0], %5 \n" 155 "ld1 {v0.h}[2], [%0], %5 \n" 156 "ld1 {v1.h}[2], [%0], %5 \n" 157 "ld1 {v0.h}[3], [%0], %5 \n" 158 "ld1 {v1.h}[3], [%0] \n" 159 160 "trn2 v2.8b, v0.8b, v1.8b \n" 161 "trn1 v3.8b, v0.8b, v1.8b \n" 162 163 "mov %0, %2 \n" 164 165 "st1 {v3.8b}, [%0], %6 \n" 166 "st1 {v2.8b}, [%0] \n" 167 168 "add %1, %1, #2 \n" // src += 2 169 "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride 170 "subs %w3, %w3, #2 \n" // w -= 2 171 "b.eq 4f \n" 172 173 // 1x8 block 174 "3: \n" 175 "ld1 {v0.b}[0], [%1], %5 \n" 176 "ld1 {v0.b}[1], [%1], %5 \n" 177 "ld1 {v0.b}[2], [%1], %5 \n" 178 "ld1 {v0.b}[3], [%1], %5 \n" 179 "ld1 {v0.b}[4], [%1], %5 \n" 180 "ld1 {v0.b}[5], [%1], %5 \n" 181 "ld1 {v0.b}[6], [%1], %5 \n" 182 "ld1 {v0.b}[7], [%1] \n" 183 184 "st1 {v0.8b}, [%2] \n" 185 186 "4: \n" 187 188 : "=&r"(src_temp), // %0 189 "+r"(src), // %1 190 "+r"(dst), // %2 191 "+r"(width) // %3 192 : "r"(&kVTbl4x4Transpose), // %4 193 "r"(static_cast<ptrdiff_t>(src_stride)), // %5 194 "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 195 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", 196 "v17", "v18", "v19", "v20", "v21", "v22", "v23" 197 ); 95 // add 8 back to counter. if the result is 0 there are 96 // no residuals. 
97 "adds %w3, %w3, #8 \n" 98 "b.eq 4f \n" 99 100 // some residual, so between 1 and 7 lines left to transpose 101 "cmp %w3, #2 \n" 102 "b.lt 3f \n" 103 104 "cmp %w3, #4 \n" 105 "b.lt 2f \n" 106 107 // 4x8 block 108 "mov %0, %1 \n" 109 "ld1 {v0.s}[0], [%0], %5 \n" 110 "ld1 {v0.s}[1], [%0], %5 \n" 111 "ld1 {v0.s}[2], [%0], %5 \n" 112 "ld1 {v0.s}[3], [%0], %5 \n" 113 "ld1 {v1.s}[0], [%0], %5 \n" 114 "ld1 {v1.s}[1], [%0], %5 \n" 115 "ld1 {v1.s}[2], [%0], %5 \n" 116 "ld1 {v1.s}[3], [%0] \n" 117 118 "mov %0, %2 \n" 119 120 "ld1 {v2.16b}, [%4] \n" 121 122 "tbl v3.16b, {v0.16b}, v2.16b \n" 123 "tbl v0.16b, {v1.16b}, v2.16b \n" 124 125 // TODO(frkoenig): Rework shuffle above to 126 // write out with 4 instead of 8 writes. 127 "st1 {v3.s}[0], [%0], %6 \n" 128 "st1 {v3.s}[1], [%0], %6 \n" 129 "st1 {v3.s}[2], [%0], %6 \n" 130 "st1 {v3.s}[3], [%0] \n" 131 132 "add %0, %2, #4 \n" 133 "st1 {v0.s}[0], [%0], %6 \n" 134 "st1 {v0.s}[1], [%0], %6 \n" 135 "st1 {v0.s}[2], [%0], %6 \n" 136 "st1 {v0.s}[3], [%0] \n" 137 138 "add %1, %1, #4 \n" // src += 4 139 "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride 140 "subs %w3, %w3, #4 \n" // w -= 4 141 "b.eq 4f \n" 142 143 // some residual, check to see if it includes a 2x8 block, 144 // or less 145 "cmp %w3, #2 \n" 146 "b.lt 3f \n" 147 148 // 2x8 block 149 "2: \n" 150 "mov %0, %1 \n" 151 "ld1 {v0.h}[0], [%0], %5 \n" 152 "ld1 {v1.h}[0], [%0], %5 \n" 153 "ld1 {v0.h}[1], [%0], %5 \n" 154 "ld1 {v1.h}[1], [%0], %5 \n" 155 "ld1 {v0.h}[2], [%0], %5 \n" 156 "ld1 {v1.h}[2], [%0], %5 \n" 157 "ld1 {v0.h}[3], [%0], %5 \n" 158 "ld1 {v1.h}[3], [%0] \n" 159 160 "trn2 v2.8b, v0.8b, v1.8b \n" 161 "trn1 v3.8b, v0.8b, v1.8b \n" 162 163 "mov %0, %2 \n" 164 165 "st1 {v3.8b}, [%0], %6 \n" 166 "st1 {v2.8b}, [%0] \n" 167 168 "add %1, %1, #2 \n" // src += 2 169 "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride 170 "subs %w3, %w3, #2 \n" // w -= 2 171 "b.eq 4f \n" 172 173 // 1x8 block 174 "3: \n" 175 "ld1 {v0.b}[0], [%1], %5 \n" 176 "ld1 {v0.b}[1], [%1], %5 \n" 177 "ld1 {v0.b}[2], [%1], %5 \n" 178 "ld1 {v0.b}[3], [%1], %5 \n" 179 "ld1 {v0.b}[4], [%1], %5 \n" 180 "ld1 {v0.b}[5], [%1], %5 \n" 181 "ld1 {v0.b}[6], [%1], %5 \n" 182 "ld1 {v0.b}[7], [%1] \n" 183 184 "st1 {v0.8b}, [%2] \n" 185 186 "4: \n" 187 188 : "=&r"(src_temp), // %0 189 "+r"(src), // %1 190 "+r"(dst), // %2 191 "+r"(width) // %3 192 : "r"(&kVTbl4x4Transpose), // %4 193 "r"(static_cast<ptrdiff_t>(src_stride)), // %5 194 "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 195 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", 196 "v17", "v18", "v19", "v20", "v21", "v22", "v23"); 198 197 } 199 198 … … 210 209 int width) { 211 210 const uint8* src_temp; 212 asm volatile ( 213 // loops are on blocks of 8. loop will stop when 214 // counter gets to or below 0. starting the counter 215 // at w-8 allow for this 216 "sub %w4, %w4, #8 \n" 217 218 // handle 8x8 blocks. 
this should be the majority of the plane 219 "1: \n" 220 "mov %0, %1 \n" 221 222 "ld1 {v0.16b}, [%0], %5 \n" 223 "ld1 {v1.16b}, [%0], %5 \n" 224 "ld1 {v2.16b}, [%0], %5 \n" 225 "ld1 {v3.16b}, [%0], %5 \n" 226 "ld1 {v4.16b}, [%0], %5 \n" 227 "ld1 {v5.16b}, [%0], %5 \n" 228 "ld1 {v6.16b}, [%0], %5 \n" 229 "ld1 {v7.16b}, [%0] \n" 230 231 "trn1 v16.16b, v0.16b, v1.16b \n" 232 "trn2 v17.16b, v0.16b, v1.16b \n" 233 "trn1 v18.16b, v2.16b, v3.16b \n" 234 "trn2 v19.16b, v2.16b, v3.16b \n" 235 "trn1 v20.16b, v4.16b, v5.16b \n" 236 "trn2 v21.16b, v4.16b, v5.16b \n" 237 "trn1 v22.16b, v6.16b, v7.16b \n" 238 "trn2 v23.16b, v6.16b, v7.16b \n" 239 240 "trn1 v0.8h, v16.8h, v18.8h \n" 241 "trn2 v1.8h, v16.8h, v18.8h \n" 242 "trn1 v2.8h, v20.8h, v22.8h \n" 243 "trn2 v3.8h, v20.8h, v22.8h \n" 244 "trn1 v4.8h, v17.8h, v19.8h \n" 245 "trn2 v5.8h, v17.8h, v19.8h \n" 246 "trn1 v6.8h, v21.8h, v23.8h \n" 247 "trn2 v7.8h, v21.8h, v23.8h \n" 248 249 "trn1 v16.4s, v0.4s, v2.4s \n" 250 "trn2 v17.4s, v0.4s, v2.4s \n" 251 "trn1 v18.4s, v1.4s, v3.4s \n" 252 "trn2 v19.4s, v1.4s, v3.4s \n" 253 "trn1 v20.4s, v4.4s, v6.4s \n" 254 "trn2 v21.4s, v4.4s, v6.4s \n" 255 "trn1 v22.4s, v5.4s, v7.4s \n" 256 "trn2 v23.4s, v5.4s, v7.4s \n" 257 258 "mov %0, %2 \n" 259 260 "st1 {v16.d}[0], [%0], %6 \n" 261 "st1 {v18.d}[0], [%0], %6 \n" 262 "st1 {v17.d}[0], [%0], %6 \n" 263 "st1 {v19.d}[0], [%0], %6 \n" 264 "st1 {v16.d}[1], [%0], %6 \n" 265 "st1 {v18.d}[1], [%0], %6 \n" 266 "st1 {v17.d}[1], [%0], %6 \n" 267 "st1 {v19.d}[1], [%0] \n" 268 269 "mov %0, %3 \n" 270 271 "st1 {v20.d}[0], [%0], %7 \n" 272 "st1 {v22.d}[0], [%0], %7 \n" 273 "st1 {v21.d}[0], [%0], %7 \n" 274 "st1 {v23.d}[0], [%0], %7 \n" 275 "st1 {v20.d}[1], [%0], %7 \n" 276 "st1 {v22.d}[1], [%0], %7 \n" 277 "st1 {v21.d}[1], [%0], %7 \n" 278 "st1 {v23.d}[1], [%0] \n" 279 280 "add %1, %1, #16 \n" // src += 8*2 281 "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a 282 "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b 283 "subs %w4, %w4, #8 \n" // w -= 8 284 "b.ge 1b \n" 285 286 // add 8 back to counter. if the result is 0 there are 287 // no residuals. 
288 "adds %w4, %w4, #8 \n" 289 "b.eq 4f \n" 290 291 // some residual, so between 1 and 7 lines left to transpose 292 "cmp %w4, #2 \n" 293 "b.lt 3f \n" 294 295 "cmp %w4, #4 \n" 296 "b.lt 2f \n" 297 298 // TODO(frkoenig): Clean this up 299 // 4x8 block 300 "mov %0, %1 \n" 301 "ld1 {v0.8b}, [%0], %5 \n" 302 "ld1 {v1.8b}, [%0], %5 \n" 303 "ld1 {v2.8b}, [%0], %5 \n" 304 "ld1 {v3.8b}, [%0], %5 \n" 305 "ld1 {v4.8b}, [%0], %5 \n" 306 "ld1 {v5.8b}, [%0], %5 \n" 307 "ld1 {v6.8b}, [%0], %5 \n" 308 "ld1 {v7.8b}, [%0] \n" 309 310 "ld1 {v30.16b}, [%8], #16 \n" 311 "ld1 {v31.16b}, [%8] \n" 312 313 "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" 314 "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" 315 "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" 316 "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" 317 318 "mov %0, %2 \n" 319 320 "st1 {v16.s}[0], [%0], %6 \n" 321 "st1 {v16.s}[1], [%0], %6 \n" 322 "st1 {v16.s}[2], [%0], %6 \n" 323 "st1 {v16.s}[3], [%0], %6 \n" 324 325 "add %0, %2, #4 \n" 326 "st1 {v18.s}[0], [%0], %6 \n" 327 "st1 {v18.s}[1], [%0], %6 \n" 328 "st1 {v18.s}[2], [%0], %6 \n" 329 "st1 {v18.s}[3], [%0] \n" 330 331 "mov %0, %3 \n" 332 333 "st1 {v17.s}[0], [%0], %7 \n" 334 "st1 {v17.s}[1], [%0], %7 \n" 335 "st1 {v17.s}[2], [%0], %7 \n" 336 "st1 {v17.s}[3], [%0], %7 \n" 337 338 "add %0, %3, #4 \n" 339 "st1 {v19.s}[0], [%0], %7 \n" 340 "st1 {v19.s}[1], [%0], %7 \n" 341 "st1 {v19.s}[2], [%0], %7 \n" 342 "st1 {v19.s}[3], [%0] \n" 343 344 "add %1, %1, #8 \n" // src += 4 * 2 345 "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a 346 "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b 347 "subs %w4, %w4, #4 \n" // w -= 4 348 "b.eq 4f \n" 349 350 // some residual, check to see if it includes a 2x8 block, 351 // or less 352 "cmp %w4, #2 \n" 353 "b.lt 3f \n" 354 355 // 2x8 block 356 "2: \n" 357 "mov %0, %1 \n" 358 "ld2 {v0.h, v1.h}[0], [%0], %5 \n" 359 "ld2 {v2.h, v3.h}[0], [%0], %5 \n" 360 "ld2 {v0.h, v1.h}[1], [%0], %5 \n" 361 "ld2 {v2.h, v3.h}[1], [%0], %5 \n" 362 "ld2 {v0.h, v1.h}[2], [%0], %5 \n" 363 "ld2 {v2.h, v3.h}[2], [%0], %5 \n" 364 "ld2 {v0.h, v1.h}[3], [%0], %5 \n" 365 "ld2 {v2.h, v3.h}[3], [%0] \n" 366 367 "trn1 v4.8b, v0.8b, v2.8b \n" 368 "trn2 v5.8b, v0.8b, v2.8b \n" 369 "trn1 v6.8b, v1.8b, v3.8b \n" 370 "trn2 v7.8b, v1.8b, v3.8b \n" 371 372 "mov %0, %2 \n" 373 374 "st1 {v4.d}[0], [%0], %6 \n" 375 "st1 {v6.d}[0], [%0] \n" 376 377 "mov %0, %3 \n" 378 379 "st1 {v5.d}[0], [%0], %7 \n" 380 "st1 {v7.d}[0], [%0] \n" 381 382 "add %1, %1, #4 \n" // src += 2 * 2 383 "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a 384 "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b 385 "subs %w4, %w4, #2 \n" // w -= 2 386 "b.eq 4f \n" 387 388 // 1x8 block 389 "3: \n" 390 "ld2 {v0.b, v1.b}[0], [%1], %5 \n" 391 "ld2 {v0.b, v1.b}[1], [%1], %5 \n" 392 "ld2 {v0.b, v1.b}[2], [%1], %5 \n" 393 "ld2 {v0.b, v1.b}[3], [%1], %5 \n" 394 "ld2 {v0.b, v1.b}[4], [%1], %5 \n" 395 "ld2 {v0.b, v1.b}[5], [%1], %5 \n" 396 "ld2 {v0.b, v1.b}[6], [%1], %5 \n" 397 "ld2 {v0.b, v1.b}[7], [%1] \n" 398 399 "st1 {v0.d}[0], [%2] \n" 400 "st1 {v1.d}[0], [%3] \n" 401 402 "4: \n" 403 404 : "=&r"(src_temp), // %0 405 "+r"(src), // %1 406 "+r"(dst_a), // %2 407 "+r"(dst_b), // %3 408 "+r"(width) // %4 409 : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 410 "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 411 "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 412 "r"(&kVTbl4x4TransposeDi) // %8 413 : "memory", "cc", 414 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 415 "v16", 
"v17", "v18", "v19", "v20", "v21", "v22", "v23", 416 "v30", "v31" 417 ); 211 asm volatile( 212 // loops are on blocks of 8. loop will stop when 213 // counter gets to or below 0. starting the counter 214 // at w-8 allow for this 215 "sub %w4, %w4, #8 \n" 216 217 // handle 8x8 blocks. this should be the majority of the plane 218 "1: \n" 219 "mov %0, %1 \n" 220 221 "ld1 {v0.16b}, [%0], %5 \n" 222 "ld1 {v1.16b}, [%0], %5 \n" 223 "ld1 {v2.16b}, [%0], %5 \n" 224 "ld1 {v3.16b}, [%0], %5 \n" 225 "ld1 {v4.16b}, [%0], %5 \n" 226 "ld1 {v5.16b}, [%0], %5 \n" 227 "ld1 {v6.16b}, [%0], %5 \n" 228 "ld1 {v7.16b}, [%0] \n" 229 230 "trn1 v16.16b, v0.16b, v1.16b \n" 231 "trn2 v17.16b, v0.16b, v1.16b \n" 232 "trn1 v18.16b, v2.16b, v3.16b \n" 233 "trn2 v19.16b, v2.16b, v3.16b \n" 234 "trn1 v20.16b, v4.16b, v5.16b \n" 235 "trn2 v21.16b, v4.16b, v5.16b \n" 236 "trn1 v22.16b, v6.16b, v7.16b \n" 237 "trn2 v23.16b, v6.16b, v7.16b \n" 238 239 "trn1 v0.8h, v16.8h, v18.8h \n" 240 "trn2 v1.8h, v16.8h, v18.8h \n" 241 "trn1 v2.8h, v20.8h, v22.8h \n" 242 "trn2 v3.8h, v20.8h, v22.8h \n" 243 "trn1 v4.8h, v17.8h, v19.8h \n" 244 "trn2 v5.8h, v17.8h, v19.8h \n" 245 "trn1 v6.8h, v21.8h, v23.8h \n" 246 "trn2 v7.8h, v21.8h, v23.8h \n" 247 248 "trn1 v16.4s, v0.4s, v2.4s \n" 249 "trn2 v17.4s, v0.4s, v2.4s \n" 250 "trn1 v18.4s, v1.4s, v3.4s \n" 251 "trn2 v19.4s, v1.4s, v3.4s \n" 252 "trn1 v20.4s, v4.4s, v6.4s \n" 253 "trn2 v21.4s, v4.4s, v6.4s \n" 254 "trn1 v22.4s, v5.4s, v7.4s \n" 255 "trn2 v23.4s, v5.4s, v7.4s \n" 256 257 "mov %0, %2 \n" 258 259 "st1 {v16.d}[0], [%0], %6 \n" 260 "st1 {v18.d}[0], [%0], %6 \n" 261 "st1 {v17.d}[0], [%0], %6 \n" 262 "st1 {v19.d}[0], [%0], %6 \n" 263 "st1 {v16.d}[1], [%0], %6 \n" 264 "st1 {v18.d}[1], [%0], %6 \n" 265 "st1 {v17.d}[1], [%0], %6 \n" 266 "st1 {v19.d}[1], [%0] \n" 267 268 "mov %0, %3 \n" 269 270 "st1 {v20.d}[0], [%0], %7 \n" 271 "st1 {v22.d}[0], [%0], %7 \n" 272 "st1 {v21.d}[0], [%0], %7 \n" 273 "st1 {v23.d}[0], [%0], %7 \n" 274 "st1 {v20.d}[1], [%0], %7 \n" 275 "st1 {v22.d}[1], [%0], %7 \n" 276 "st1 {v21.d}[1], [%0], %7 \n" 277 "st1 {v23.d}[1], [%0] \n" 278 279 "add %1, %1, #16 \n" // src += 8*2 280 "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * 281 // dst_stride_a 282 "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * 283 // dst_stride_b 284 "subs %w4, %w4, #8 \n" // w -= 8 285 "b.ge 1b \n" 286 287 // add 8 back to counter. if the result is 0 there are 288 // no residuals. 
289 "adds %w4, %w4, #8 \n" 290 "b.eq 4f \n" 291 292 // some residual, so between 1 and 7 lines left to transpose 293 "cmp %w4, #2 \n" 294 "b.lt 3f \n" 295 296 "cmp %w4, #4 \n" 297 "b.lt 2f \n" 298 299 // TODO(frkoenig): Clean this up 300 // 4x8 block 301 "mov %0, %1 \n" 302 "ld1 {v0.8b}, [%0], %5 \n" 303 "ld1 {v1.8b}, [%0], %5 \n" 304 "ld1 {v2.8b}, [%0], %5 \n" 305 "ld1 {v3.8b}, [%0], %5 \n" 306 "ld1 {v4.8b}, [%0], %5 \n" 307 "ld1 {v5.8b}, [%0], %5 \n" 308 "ld1 {v6.8b}, [%0], %5 \n" 309 "ld1 {v7.8b}, [%0] \n" 310 311 "ld1 {v30.16b}, [%8], #16 \n" 312 "ld1 {v31.16b}, [%8] \n" 313 314 "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" 315 "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" 316 "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" 317 "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" 318 319 "mov %0, %2 \n" 320 321 "st1 {v16.s}[0], [%0], %6 \n" 322 "st1 {v16.s}[1], [%0], %6 \n" 323 "st1 {v16.s}[2], [%0], %6 \n" 324 "st1 {v16.s}[3], [%0], %6 \n" 325 326 "add %0, %2, #4 \n" 327 "st1 {v18.s}[0], [%0], %6 \n" 328 "st1 {v18.s}[1], [%0], %6 \n" 329 "st1 {v18.s}[2], [%0], %6 \n" 330 "st1 {v18.s}[3], [%0] \n" 331 332 "mov %0, %3 \n" 333 334 "st1 {v17.s}[0], [%0], %7 \n" 335 "st1 {v17.s}[1], [%0], %7 \n" 336 "st1 {v17.s}[2], [%0], %7 \n" 337 "st1 {v17.s}[3], [%0], %7 \n" 338 339 "add %0, %3, #4 \n" 340 "st1 {v19.s}[0], [%0], %7 \n" 341 "st1 {v19.s}[1], [%0], %7 \n" 342 "st1 {v19.s}[2], [%0], %7 \n" 343 "st1 {v19.s}[3], [%0] \n" 344 345 "add %1, %1, #8 \n" // src += 4 * 2 346 "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * 347 // dst_stride_a 348 "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * 349 // dst_stride_b 350 "subs %w4, %w4, #4 \n" // w -= 4 351 "b.eq 4f \n" 352 353 // some residual, check to see if it includes a 2x8 block, 354 // or less 355 "cmp %w4, #2 \n" 356 "b.lt 3f \n" 357 358 // 2x8 block 359 "2: \n" 360 "mov %0, %1 \n" 361 "ld2 {v0.h, v1.h}[0], [%0], %5 \n" 362 "ld2 {v2.h, v3.h}[0], [%0], %5 \n" 363 "ld2 {v0.h, v1.h}[1], [%0], %5 \n" 364 "ld2 {v2.h, v3.h}[1], [%0], %5 \n" 365 "ld2 {v0.h, v1.h}[2], [%0], %5 \n" 366 "ld2 {v2.h, v3.h}[2], [%0], %5 \n" 367 "ld2 {v0.h, v1.h}[3], [%0], %5 \n" 368 "ld2 {v2.h, v3.h}[3], [%0] \n" 369 370 "trn1 v4.8b, v0.8b, v2.8b \n" 371 "trn2 v5.8b, v0.8b, v2.8b \n" 372 "trn1 v6.8b, v1.8b, v3.8b \n" 373 "trn2 v7.8b, v1.8b, v3.8b \n" 374 375 "mov %0, %2 \n" 376 377 "st1 {v4.d}[0], [%0], %6 \n" 378 "st1 {v6.d}[0], [%0] \n" 379 380 "mov %0, %3 \n" 381 382 "st1 {v5.d}[0], [%0], %7 \n" 383 "st1 {v7.d}[0], [%0] \n" 384 385 "add %1, %1, #4 \n" // src += 2 * 2 386 "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * 387 // dst_stride_a 388 "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * 389 // dst_stride_b 390 "subs %w4, %w4, #2 \n" // w -= 2 391 "b.eq 4f \n" 392 393 // 1x8 block 394 "3: \n" 395 "ld2 {v0.b, v1.b}[0], [%1], %5 \n" 396 "ld2 {v0.b, v1.b}[1], [%1], %5 \n" 397 "ld2 {v0.b, v1.b}[2], [%1], %5 \n" 398 "ld2 {v0.b, v1.b}[3], [%1], %5 \n" 399 "ld2 {v0.b, v1.b}[4], [%1], %5 \n" 400 "ld2 {v0.b, v1.b}[5], [%1], %5 \n" 401 "ld2 {v0.b, v1.b}[6], [%1], %5 \n" 402 "ld2 {v0.b, v1.b}[7], [%1] \n" 403 404 "st1 {v0.d}[0], [%2] \n" 405 "st1 {v1.d}[0], [%3] \n" 406 407 "4: \n" 408 409 : "=&r"(src_temp), // %0 410 "+r"(src), // %1 411 "+r"(dst_a), // %2 412 "+r"(dst_b), // %3 413 "+r"(width) // %4 414 : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 415 "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 416 "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 417 "r"(&kVTbl4x4TransposeDi) // %8 418 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", 
"v6", "v7", "v16", 419 "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); 418 420 } 419 421 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -
pjproject/trunk/third_party/yuv/source/rotate_win.cc
r5633 r5699 18 18 19 19 // This module is for 32 bit Visual C x86 and clangcl 20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) 20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 21 21 22 22 __declspec(naked) void TransposeWx8_SSSE3(const uint8* src, … … 173 173 lea eax, [eax + 8 * edi + 16] 174 174 neg edi 175 // Second round of bit swap.175 // Second round of bit swap. 176 176 movdqa xmm5, xmm0 177 177 punpcklwd xmm0, xmm2 … … 193 193 movdqa xmm7, xmm6 194 194 195 // Third round of bit swap.196 // Write to the destination pointer.195 // Third round of bit swap. 196 // Write to the destination pointer. 197 197 movdqa xmm6, xmm0 198 198 punpckldq xmm0, xmm4 -
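Two notes on rotate_win.cc. The guard now also requires _MSC_VER, which keeps this MSVC-style __asm module limited to compilers that accept that syntax: genuine Visual C and clang-cl (which defines _MSC_VER) still build it, while MinGW/GCC targeting 32-bit x86 skip it. The "rounds of bit swap" in the kernel are the classic interleave-based 8x8 byte transpose; a portable sketch of the result it computes follows. TransposeBlock8x8_C is an illustrative name, not a library function.
------------------------------------------------------------------------------
#include <stdint.h>

// Same result the SIMD kernels compute for one 8x8 tile:
// dst[x][y] = src[y][x], with independent source and destination strides.
static void TransposeBlock8x8_C(const uint8_t* src, int src_stride,
                                uint8_t* dst, int dst_stride) {
  int x, y;
  for (y = 0; y < 8; ++y) {    // source rows
    for (x = 0; x < 8; ++x) {  // source columns
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}
------------------------------------------------------------------------------
The SIMD versions reach the same layout in three interleave rounds (8-bit, then 16-bit, then 32-bit elements) instead of 64 scalar moves: trn1/trn2 in the AArch64 file above, punpckl*/punpckh* here.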
pjproject/trunk/third_party/yuv/source/row_any.cc
r5633 r5699 85 85 SS(r, DUVSHIFT) * BPP); \ 86 86 } 87 88 // Merge functions. 89 #ifdef HAS_MERGERGBROW_SSSE3 90 ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) 91 #endif 92 #ifdef HAS_MERGERGBROW_NEON 93 ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) 94 #endif 87 95 #ifdef HAS_I422TOYUY2ROW_SSE2 88 96 ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) … … 622 630 ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) 623 631 #endif 632 #ifdef HAS_ARGBEXTRACTALPHAROW_MSA 633 ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) 634 #endif 624 635 #undef ANY11 625 636 … … 746 757 ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 2, 2, 7) 747 758 ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 2, 2, 7) 759 #endif 760 #ifdef HAS_HALFFLOATROW_MSA 761 ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, float, 2, 2, 31) 748 762 #endif 749 763 #undef ANY11P16 … … 912 926 ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15) 913 927 #endif 928 #ifdef HAS_SPLITUVROW_MSA 929 ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31) 930 #endif 914 931 #ifdef HAS_ARGBTOUV444ROW_SSSE3 915 932 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) … … 934 951 #endif 935 952 #undef ANY12 953 954 // Any 1 to 3. Outputs RGB planes. 955 #define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ 956 void NAMEANY(const uint8* src_ptr, uint8* dst_r, uint8* dst_g, uint8* dst_b, \ 957 int width) { \ 958 SIMD_ALIGNED(uint8 temp[16 * 6]); \ 959 memset(temp, 0, 16 * 3); /* for msan */ \ 960 int r = width & MASK; \ 961 int n = width & ~MASK; \ 962 if (n > 0) { \ 963 ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ 964 } \ 965 memcpy(temp, src_ptr + n * BPP, r * BPP); \ 966 ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ 967 memcpy(dst_r + n, temp + 16 * 3, r); \ 968 memcpy(dst_g + n, temp + 16 * 4, r); \ 969 memcpy(dst_b + n, temp + 16 * 5, r); \ 970 } 971 972 #ifdef HAS_SPLITRGBROW_SSSE3 973 ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) 974 #endif 975 #ifdef HAS_SPLITRGBROW_NEON 976 ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) 977 #endif 936 978 937 979 // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. -
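The new ANY13 macro is what lets the planar RGB split kernels run at any width: the aligned part of the row goes straight to the SIMD kernel, and the remainder (up to MASK pixels) is padded into an aligned scratch buffer, run through the same kernel once, and copied back out. Expanded by hand for the SSSE3 instantiation (BPP = 3, MASK = 15) it behaves like the sketch below; the function name is illustrative, while uint8, SIMD_ALIGNED and SplitRGBRow_SSSE3 are assumed to come from the libyuv headers touched by this changeset (plus <string.h> for memcpy/memset).
------------------------------------------------------------------------------
void SplitRGBRow_Any_Sketch(const uint8* src_ptr, uint8* dst_r, uint8* dst_g,
                            uint8* dst_b, int width) {
  SIMD_ALIGNED(uint8 temp[16 * 6]);  // 16x3 bytes of padded input, 3x16 of output
  memset(temp, 0, 16 * 3);           // keep the padded tail deterministic (msan)
  int r = width & 15;                // remainder pixels, 0..15
  int n = width & ~15;               // pixels the kernel can process directly
  if (n > 0) {
    SplitRGBRow_SSSE3(src_ptr, dst_r, dst_g, dst_b, n);
  }
  // One more 16-pixel kernel call on a padded copy of the tail, then copy
  // back only the r valid bytes of each plane.
  memcpy(temp, src_ptr + n * 3, r * 3);
  SplitRGBRow_SSSE3(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, 16);
  memcpy(dst_r + n, temp + 16 * 3, r);
  memcpy(dst_g + n, temp + 16 * 4, r);
  memcpy(dst_b + n, temp + 16 * 5, r);
}
------------------------------------------------------------------------------
For width = 29, for example, the kernel runs on pixels 0..15 in place and a second padded call covers the last 13, so callers never see an alignment restriction.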
pjproject/trunk/third_party/yuv/source/row_common.cc
r5633 r5699 1771 1771 } 1772 1772 1773 void SplitRGBRow_C(const uint8* src_rgb, 1774 uint8* dst_r, 1775 uint8* dst_g, 1776 uint8* dst_b, 1777 int width) { 1778 int x; 1779 for (x = 0; x < width; ++x) { 1780 dst_r[x] = src_rgb[0]; 1781 dst_g[x] = src_rgb[1]; 1782 dst_b[x] = src_rgb[2]; 1783 src_rgb += 3; 1784 } 1785 } 1786 1787 void MergeRGBRow_C(const uint8* src_r, 1788 const uint8* src_g, 1789 const uint8* src_b, 1790 uint8* dst_rgb, 1791 int width) { 1792 int x; 1793 for (x = 0; x < width; ++x) { 1794 dst_rgb[0] = src_r[x]; 1795 dst_rgb[1] = src_g[x]; 1796 dst_rgb[2] = src_b[x]; 1797 dst_rgb += 3; 1798 } 1799 } 1800 1801 void MergeUVRow_16_C(const uint16* src_u, 1802 const uint16* src_v, 1803 uint16* dst_uv, 1804 int scale, 1805 int width) { 1806 int x; 1807 for (x = 0; x < width - 1; x += 2) { 1808 dst_uv[0] = src_u[x] * scale; 1809 dst_uv[1] = src_v[x] * scale; 1810 dst_uv[2] = src_u[x + 1] * scale; 1811 dst_uv[3] = src_v[x + 1] * scale; 1812 dst_uv += 4; 1813 } 1814 if (width & 1) { 1815 dst_uv[0] = src_u[width - 1] * scale; 1816 dst_uv[1] = src_v[width - 1] * scale; 1817 } 1818 } 1819 1820 void MultiplyRow_16_C(const uint16* src_y, 1821 uint16* dst_y, 1822 int scale, 1823 int width) { 1824 int x; 1825 for (x = 0; x < width; ++x) { 1826 dst_y[x] = src_y[x] * scale; 1827 } 1828 } 1829 1773 1830 void CopyRow_C(const uint8* src, uint8* dst, int count) { 1774 1831 memcpy(dst, src, count); … … 2640 2697 #endif 2641 2698 2699 float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { 2700 float fsum = 0.f; 2701 int i; 2702 #if defined(__clang__) 2703 #pragma clang loop vectorize_width(4) 2704 #endif 2705 for (i = 0; i < width; ++i) { 2706 float v = *src++; 2707 fsum += v * v; 2708 *dst++ = v * scale; 2709 } 2710 return fsum; 2711 } 2712 2713 float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) { 2714 float fmax = 0.f; 2715 int i; 2716 for (i = 0; i < width; ++i) { 2717 float v = *src++; 2718 float vs = v * scale; 2719 fmax = (v > fmax) ? v : fmax; 2720 *dst++ = vs; 2721 } 2722 return fmax; 2723 } 2724 2725 void ScaleSamples_C(const float* src, float* dst, float scale, int width) { 2726 int i; 2727 for (i = 0; i < width; ++i) { 2728 *dst++ = *src++ * scale; 2729 } 2730 } 2731 2732 void GaussRow_C(const uint32* src, uint16* dst, int width) { 2733 int i; 2734 for (i = 0; i < width; ++i) { 2735 *dst++ = 2736 (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8; 2737 ++src; 2738 } 2739 } 2740 2741 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 2742 void GaussCol_C(const uint16* src0, 2743 const uint16* src1, 2744 const uint16* src2, 2745 const uint16* src3, 2746 const uint16* src4, 2747 uint32* dst, 2748 int width) { 2749 int i; 2750 for (i = 0; i < width; ++i) { 2751 *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; 2752 } 2753 } 2754 2642 2755 #ifdef __cplusplus 2643 2756 } // extern "C" -
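Besides the split/merge and 16-bit helpers, this hunk adds the portable reference for a separable 5x5 Gaussian: GaussCol_C weights five input rows by 1,4,6,4,1 (sum 16) into 32-bit column sums, and GaussRow_C applies the same weights horizontally and renormalizes by the combined weight 16*16 = 256 with rounding (+128, >> 8). A hedged sketch of how the two passes compose for one output row; the wrapper name, the fixed-size buffer and the "width + 4 samples per input row" padding are assumptions of this illustration, not the library's calling convention:
------------------------------------------------------------------------------
#include <stdint.h>
typedef uint16_t uint16;
typedef uint32_t uint32;

// Prototypes as added in row_common.cc above.
void GaussCol_C(const uint16* src0, const uint16* src1, const uint16* src2,
                const uint16* src3, const uint16* src4, uint32* dst, int width);
void GaussRow_C(const uint32* src, uint16* dst, int width);

enum { kMaxWidth = 1024 };  // illustration only

// rows[0..4] are five consecutive input rows, each with width + 4 samples
// (GaussRow_C reads src[0..4] per output, so the column pass must produce
// width + 4 sums; real callers handle the edges themselves).
void GaussBlur5x5_OneRow_Sketch(const uint16* rows[5], uint16* dst, int width) {
  uint32 col_sums[kMaxWidth + 4];
  GaussCol_C(rows[0], rows[1], rows[2], rows[3], rows[4], col_sums, width + 4);
  GaussRow_C(col_sums, dst, width);
}
------------------------------------------------------------------------------
Sanity check of the normalization: for a constant plane of value v, every column sum is 16v, every row sum is 256v, and (256v + 128) >> 8 gives back v.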
pjproject/trunk/third_party/yuv/source/row_gcc.cc
r5633 r5699 39 39 127, -84, -43, 0, 127, -84, -43, 0}; 40 40 41 static vec8 kARGBToV = { 42 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, 43 }; 41 static vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, 42 -18, -94, 112, 0, -18, -94, 112, 0}; 44 43 45 44 static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, … … 2754 2753 } 2755 2754 #endif // HAS_MERGEUVROW_SSE2 2755 2756 // Use scale to convert lsb formats to msb, depending how many bits there are: 2757 // 128 = 9 bits 2758 // 64 = 10 bits 2759 // 16 = 12 bits 2760 // 1 = 16 bits 2761 #ifdef HAS_MERGEUVROW_16_AVX2 2762 void MergeUVRow_16_AVX2(const uint16* src_u, 2763 const uint16* src_v, 2764 uint16* dst_uv, 2765 int scale, 2766 int width) { 2767 // clang-format off 2768 asm volatile ( 2769 "vmovd %4,%%xmm3 \n" 2770 "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" 2771 "vbroadcastss %%xmm3,%%ymm3 \n" 2772 "sub %0,%1 \n" 2773 2774 // 16 pixels per loop. 2775 LABELALIGN 2776 "1: \n" 2777 "vmovdqu (%0),%%ymm0 \n" 2778 "vmovdqu (%0,%1,1),%%ymm1 \n" 2779 "add $0x20,%0 \n" 2780 2781 "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" 2782 "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" 2783 "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates 2784 "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" 2785 "vextractf128 $0x0,%%ymm2,(%2) \n" 2786 "vextractf128 $0x0,%%ymm0,0x10(%2) \n" 2787 "vextractf128 $0x1,%%ymm2,0x20(%2) \n" 2788 "vextractf128 $0x1,%%ymm0,0x30(%2) \n" 2789 "add $0x40,%2 \n" 2790 "sub $0x10,%3 \n" 2791 "jg 1b \n" 2792 "vzeroupper \n" 2793 : "+r"(src_u), // %0 2794 "+r"(src_v), // %1 2795 "+r"(dst_uv), // %2 2796 "+r"(width) // %3 2797 : "r"(scale) // %4 2798 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); 2799 // clang-format on 2800 } 2801 #endif // HAS_MERGEUVROW_AVX2 2802 2803 #ifdef HAS_MULTIPLYROW_16_AVX2 2804 void MultiplyRow_16_AVX2(const uint16* src_y, 2805 uint16* dst_y, 2806 int scale, 2807 int width) { 2808 // clang-format off 2809 asm volatile ( 2810 "vmovd %3,%%xmm3 \n" 2811 "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" 2812 "vbroadcastss %%xmm3,%%ymm3 \n" 2813 "sub %0,%1 \n" 2814 2815 // 16 pixels per loop. 2816 LABELALIGN 2817 "1: \n" 2818 "vmovdqu (%0),%%ymm0 \n" 2819 "vmovdqu 0x20(%0),%%ymm1 \n" 2820 "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" 2821 "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" 2822 "vmovdqu %%ymm0,(%0,%1) \n" 2823 "vmovdqu %%ymm1,0x20(%0,%1) \n" 2824 "add $0x40,%0 \n" 2825 "sub $0x20,%2 \n" 2826 "jg 1b \n" 2827 "vzeroupper \n" 2828 : "+r"(src_y), // %0 2829 "+r"(dst_y), // %1 2830 "+r"(width) // %2 2831 : "r"(scale) // %3 2832 : "memory", "cc", "xmm0", "xmm1", "xmm3"); 2833 // clang-format on 2834 } 2835 #endif // HAS_MULTIPLYROW_16_AVX2 2836 2837 #ifdef HAS_SPLITRGBROW_SSSE3 2838 2839 // Shuffle table for converting RGB to Planar. 
2840 static uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u, 2841 128u, 128u, 128u, 128u, 128u, 128u, 2842 128u, 128u, 128u, 128u}; 2843 static uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u, 2844 2u, 5u, 8u, 11u, 14u, 128u, 2845 128u, 128u, 128u, 128u}; 2846 static uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u, 2847 128u, 128u, 128u, 128u, 128u, 1u, 2848 4u, 7u, 10u, 13u}; 2849 2850 static uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u, 2851 128u, 128u, 128u, 128u, 128u, 128u, 2852 128u, 128u, 128u, 128u}; 2853 static uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u, 2854 3u, 6u, 9u, 12u, 15u, 128u, 2855 128u, 128u, 128u, 128u}; 2856 static uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u, 2857 128u, 128u, 128u, 128u, 128u, 2u, 2858 5u, 8u, 11u, 14u}; 2859 2860 static uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u, 2861 128u, 128u, 128u, 128u, 128u, 128u, 2862 128u, 128u, 128u, 128u}; 2863 static uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u, 2864 4u, 7u, 10u, 13u, 128u, 128u, 2865 128u, 128u, 128u, 128u}; 2866 static uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u, 2867 128u, 128u, 128u, 128u, 0u, 3u, 2868 6u, 9u, 12u, 15u}; 2869 2870 void SplitRGBRow_SSSE3(const uint8* src_rgb, 2871 uint8* dst_r, 2872 uint8* dst_g, 2873 uint8* dst_b, 2874 int width) { 2875 asm volatile ( 2876 LABELALIGN 2877 "1: \n" 2878 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2879 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 2880 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 2881 "pshufb %5, %%xmm0 \n" 2882 "pshufb %6, %%xmm1 \n" 2883 "pshufb %7, %%xmm2 \n" 2884 "por %%xmm1,%%xmm0 \n" 2885 "por %%xmm2,%%xmm0 \n" 2886 "movdqu %%xmm0," MEMACCESS(1) " \n" 2887 "lea " MEMLEA(0x10,1) ",%1 \n" 2888 2889 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2890 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 2891 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 2892 "pshufb %8, %%xmm0 \n" 2893 "pshufb %9, %%xmm1 \n" 2894 "pshufb %10, %%xmm2 \n" 2895 "por %%xmm1,%%xmm0 \n" 2896 "por %%xmm2,%%xmm0 \n" 2897 "movdqu %%xmm0," MEMACCESS(2) " \n" 2898 "lea " MEMLEA(0x10,2) ",%2 \n" 2899 2900 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2901 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 2902 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 2903 "pshufb %11, %%xmm0 \n" 2904 "pshufb %12, %%xmm1 \n" 2905 "pshufb %13, %%xmm2 \n" 2906 "por %%xmm1,%%xmm0 \n" 2907 "por %%xmm2,%%xmm0 \n" 2908 "movdqu %%xmm0," MEMACCESS(3) " \n" 2909 "lea " MEMLEA(0x10,3) ",%3 \n" 2910 "lea " MEMLEA(0x30,0) ",%0 \n" 2911 "sub $0x10,%4 \n" 2912 "jg 1b \n" 2913 : "+r"(src_rgb), // %0 2914 "+r"(dst_r), // %1 2915 "+r"(dst_g), // %2 2916 "+r"(dst_b), // %3 2917 "+r"(width) // %4 2918 : "m"(kShuffleMaskRGBToR0), // %5 2919 "m"(kShuffleMaskRGBToR1), // %6 2920 "m"(kShuffleMaskRGBToR2), // %7 2921 "m"(kShuffleMaskRGBToG0), // %8 2922 "m"(kShuffleMaskRGBToG1), // %9 2923 "m"(kShuffleMaskRGBToG2), // %10 2924 "m"(kShuffleMaskRGBToB0), // %11 2925 "m"(kShuffleMaskRGBToB1), // %12 2926 "m"(kShuffleMaskRGBToB2) // %13 2927 : "memory", "cc", NACL_R14 2928 "xmm0", "xmm1", "xmm2" 2929 ); 2930 } 2931 #endif // HAS_SPLITRGBROW_SSSE3 2932 2933 #ifdef HAS_MERGERGBROW_SSSE3 2934 2935 // Shuffle table for converting RGB to Planar. 
2936 static uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u, 2937 2u, 128u, 128u, 3u, 128u, 128u, 2938 4u, 128u, 128u, 5u}; 2939 static uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u, 2940 128u, 2u, 128u, 128u, 3u, 128u, 2941 128u, 4u, 128u, 128u}; 2942 static uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u, 2943 128u, 128u, 2u, 128u, 128u, 3u, 2944 128u, 128u, 4u, 128u}; 2945 2946 static uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u, 2947 7u, 128u, 128u, 8u, 128u, 128u, 2948 9u, 128u, 128u, 10u}; 2949 static uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u, 2950 128u, 7u, 128u, 128u, 8u, 128u, 2951 128u, 9u, 128u, 128u}; 2952 static uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u, 2953 128u, 128u, 8u, 128u, 128u, 9u, 2954 128u, 128u, 10u, 128u}; 2955 2956 static uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u, 2957 12u, 128u, 128u, 13u, 128u, 128u, 2958 14u, 128u, 128u, 15u}; 2959 static uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u, 2960 128u, 13u, 128u, 128u, 14u, 128u, 2961 128u, 15u, 128u, 128u}; 2962 static uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u, 2963 128u, 128u, 13u, 128u, 128u, 14u, 2964 128u, 128u, 15u, 128u}; 2965 2966 void MergeRGBRow_SSSE3(const uint8* src_r, 2967 const uint8* src_g, 2968 const uint8* src_b, 2969 uint8* dst_rgb, 2970 int width) { 2971 asm volatile ( 2972 LABELALIGN 2973 "1: \n" 2974 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2975 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 2976 "movdqu " MEMACCESS(2) ",%%xmm2 \n" 2977 "pshufb %5, %%xmm0 \n" 2978 "pshufb %6, %%xmm1 \n" 2979 "pshufb %7, %%xmm2 \n" 2980 "por %%xmm1,%%xmm0 \n" 2981 "por %%xmm2,%%xmm0 \n" 2982 "movdqu %%xmm0," MEMACCESS(3) " \n" 2983 2984 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2985 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 2986 "movdqu " MEMACCESS(2) ",%%xmm2 \n" 2987 "pshufb %8, %%xmm0 \n" 2988 "pshufb %9, %%xmm1 \n" 2989 "pshufb %10, %%xmm2 \n" 2990 "por %%xmm1,%%xmm0 \n" 2991 "por %%xmm2,%%xmm0 \n" 2992 "movdqu %%xmm0," MEMACCESS2(16, 3) " \n" 2993 2994 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2995 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 2996 "movdqu " MEMACCESS(2) ",%%xmm2 \n" 2997 "pshufb %11, %%xmm0 \n" 2998 "pshufb %12, %%xmm1 \n" 2999 "pshufb %13, %%xmm2 \n" 3000 "por %%xmm1,%%xmm0 \n" 3001 "por %%xmm2,%%xmm0 \n" 3002 "movdqu %%xmm0," MEMACCESS2(32, 3) " \n" 3003 3004 "lea " MEMLEA(0x10,0) ",%0 \n" 3005 "lea " MEMLEA(0x10,1) ",%1 \n" 3006 "lea " MEMLEA(0x10,2) ",%2 \n" 3007 "lea " MEMLEA(0x30,3) ",%3 \n" 3008 "sub $0x10,%4 \n" 3009 "jg 1b \n" 3010 : "+r"(src_r), // %0 3011 "+r"(src_g), // %1 3012 "+r"(src_b), // %2 3013 "+r"(dst_rgb), // %3 3014 "+r"(width) // %4 3015 : "m"(kShuffleMaskRToRGB0), // %5 3016 "m"(kShuffleMaskGToRGB0), // %6 3017 "m"(kShuffleMaskBToRGB0), // %7 3018 "m"(kShuffleMaskRToRGB1), // %8 3019 "m"(kShuffleMaskGToRGB1), // %9 3020 "m"(kShuffleMaskBToRGB1), // %10 3021 "m"(kShuffleMaskRToRGB2), // %11 3022 "m"(kShuffleMaskGToRGB2), // %12 3023 "m"(kShuffleMaskBToRGB2) // %13 3024 : "memory", "cc", NACL_R14 3025 "xmm0", "xmm1", "xmm2" 3026 ); 3027 } 3028 #endif // HAS_MERGERGBROW_SSSE3 2756 3029 2757 3030 #ifdef HAS_COPYROW_SSE2 … … 5454 5727 static float kScaleBias = 1.9259299444e-34f; 5455 5728 void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) { 5729 scale *= kScaleBias; 5456 5730 asm volatile ( 5457 5731 "pshufd $0x0,%3,%%xmm4 \n" … … 5480 5754 "+r"(dst), // %1 5481 5755 "+r"(width) // %2 5482 : "x"(scale * kScaleBias) // %3 5756 #if 
defined(__x86_64__) 5757 : "x"(scale) // %3 5758 #else 5759 : "m"(scale) // %3 5760 #endif 5483 5761 : "memory", "cc", 5484 5762 "xmm2", "xmm3", "xmm4", "xmm5" … … 5489 5767 #ifdef HAS_HALFFLOATROW_AVX2 5490 5768 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { 5769 scale *= kScaleBias; 5491 5770 asm volatile ( 5492 5771 "vbroadcastss %3, %%ymm4 \n" … … 5516 5795 "+r"(dst), // %1 5517 5796 "+r"(width) // %2 5518 : "x"(scale * kScaleBias) // %3 5797 #if defined(__x86_64__) 5798 : "x"(scale) // %3 5799 #else 5800 : "m"(scale) // %3 5801 #endif 5519 5802 : "memory", "cc", 5520 5803 "xmm2", "xmm3", "xmm4", "xmm5" … … 5549 5832 "+r"(dst), // %1 5550 5833 "+r"(width) // %2 5834 #if defined(__x86_64__) 5551 5835 : "x"(scale) // %3 5836 #else 5837 : "m"(scale) // %3 5838 #endif 5552 5839 : "memory", "cc", 5553 5840 "xmm2", "xmm3", "xmm4" -
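Two details in the row_gcc.cc hunk are worth spelling out. First, the scale argument of the new 16-bit merge/multiply kernels follows the comment above: samples stored in the low N bits of a 16-bit word are multiplied by 2^(16 - N) so they end up MSB-aligned (128 for 9-bit, 64 for 10-bit, 16 for 12-bit, 1 for 16-bit data), which is what the broadcast vpmullw does per lane. Second, the HalfFloatRow kernels now fold kScaleBias into scale before the asm block, and 32-bit builds pass the result through a memory operand ("m") instead of an SSE register operand ("x"). A tiny self-check of the scale arithmetic (values are illustrative):
------------------------------------------------------------------------------
#include <assert.h>
#include <stdint.h>

int main(void) {
  uint16_t ten_bit_white = 1023;                       // max 10-bit sample
  assert((uint16_t)(ten_bit_white * 64) == 0xFFC0);    // scale 64: top 10 bits
  uint16_t nine_bit_white = 511;                       // max 9-bit sample
  assert((uint16_t)(nine_bit_white * 128) == 0xFF80);  // scale 128: top 9 bits
  uint16_t full_range = 0xFFFF;
  assert((uint16_t)(full_range * 1) == 0xFFFF);        // 16-bit data passes through
  return 0;
}
------------------------------------------------------------------------------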
pjproject/trunk/third_party/yuv/source/row_msa.cc
r5633 r5699 2918 2918 void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) { 2919 2919 int x; 2920 v 16u8 dst0 = (v16u8)__msa_fill_w(v32);2920 v4i32 dst0 = __builtin_msa_fill_w(v32); 2921 2921 2922 2922 for (x = 0; x < width; x += 4) { … … 2970 2970 } 2971 2971 2972 void ARGBExtractAlphaRow_MSA(const uint8* src_argb, uint8* dst_a, int width) { 2973 int i; 2974 v16u8 src0, src1, src2, src3, vec0, vec1, dst0; 2975 2976 for (i = 0; i < width; i += 16) { 2977 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); 2978 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); 2979 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32); 2980 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48); 2981 vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); 2982 vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); 2983 dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); 2984 ST_UB(dst0, dst_a); 2985 src_argb += 64; 2986 dst_a += 16; 2987 } 2988 } 2989 2990 void ARGBBlendRow_MSA(const uint8* src_argb0, 2991 const uint8* src_argb1, 2992 uint8* dst_argb, 2993 int width) { 2994 int x; 2995 v16u8 src0, src1, src2, src3, dst0, dst1; 2996 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2997 v8u16 vec8, vec9, vec10, vec11, vec12, vec13; 2998 v8u16 const_256 = (v8u16)__msa_ldi_h(256); 2999 v16u8 const_255 = (v16u8)__msa_ldi_b(255); 3000 v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; 3001 v16i8 zero = {0}; 3002 3003 for (x = 0; x < width; x += 8) { 3004 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); 3005 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); 3006 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0); 3007 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16); 3008 vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); 3009 vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); 3010 vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); 3011 vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); 3012 vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); 3013 vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); 3014 vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3); 3015 vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3); 3016 vec8 = (v8u16)__msa_fill_h(vec0[3]); 3017 vec9 = (v8u16)__msa_fill_h(vec0[7]); 3018 vec10 = (v8u16)__msa_fill_h(vec1[3]); 3019 vec11 = (v8u16)__msa_fill_h(vec1[7]); 3020 vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); 3021 vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); 3022 vec10 = (v8u16)__msa_fill_h(vec2[3]); 3023 vec11 = (v8u16)__msa_fill_h(vec2[7]); 3024 vec12 = (v8u16)__msa_fill_h(vec3[3]); 3025 vec13 = (v8u16)__msa_fill_h(vec3[7]); 3026 vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); 3027 vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12); 3028 vec8 = const_256 - vec8; 3029 vec9 = const_256 - vec9; 3030 vec10 = const_256 - vec10; 3031 vec11 = const_256 - vec11; 3032 vec8 *= vec4; 3033 vec9 *= vec5; 3034 vec10 *= vec6; 3035 vec11 *= vec7; 3036 vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8); 3037 vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); 3038 vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); 3039 vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); 3040 vec0 += vec8; 3041 vec1 += vec9; 3042 vec2 += vec10; 3043 vec3 += vec11; 3044 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 3045 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); 3046 dst0 = __msa_bmnz_v(dst0, const_255, mask); 3047 dst1 = __msa_bmnz_v(dst1, const_255, mask); 3048 ST_UB2(dst0, dst1, dst_argb, 16); 3049 src_argb0 += 32; 3050 src_argb1 += 32; 3051 dst_argb += 32; 3052 } 3053 } 3054 3055 void ARGBQuantizeRow_MSA(uint8* dst_argb, 
3056 int scale, 3057 int interval_size, 3058 int interval_offset, 3059 int width) { 3060 int x; 3061 v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; 3062 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3063 v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3064 v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; 3065 v4i32 vec_scale = __msa_fill_w(scale); 3066 v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size); 3067 v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset); 3068 v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; 3069 v16i8 zero = {0}; 3070 3071 for (x = 0; x < width; x += 8) { 3072 src0 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 0); 3073 src1 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 16); 3074 src2 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 32); 3075 src3 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 48); 3076 vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0); 3077 vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0); 3078 vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); 3079 vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); 3080 vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); 3081 vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); 3082 vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3); 3083 vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3); 3084 tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); 3085 tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); 3086 tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); 3087 tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); 3088 tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2); 3089 tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2); 3090 tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3); 3091 tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3); 3092 tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4); 3093 tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4); 3094 tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5); 3095 tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5); 3096 tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6); 3097 tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6); 3098 tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7); 3099 tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7); 3100 tmp0 *= vec_scale; 3101 tmp1 *= vec_scale; 3102 tmp2 *= vec_scale; 3103 tmp3 *= vec_scale; 3104 tmp4 *= vec_scale; 3105 tmp5 *= vec_scale; 3106 tmp6 *= vec_scale; 3107 tmp7 *= vec_scale; 3108 tmp8 *= vec_scale; 3109 tmp9 *= vec_scale; 3110 tmp10 *= vec_scale; 3111 tmp11 *= vec_scale; 3112 tmp12 *= vec_scale; 3113 tmp13 *= vec_scale; 3114 tmp14 *= vec_scale; 3115 tmp15 *= vec_scale; 3116 tmp0 >>= 16; 3117 tmp1 >>= 16; 3118 tmp2 >>= 16; 3119 tmp3 >>= 16; 3120 tmp4 >>= 16; 3121 tmp5 >>= 16; 3122 tmp6 >>= 16; 3123 tmp7 >>= 16; 3124 tmp8 >>= 16; 3125 tmp9 >>= 16; 3126 tmp10 >>= 16; 3127 tmp11 >>= 16; 3128 tmp12 >>= 16; 3129 tmp13 >>= 16; 3130 tmp14 >>= 16; 3131 tmp15 >>= 16; 3132 vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); 3133 vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); 3134 vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); 3135 vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); 3136 vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); 3137 vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); 3138 vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); 3139 vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); 3140 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 3141 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); 3142 dst2 = 
(v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); 3143 dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); 3144 dst0 *= vec_int_sz; 3145 dst1 *= vec_int_sz; 3146 dst2 *= vec_int_sz; 3147 dst3 *= vec_int_sz; 3148 dst0 += vec_int_ofst; 3149 dst1 += vec_int_ofst; 3150 dst2 += vec_int_ofst; 3151 dst3 += vec_int_ofst; 3152 dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0); 3153 dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1); 3154 dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2); 3155 dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3); 3156 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); 3157 dst_argb += 64; 3158 } 3159 } 3160 3161 void ARGBColorMatrixRow_MSA(const uint8* src_argb, 3162 uint8* dst_argb, 3163 const int8* matrix_argb, 3164 int width) { 3165 int32 x; 3166 v16i8 src0; 3167 v16u8 src1, src2, dst0, dst1; 3168 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 3169 v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; 3170 v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3171 v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; 3172 v16i8 zero = {0}; 3173 v8i16 max = __msa_ldi_h(255); 3174 3175 src0 = __msa_ld_b((v16i8*)matrix_argb, 0); 3176 vec0 = (v8i16)__msa_ilvr_b(zero, src0); 3177 vec1 = (v8i16)__msa_ilvl_b(zero, src0); 3178 3179 for (x = 0; x < width; x += 8) { 3180 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); 3181 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); 3182 vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); 3183 vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); 3184 vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); 3185 vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); 3186 vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2); 3187 vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3); 3188 vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4); 3189 vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5); 3190 vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2); 3191 vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3); 3192 vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4); 3193 vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5); 3194 vec10 = vec2 * vec0; 3195 vec11 = vec2 * vec1; 3196 vec12 = vec6 * vec0; 3197 vec13 = vec6 * vec1; 3198 tmp0 = __msa_hadd_s_w(vec10, vec10); 3199 tmp1 = __msa_hadd_s_w(vec11, vec11); 3200 tmp2 = __msa_hadd_s_w(vec12, vec12); 3201 tmp3 = __msa_hadd_s_w(vec13, vec13); 3202 vec14 = vec3 * vec0; 3203 vec15 = vec3 * vec1; 3204 vec16 = vec7 * vec0; 3205 vec17 = vec7 * vec1; 3206 tmp4 = __msa_hadd_s_w(vec14, vec14); 3207 tmp5 = __msa_hadd_s_w(vec15, vec15); 3208 tmp6 = __msa_hadd_s_w(vec16, vec16); 3209 tmp7 = __msa_hadd_s_w(vec17, vec17); 3210 vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); 3211 vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); 3212 vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); 3213 vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); 3214 tmp0 = __msa_hadd_s_w(vec10, vec10); 3215 tmp1 = __msa_hadd_s_w(vec11, vec11); 3216 tmp2 = __msa_hadd_s_w(vec12, vec12); 3217 tmp3 = __msa_hadd_s_w(vec13, vec13); 3218 tmp0 = __msa_srai_w(tmp0, 6); 3219 tmp1 = __msa_srai_w(tmp1, 6); 3220 tmp2 = __msa_srai_w(tmp2, 6); 3221 tmp3 = __msa_srai_w(tmp3, 6); 3222 vec2 = vec4 * vec0; 3223 vec6 = vec4 * vec1; 3224 vec3 = vec8 * vec0; 3225 vec7 = vec8 * vec1; 3226 tmp8 = __msa_hadd_s_w(vec2, vec2); 3227 tmp9 = __msa_hadd_s_w(vec6, vec6); 3228 tmp10 = __msa_hadd_s_w(vec3, vec3); 3229 tmp11 = __msa_hadd_s_w(vec7, vec7); 3230 vec4 = vec5 * vec0; 3231 vec8 = vec5 * vec1; 3232 vec5 = vec9 
* vec0; 3233 vec9 = vec9 * vec1; 3234 tmp12 = __msa_hadd_s_w(vec4, vec4); 3235 tmp13 = __msa_hadd_s_w(vec8, vec8); 3236 tmp14 = __msa_hadd_s_w(vec5, vec5); 3237 tmp15 = __msa_hadd_s_w(vec9, vec9); 3238 vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); 3239 vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); 3240 vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); 3241 vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); 3242 tmp4 = __msa_hadd_s_w(vec14, vec14); 3243 tmp5 = __msa_hadd_s_w(vec15, vec15); 3244 tmp6 = __msa_hadd_s_w(vec16, vec16); 3245 tmp7 = __msa_hadd_s_w(vec17, vec17); 3246 tmp4 = __msa_srai_w(tmp4, 6); 3247 tmp5 = __msa_srai_w(tmp5, 6); 3248 tmp6 = __msa_srai_w(tmp6, 6); 3249 tmp7 = __msa_srai_w(tmp7, 6); 3250 vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); 3251 vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); 3252 vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); 3253 vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); 3254 vec10 = __msa_maxi_s_h(vec10, 0); 3255 vec11 = __msa_maxi_s_h(vec11, 0); 3256 vec12 = __msa_maxi_s_h(vec12, 0); 3257 vec13 = __msa_maxi_s_h(vec13, 0); 3258 vec10 = __msa_min_s_h(vec10, max); 3259 vec11 = __msa_min_s_h(vec11, max); 3260 vec12 = __msa_min_s_h(vec12, max); 3261 vec13 = __msa_min_s_h(vec13, max); 3262 dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); 3263 dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12); 3264 ST_UB2(dst0, dst1, dst_argb, 16); 3265 src_argb += 32; 3266 dst_argb += 32; 3267 } 3268 } 3269 3270 void SplitUVRow_MSA(const uint8* src_uv, 3271 uint8* dst_u, 3272 uint8* dst_v, 3273 int width) { 3274 int x; 3275 v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; 3276 3277 for (x = 0; x < width; x += 32) { 3278 src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0); 3279 src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16); 3280 src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32); 3281 src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48); 3282 dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); 3283 dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); 3284 dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); 3285 dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); 3286 ST_UB2(dst0, dst1, dst_u, 16); 3287 ST_UB2(dst2, dst3, dst_v, 16); 3288 src_uv += 64; 3289 dst_u += 32; 3290 dst_v += 32; 3291 } 3292 } 3293 3294 void SetRow_MSA(uint8* dst, uint8 v8, int width) { 3295 int x; 3296 v16u8 dst0 = (v16u8)__msa_fill_b(v8); 3297 3298 for (x = 0; x < width; x += 16) { 3299 ST_UB(dst0, dst); 3300 dst += 16; 3301 } 3302 } 3303 3304 void MirrorUVRow_MSA(const uint8* src_uv, 3305 uint8* dst_u, 3306 uint8* dst_v, 3307 int width) { 3308 int x; 3309 v16u8 src0, src1, src2, src3; 3310 v16u8 dst0, dst1, dst2, dst3; 3311 v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0}; 3312 v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1}; 3313 3314 src_uv += (2 * width); 3315 3316 for (x = 0; x < width; x += 32) { 3317 src_uv -= 64; 3318 src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0); 3319 src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16); 3320 src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32); 3321 src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48); 3322 dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); 3323 dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); 3324 dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); 3325 dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); 3326 ST_UB2(dst0, dst1, dst_v, 16); 3327 ST_UB2(dst2, dst3, dst_u, 16); 3328 dst_u += 32; 3329 dst_v += 32; 3330 } 3331 } 3332 3333 void 
SobelXRow_MSA(const uint8* src_y0, 3334 const uint8* src_y1, 3335 const uint8* src_y2, 3336 uint8* dst_sobelx, 3337 int32 width) { 3338 int x; 3339 v16u8 src0, src1, src2, src3, src4, src5, dst0; 3340 v8i16 vec0, vec1, vec2, vec3, vec4, vec5; 3341 v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9}; 3342 v16i8 tmp = __msa_ldi_b(8); 3343 v16i8 mask1 = mask0 + tmp; 3344 v8i16 zero = {0}; 3345 v8i16 max = __msa_ldi_h(255); 3346 3347 for (x = 0; x < width; x += 16) { 3348 src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0); 3349 src1 = (v16u8)__msa_ld_b((v16i8*)src_y0, 16); 3350 src2 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0); 3351 src3 = (v16u8)__msa_ld_b((v16i8*)src_y1, 16); 3352 src4 = (v16u8)__msa_ld_b((v16i8*)src_y2, 0); 3353 src5 = (v16u8)__msa_ld_b((v16i8*)src_y2, 16); 3354 vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); 3355 vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); 3356 vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); 3357 vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); 3358 vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4); 3359 vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); 3360 vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); 3361 vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); 3362 vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); 3363 vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); 3364 vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4); 3365 vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5); 3366 vec0 += vec2; 3367 vec1 += vec3; 3368 vec4 += vec2; 3369 vec5 += vec3; 3370 vec0 += vec4; 3371 vec1 += vec5; 3372 vec0 = __msa_add_a_h(zero, vec0); 3373 vec1 = __msa_add_a_h(zero, vec1); 3374 vec0 = __msa_maxi_s_h(vec0, 0); 3375 vec1 = __msa_maxi_s_h(vec1, 0); 3376 vec0 = __msa_min_s_h(max, vec0); 3377 vec1 = __msa_min_s_h(max, vec1); 3378 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 3379 ST_UB(dst0, dst_sobelx); 3380 src_y0 += 16; 3381 src_y1 += 16; 3382 src_y2 += 16; 3383 dst_sobelx += 16; 3384 } 3385 } 3386 3387 void SobelYRow_MSA(const uint8* src_y0, 3388 const uint8* src_y1, 3389 uint8* dst_sobely, 3390 int32 width) { 3391 int x; 3392 v16u8 src0, src1, dst0; 3393 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; 3394 v8i16 zero = {0}; 3395 v8i16 max = __msa_ldi_h(255); 3396 3397 for (x = 0; x < width; x += 16) { 3398 src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0); 3399 src1 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0); 3400 vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0); 3401 vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0); 3402 vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); 3403 vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); 3404 vec0 -= vec2; 3405 vec1 -= vec3; 3406 vec6[0] = src_y0[16] - src_y1[16]; 3407 vec6[1] = src_y0[17] - src_y1[17]; 3408 vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2); 3409 vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2); 3410 vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4); 3411 vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4); 3412 vec0 += vec2; 3413 vec1 += vec3; 3414 vec4 += vec2; 3415 vec5 += vec3; 3416 vec0 += vec4; 3417 vec1 += vec5; 3418 vec0 = __msa_add_a_h(zero, vec0); 3419 vec1 = __msa_add_a_h(zero, vec1); 3420 vec0 = __msa_maxi_s_h(vec0, 0); 3421 vec1 = __msa_maxi_s_h(vec1, 0); 3422 vec0 = __msa_min_s_h(max, vec0); 3423 vec1 = __msa_min_s_h(max, vec1); 3424 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 3425 ST_UB(dst0, dst_sobely); 3426 src_y0 += 16; 3427 src_y1 += 
16; 3428 dst_sobely += 16; 3429 } 3430 } 3431 3432 void HalfFloatRow_MSA(const uint16* src, uint16* dst, float scale, int width) { 3433 int i; 3434 v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3; 3435 v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3436 v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7; 3437 v4f32 mult_vec; 3438 v8i16 zero = {0}; 3439 mult_vec[0] = 1.9259299444e-34f * scale; 3440 mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0); 3441 3442 for (i = 0; i < width; i += 32) { 3443 src0 = (v8u16)__msa_ld_h((v8i16*)src, 0); 3444 src1 = (v8u16)__msa_ld_h((v8i16*)src, 16); 3445 src2 = (v8u16)__msa_ld_h((v8i16*)src, 32); 3446 src3 = (v8u16)__msa_ld_h((v8i16*)src, 48); 3447 vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0); 3448 vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0); 3449 vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1); 3450 vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1); 3451 vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2); 3452 vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2); 3453 vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3); 3454 vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3); 3455 fvec0 = __msa_ffint_u_w(vec0); 3456 fvec1 = __msa_ffint_u_w(vec1); 3457 fvec2 = __msa_ffint_u_w(vec2); 3458 fvec3 = __msa_ffint_u_w(vec3); 3459 fvec4 = __msa_ffint_u_w(vec4); 3460 fvec5 = __msa_ffint_u_w(vec5); 3461 fvec6 = __msa_ffint_u_w(vec6); 3462 fvec7 = __msa_ffint_u_w(vec7); 3463 fvec0 *= mult_vec; 3464 fvec1 *= mult_vec; 3465 fvec2 *= mult_vec; 3466 fvec3 *= mult_vec; 3467 fvec4 *= mult_vec; 3468 fvec5 *= mult_vec; 3469 fvec6 *= mult_vec; 3470 fvec7 *= mult_vec; 3471 vec0 = ((v4u32)fvec0) >> 13; 3472 vec1 = ((v4u32)fvec1) >> 13; 3473 vec2 = ((v4u32)fvec2) >> 13; 3474 vec3 = ((v4u32)fvec3) >> 13; 3475 vec4 = ((v4u32)fvec4) >> 13; 3476 vec5 = ((v4u32)fvec5) >> 13; 3477 vec6 = ((v4u32)fvec6) >> 13; 3478 vec7 = ((v4u32)fvec7) >> 13; 3479 dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); 3480 dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2); 3481 dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); 3482 dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); 3483 ST_UH2(dst0, dst1, dst, 8); 3484 ST_UH2(dst2, dst3, dst + 16, 8); 3485 src += 32; 3486 dst += 32; 3487 } 3488 } 3489 2972 3490 #ifdef __cplusplus 2973 3491 } // extern "C" -
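The last addition in this hunk, HalfFloatRow_MSA, uses the same float-to-half trick as the x86 versions: each uint16 sample is converted to float, multiplied by scale * 1.9259299444e-34f (that constant is 2^-112, the gap between the single- and half-precision exponent biases, 127 - 15), and the raw single-precision bits are then shifted right by 13; because the exponent was rebiased, the low 16 bits of the result are the IEEE half-precision encoding (5-bit exponent, 10-bit mantissa). A hedged standalone sketch of the scalar equivalent; it truncates rather than rounds the mantissa, assumes non-negative inputs (the sign bit is discarded by the 16-bit pack), and only covers values that land in the normal half range:
------------------------------------------------------------------------------
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t FloatToHalfTruncate(float value, float scale) {
  float biased = value * scale * 1.9259299444e-34f;  // rebias exponent by 2^-112
  uint32_t bits;
  memcpy(&bits, &biased, sizeof(bits));              // reinterpret without UB
  return (uint16_t)(bits >> 13);                     // 23-bit -> 10-bit mantissa
}

int main(void) {
  printf("%04X\n", FloatToHalfTruncate(1.0f, 1.0f));  // prints 3C00, half for 1.0
  return 0;
}
------------------------------------------------------------------------------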
pjproject/trunk/third_party/yuv/source/row_neon.cc
r5633 r5699 116 116 YUVTORGB_SETUP 117 117 "vmov.u8 d23, #255 \n" 118 "1: 118 "1: \n" READYUV444 YUVTORGB 119 119 "subs %4, %4, #8 \n" 120 120 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" … … 142 142 YUVTORGB_SETUP 143 143 "vmov.u8 d23, #255 \n" 144 "1: 144 "1: \n" READYUV422 YUVTORGB 145 145 "subs %4, %4, #8 \n" 146 146 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" … … 168 168 asm volatile( 169 169 YUVTORGB_SETUP 170 "1: 170 "1: \n" READYUV422 YUVTORGB 171 171 "subs %5, %5, #8 \n" 172 172 "vld1.8 {d23}, [%3]! \n" … … 195 195 asm volatile( 196 196 YUVTORGB_SETUP 197 "1: 197 "1: \n" READYUV422 YUVTORGB 198 198 "subs %4, %4, #8 \n" 199 199 "vmov.u8 d19, #255 \n" // d19 modified by … … 222 222 asm volatile( 223 223 YUVTORGB_SETUP 224 "1: 224 "1: \n" READYUV422 YUVTORGB 225 225 "subs %4, %4, #8 \n" 226 226 "vst3.8 {d20, d21, d22}, [%3]! \n" … … 254 254 asm volatile( 255 255 YUVTORGB_SETUP 256 "1: 256 "1: \n" READYUV422 YUVTORGB 257 257 "subs %4, %4, #8 \n" ARGBTORGB565 258 258 "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. … … 288 288 asm volatile( 289 289 YUVTORGB_SETUP 290 "1: 290 "1: \n" READYUV422 YUVTORGB 291 291 "subs %4, %4, #8 \n" 292 292 "vmov.u8 d23, #255 \n" ARGBTOARGB1555 … … 326 326 "vmov.u8 d4, #0x0f \n" // bits to clear with 327 327 // vbic. 328 "1: 328 "1: \n" READYUV422 YUVTORGB 329 329 "subs %4, %4, #8 \n" 330 330 "vmov.u8 d23, #255 \n" ARGBTOARGB4444 … … 349 349 YUVTORGB_SETUP 350 350 "vmov.u8 d23, #255 \n" 351 "1: 351 "1: \n" READYUV400 YUVTORGB 352 352 "subs %2, %2, #8 \n" 353 353 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" … … 367 367 asm volatile( 368 368 "vmov.u8 d23, #255 \n" 369 "1: 369 "1: \n" 370 370 "vld1.8 {d20}, [%0]! \n" 371 371 "vmov d21, d20 \n" … … 386 386 const struct YuvConstants* yuvconstants, 387 387 int width) { 388 asm volatile( 389 YUVTORGB_SETUP 390 "vmov.u8 d23, #255 \n" 391 "1: \n" READNV12 YUVTORGB 392 "subs %3, %3, #8 \n" 393 "vst4.8 {d20, d21, d22, d23}, [%2]! \n" 394 "bgt 1b \n" 395 : "+r"(src_y), // %0 396 "+r"(src_uv), // %1 397 "+r"(dst_argb), // %2 398 "+r"(width) // %3 399 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 400 [kUVToG] "r"(&yuvconstants->kUVToG), 401 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 402 [kYToRgb] "r"(&yuvconstants->kYToRgb) 403 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", 404 "q12", "q13", "q14", "q15"); 388 asm volatile(YUVTORGB_SETUP 389 "vmov.u8 d23, #255 \n" 390 "1: \n" READNV12 YUVTORGB 391 "subs %3, %3, #8 \n" 392 "vst4.8 {d20, d21, d22, d23}, [%2]! \n" 393 "bgt 1b \n" 394 : "+r"(src_y), // %0 395 "+r"(src_uv), // %1 396 "+r"(dst_argb), // %2 397 "+r"(width) // %3 398 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 399 [kUVToG] "r"(&yuvconstants->kUVToG), 400 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 401 [kYToRgb] "r"(&yuvconstants->kYToRgb) 402 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", 403 "q10", "q11", "q12", "q13", "q14", "q15"); 405 404 } 406 405 … … 410 409 const struct YuvConstants* yuvconstants, 411 410 int width) { 412 asm volatile( 413 YUVTORGB_SETUP 414 "vmov.u8 d23, #255 \n" 415 "1: \n" READNV21 YUVTORGB 416 "subs %3, %3, #8 \n" 417 "vst4.8 {d20, d21, d22, d23}, [%2]! 
\n" 418 "bgt 1b \n" 419 : "+r"(src_y), // %0 420 "+r"(src_vu), // %1 421 "+r"(dst_argb), // %2 422 "+r"(width) // %3 423 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 424 [kUVToG] "r"(&yuvconstants->kUVToG), 425 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 426 [kYToRgb] "r"(&yuvconstants->kYToRgb) 427 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", 428 "q12", "q13", "q14", "q15"); 411 asm volatile(YUVTORGB_SETUP 412 "vmov.u8 d23, #255 \n" 413 "1: \n" READNV21 YUVTORGB 414 "subs %3, %3, #8 \n" 415 "vst4.8 {d20, d21, d22, d23}, [%2]! \n" 416 "bgt 1b \n" 417 : "+r"(src_y), // %0 418 "+r"(src_vu), // %1 419 "+r"(dst_argb), // %2 420 "+r"(width) // %3 421 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 422 [kUVToG] "r"(&yuvconstants->kUVToG), 423 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 424 [kYToRgb] "r"(&yuvconstants->kYToRgb) 425 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", 426 "q10", "q11", "q12", "q13", "q14", "q15"); 429 427 } 430 428 … … 436 434 asm volatile( 437 435 YUVTORGB_SETUP 438 "1: 436 "1: \n" READNV12 YUVTORGB 439 437 "subs %3, %3, #8 \n" ARGBTORGB565 440 438 "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. … … 456 454 const struct YuvConstants* yuvconstants, 457 455 int width) { 458 asm volatile( 459 YUVTORGB_SETUP 460 "vmov.u8 d23, #255 \n" 461 "1: \n" READYUY2 YUVTORGB 462 "subs %2, %2, #8 \n" 463 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" 464 "bgt 1b \n" 465 : "+r"(src_yuy2), // %0 466 "+r"(dst_argb), // %1 467 "+r"(width) // %2 468 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 469 [kUVToG] "r"(&yuvconstants->kUVToG), 470 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 471 [kYToRgb] "r"(&yuvconstants->kYToRgb) 472 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", 473 "q12", "q13", "q14", "q15"); 456 asm volatile(YUVTORGB_SETUP 457 "vmov.u8 d23, #255 \n" 458 "1: \n" READYUY2 YUVTORGB 459 "subs %2, %2, #8 \n" 460 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" 461 "bgt 1b \n" 462 : "+r"(src_yuy2), // %0 463 "+r"(dst_argb), // %1 464 "+r"(width) // %2 465 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 466 [kUVToG] "r"(&yuvconstants->kUVToG), 467 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 468 [kYToRgb] "r"(&yuvconstants->kYToRgb) 469 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", 470 "q10", "q11", "q12", "q13", "q14", "q15"); 474 471 } 475 472 … … 478 475 const struct YuvConstants* yuvconstants, 479 476 int width) { 480 asm volatile( 481 YUVTORGB_SETUP 482 "vmov.u8 d23, #255 \n" 483 "1: \n" READUYVY YUVTORGB 484 "subs %2, %2, #8 \n" 485 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" 486 "bgt 1b \n" 487 : "+r"(src_uyvy), // %0 488 "+r"(dst_argb), // %1 489 "+r"(width) // %2 490 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 491 [kUVToG] "r"(&yuvconstants->kUVToG), 492 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 493 [kYToRgb] "r"(&yuvconstants->kYToRgb) 494 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", 495 "q12", "q13", "q14", "q15"); 477 asm volatile(YUVTORGB_SETUP 478 "vmov.u8 d23, #255 \n" 479 "1: \n" READUYVY YUVTORGB 480 "subs %2, %2, #8 \n" 481 "vst4.8 {d20, d21, d22, d23}, [%1]! 
\n" 482 "bgt 1b \n" 483 : "+r"(src_uyvy), // %0 484 "+r"(dst_argb), // %1 485 "+r"(width) // %2 486 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 487 [kUVToG] "r"(&yuvconstants->kUVToG), 488 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 489 [kYToRgb] "r"(&yuvconstants->kYToRgb) 490 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", 491 "q10", "q11", "q12", "q13", "q14", "q15"); 496 492 } 497 493 … … 502 498 int width) { 503 499 asm volatile( 504 "1: 500 "1: \n" 505 501 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV 506 502 "subs %3, %3, #16 \n" // 16 processed per loop … … 523 519 int width) { 524 520 asm volatile( 525 "1: 521 "1: \n" 526 522 "vld1.8 {q0}, [%0]! \n" // load U 527 523 "vld1.8 {q1}, [%1]! \n" // load V 528 524 "subs %3, %3, #16 \n" // 16 processed per loop 529 "vst2. u8{q0, q1}, [%2]! \n" // store 16 pairs of UV525 "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV 530 526 "bgt 1b \n" 531 527 : "+r"(src_u), // %0 … … 538 534 } 539 535 536 // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. 537 void SplitRGBRow_NEON(const uint8* src_rgb, 538 uint8* dst_r, 539 uint8* dst_g, 540 uint8* dst_b, 541 int width) { 542 asm volatile( 543 "1: \n" 544 "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB 545 "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB 546 "subs %4, %4, #16 \n" // 16 processed per loop 547 "vst1.8 {q0}, [%1]! \n" // store R 548 "vst1.8 {q1}, [%2]! \n" // store G 549 "vst1.8 {q2}, [%3]! \n" // store B 550 "bgt 1b \n" 551 : "+r"(src_rgb), // %0 552 "+r"(dst_r), // %1 553 "+r"(dst_g), // %2 554 "+r"(dst_b), // %3 555 "+r"(width) // %4 556 : // Input registers 557 : "cc", "memory", "d0", "d1", "d2" // Clobber List 558 ); 559 } 560 561 // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time 562 void MergeRGBRow_NEON(const uint8* src_r, 563 const uint8* src_g, 564 const uint8* src_b, 565 uint8* dst_rgb, 566 int width) { 567 asm volatile( 568 "1: \n" 569 "vld1.8 {q0}, [%0]! \n" // load R 570 "vld1.8 {q1}, [%1]! \n" // load G 571 "vld1.8 {q2}, [%2]! \n" // load B 572 "subs %4, %4, #16 \n" // 16 processed per loop 573 "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB 574 "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB 575 "bgt 1b \n" 576 : "+r"(src_r), // %0 577 "+r"(src_g), // %1 578 "+r"(src_b), // %2 579 "+r"(dst_rgb), // %3 580 "+r"(width) // %4 581 : // Input registers 582 : "cc", "memory", "q0", "q1", "q2" // Clobber List 583 ); 584 } 585 540 586 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 541 587 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { 542 588 asm volatile( 543 "1: 589 "1: \n" 544 590 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 545 591 "subs %2, %2, #32 \n" // 32 processed per loop … … 558 604 asm volatile( 559 605 "vdup.8 q0, %2 \n" // duplicate 16 bytes 560 "1: 606 "1: \n" 561 607 "subs %1, %1, #16 \n" // 16 bytes per loop 562 608 "vst1.8 {q0}, [%0]! \n" // store … … 572 618 asm volatile( 573 619 "vdup.u32 q0, %2 \n" // duplicate 4 ints 574 "1: 620 "1: \n" 575 621 "subs %1, %1, #4 \n" // 4 pixels per loop 576 622 "vst1.8 {q0}, [%0]! \n" // store … … 589 635 "sub %0, #16 \n" 590 636 591 "1: 637 "1: \n" 592 638 "vld1.8 {q0}, [%0], r3 \n" // src -= 16 593 639 "subs %2, #16 \n" // 16 pixels per loop. … … 613 659 "sub %0, #16 \n" 614 660 615 "1: 661 "1: \n" 616 662 "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 617 663 "subs %3, #8 \n" // 8 pixels per loop. 
… … 635 681 "sub %0, #16 \n" 636 682 637 "1: 683 "1: \n" 638 684 "vld1.8 {q0}, [%0], r3 \n" // src -= 16 639 685 "subs %2, #4 \n" // 4 pixels per loop. … … 652 698 asm volatile( 653 699 "vmov.u8 d4, #255 \n" // Alpha 654 "1: 700 "1: \n" 655 701 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. 656 702 "subs %2, %2, #8 \n" // 8 processed per loop. … … 668 714 asm volatile( 669 715 "vmov.u8 d4, #255 \n" // Alpha 670 "1: 716 "1: \n" 671 717 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. 672 718 "subs %2, %2, #8 \n" // 8 processed per loop. … … 684 730 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { 685 731 asm volatile( 686 "1: 732 "1: \n" 687 733 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. 688 734 "subs %2, %2, #8 \n" // 8 processed per loop. … … 714 760 asm volatile( 715 761 "vmov.u8 d3, #255 \n" // Alpha 716 "1: 762 "1: \n" 717 763 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 718 764 "subs %2, %2, #8 \n" // 8 processed per loop. … … 760 806 asm volatile( 761 807 "vmov.u8 d3, #255 \n" // Alpha 762 "1: 808 "1: \n" 763 809 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. 764 810 "subs %2, %2, #8 \n" // 8 processed per loop. … … 789 835 asm volatile( 790 836 "vmov.u8 d3, #255 \n" // Alpha 791 "1: 837 "1: \n" 792 838 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. 793 839 "subs %2, %2, #8 \n" // 8 processed per loop. … … 805 851 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { 806 852 asm volatile( 807 "1: 853 "1: \n" 808 854 "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. 809 855 "subs %2, %2, #8 \n" // 8 processed per loop. … … 821 867 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { 822 868 asm volatile( 823 "1: 869 "1: \n" 824 870 "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. 825 871 "subs %2, %2, #8 \n" // 8 processed per loop. … … 837 883 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { 838 884 asm volatile( 839 "1: 885 "1: \n" 840 886 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. 841 887 "subs %2, %2, #16 \n" // 16 processed per loop. … … 852 898 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { 853 899 asm volatile( 854 "1: 900 "1: \n" 855 901 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. 856 902 "subs %2, %2, #16 \n" // 16 processed per loop. … … 870 916 int width) { 871 917 asm volatile( 872 "1: 918 "1: \n" 873 919 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. 874 920 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. … … 890 936 int width) { 891 937 asm volatile( 892 "1: 938 "1: \n" 893 939 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. 894 940 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. … … 912 958 asm volatile( 913 959 "add %1, %0, %1 \n" // stride + src_yuy2 914 "1: 960 "1: \n" 915 961 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. 916 962 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. … … 939 985 asm volatile( 940 986 "add %1, %0, %1 \n" // stride + src_uyvy 941 "1: 987 "1: \n" 942 988 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. 943 989 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. … … 966 1012 asm volatile( 967 1013 "vld1.8 {q2}, [%3] \n" // shuffler 968 "1: 1014 "1: \n" 969 1015 "vld1.8 {q0}, [%0]! \n" // load 4 pixels. 970 1016 "subs %2, %2, #4 \n" // 4 processed per loop … … 987 1033 int width) { 988 1034 asm volatile( 989 "1: 1035 "1: \n" 990 1036 "vld2.8 {d0, d2}, [%0]! 
\n" // load 16 Ys 991 1037 "vld1.8 {d1}, [%1]! \n" // load 8 Us … … 1009 1055 int width) { 1010 1056 asm volatile( 1011 "1: 1057 "1: \n" 1012 1058 "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys 1013 1059 "vld1.8 {d0}, [%1]! \n" // load 8 Us … … 1027 1073 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { 1028 1074 asm volatile( 1029 "1: 1075 "1: \n" 1030 1076 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. 1031 1077 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1046 1092 asm volatile( 1047 1093 "vdup.32 d2, %2 \n" // dither4 1048 "1: 1094 "1: \n" 1049 1095 "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. 1050 1096 "subs %3, %3, #8 \n" // 8 processed per loop. 1051 1097 "vqadd.u8 d20, d20, d2 \n" 1052 1098 "vqadd.u8 d21, d21, d2 \n" 1053 "vqadd.u8 d22, d22, d2 \n" ARGBTORGB565 1054 "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565. 1099 "vqadd.u8 d22, d22, d2 \n" // add for dither 1100 ARGBTORGB565 1101 "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. 1055 1102 "bgt 1b \n" 1056 1103 : "+r"(dst_rgb) // %0 … … 1065 1112 int width) { 1066 1113 asm volatile( 1067 "1: 1114 "1: \n" 1068 1115 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. 1069 1116 "subs %2, %2, #8 \n" // 8 processed per loop. 1070 1117 ARGBTOARGB1555 1071 "vst1.8 {q0}, [%1]! \n" // store 8 pixels 1072 // ARGB1555. 1118 "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. 1073 1119 "bgt 1b \n" 1074 1120 : "+r"(src_argb), // %0 … … 1085 1131 "vmov.u8 d4, #0x0f \n" // bits to clear with 1086 1132 // vbic. 1087 "1: 1133 "1: \n" 1088 1134 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. 1089 1135 "subs %2, %2, #8 \n" // 8 processed per loop. 1090 1136 ARGBTOARGB4444 1091 "vst1.8 {q0}, [%1]! \n" // store 8 pixels 1092 // ARGB4444. 1137 "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. 1093 1138 "bgt 1b \n" 1094 1139 : "+r"(src_argb), // %0 … … 1105 1150 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 1106 1151 "vmov.u8 d27, #16 \n" // Add 16 constant 1107 "1: 1152 "1: \n" 1108 1153 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 1109 1154 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1124 1169 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { 1125 1170 asm volatile( 1126 "1: 1171 "1: \n" 1127 1172 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels 1128 1173 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels … … 1143 1188 "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient 1144 1189 "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient 1145 "1: 1190 "1: \n" 1146 1191 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 1147 1192 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1172 1217 "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient 1173 1218 "vmov.u16 q15, #0x8080 \n" // 128.5 1174 "1: 1219 "1: \n" 1175 1220 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 1176 1221 "subs %3, %3, #8 \n" // 8 processed per loop. … … 1200 1245 } 1201 1246 1247 // clang-format off 1202 1248 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
1203 1249 #define RGBTOUV(QB, QG, QR) \ 1204 "vmul.s16 q8, " #QB \ 1205 ", q10 \n" /* B */ \ 1206 "vmls.s16 q8, " #QG \ 1207 ", q11 \n" /* G */ \ 1208 "vmls.s16 q8, " #QR \ 1209 ", q12 \n" /* R */ \ 1250 "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ 1251 "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ 1252 "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ 1210 1253 "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ 1211 "vmul.s16 q9, " #QR \ 1212 ", q10 \n" /* R */ \ 1213 "vmls.s16 q9, " #QG \ 1214 ", q14 \n" /* G */ \ 1215 "vmls.s16 q9, " #QB \ 1216 ", q13 \n" /* B */ \ 1254 "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ 1255 "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ 1256 "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ 1217 1257 "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ 1218 1258 "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ 1219 1259 "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ 1260 // clang-format on 1220 1261 1221 1262 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. … … 1233 1274 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1234 1275 "vmov.u16 q15, #0x8080 \n" // 128.5 1235 "1:\n"1276 "1: \n" 1236 1277 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 1237 1278 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. … … 1279 1320 "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient 1280 1321 "vmov.u16 q15, #0x8080 \n" // 128.5 1281 "1:\n"1322 "1: \n" 1282 1323 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 1283 1324 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. … … 1324 1365 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1325 1366 "vmov.u16 q15, #0x8080 \n" // 128.5 1326 "1:\n"1367 "1: \n" 1327 1368 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. 1328 1369 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. … … 1369 1410 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1370 1411 "vmov.u16 q15, #0x8080 \n" // 128.5 1371 "1:\n"1412 "1: \n" 1372 1413 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. 1373 1414 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. … … 1414 1455 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1415 1456 "vmov.u16 q15, #0x8080 \n" // 128.5 1416 "1:\n"1457 "1: \n" 1417 1458 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. 1418 1459 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. … … 1459 1500 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1460 1501 "vmov.u16 q15, #0x8080 \n" // 128.5 1461 "1:\n"1502 "1: \n" 1462 1503 "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. 1463 1504 "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. … … 1504 1545 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1505 1546 "vmov.u16 q15, #0x8080 \n" // 128.5 1506 "1:\n"1547 "1: \n" 1507 1548 "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. 1508 1549 "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. … … 1551 1592 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1552 1593 "vmov.u16 q15, #0x8080 \n" // 128.5 1553 "1: 1594 "1: \n" 1554 1595 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 1555 1596 RGB565TOARGB … … 1617 1658 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1618 1659 "vmov.u16 q15, #0x8080 \n" // 128.5 1619 "1: 1660 "1: \n" 1620 1661 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. 1621 1662 RGB555TOARGB … … 1683 1724 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1684 1725 "vmov.u16 q15, #0x8080 \n" // 128.5 1685 "1: 1726 "1: \n" 1686 1727 "vld1.8 {q0}, [%0]! 
\n" // load 8 ARGB4444 pixels. 1687 1728 ARGB4444TOARGB … … 1740 1781 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 1741 1782 "vmov.u8 d27, #16 \n" // Add 16 constant 1742 "1: 1783 "1: \n" 1743 1784 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 1744 1785 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1764 1805 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 1765 1806 "vmov.u8 d27, #16 \n" // Add 16 constant 1766 "1: 1807 "1: \n" 1767 1808 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. 1768 1809 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1788 1829 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 1789 1830 "vmov.u8 d27, #16 \n" // Add 16 constant 1790 "1: 1831 "1: \n" 1791 1832 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. 1792 1833 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1812 1853 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient 1813 1854 "vmov.u8 d7, #16 \n" // Add 16 constant 1814 "1: 1855 "1: \n" 1815 1856 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. 1816 1857 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1835 1876 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient 1836 1877 "vmov.u8 d7, #16 \n" // Add 16 constant 1837 "1: 1878 "1: \n" 1838 1879 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. 1839 1880 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1858 1899 "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient 1859 1900 "vmov.u8 d7, #16 \n" // Add 16 constant 1860 "1: 1901 "1: \n" 1861 1902 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. 1862 1903 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1881 1922 "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient 1882 1923 "vmov.u8 d7, #16 \n" // Add 16 constant 1883 "1: 1924 "1: \n" 1884 1925 "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. 1885 1926 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1904 1945 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient 1905 1946 "vmov.u8 d7, #16 \n" // Add 16 constant 1906 "1: 1947 "1: \n" 1907 1948 "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. 1908 1949 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1939 1980 "vdup.8 d4, %4 \n" 1940 1981 // General purpose row blend. 1941 "1: 1982 "1: \n" 1942 1983 "vld1.8 {q0}, [%1]! \n" 1943 1984 "vld1.8 {q1}, [%2]! \n" … … 1954 1995 1955 1996 // Blend 50 / 50. 1956 "50: 1997 "50: \n" 1957 1998 "vld1.8 {q0}, [%1]! \n" 1958 1999 "vld1.8 {q1}, [%2]! \n" … … 1964 2005 1965 2006 // Blend 100 / 0 - Copy row unchanged. 1966 "100: 2007 "100: \n" 1967 2008 "vld1.8 {q0}, [%1]! \n" 1968 2009 "subs %3, %3, #16 \n" … … 1970 2011 "bgt 100b \n" 1971 2012 1972 "99: 2013 "99: \n" 1973 2014 : "+r"(dst_ptr), // %0 1974 2015 "+r"(src_ptr), // %1 … … 1989 2030 "blt 89f \n" 1990 2031 // Blend 8 pixels. 1991 "8: 2032 "8: \n" 1992 2033 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. 1993 2034 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. … … 2007 2048 "bge 8b \n" 2008 2049 2009 "89: 2050 "89: \n" 2010 2051 "adds %3, #8-1 \n" 2011 2052 "blt 99f \n" 2012 2053 2013 2054 // Blend 1 pixels. 2014 "1: 2055 "1: \n" 2015 2056 "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. 2016 2057 "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. … … 2044 2085 asm volatile( 2045 2086 // Attenuate 8 pixels. 2046 "1: 2087 "1: \n" 2047 2088 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. 2048 2089 "subs %2, %2, #8 \n" // 8 processed per loop. … … 2076 2117 2077 2118 // 8 pixel loop. 
2078 "1: 2119 "1: \n" 2079 2120 "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. 2080 2121 "subs %1, %1, #8 \n" // 8 processed per loop. … … 2117 2158 2118 2159 // 8 pixel loop. 2119 "1: 2160 "1: \n" 2120 2161 "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. 2121 2162 "subs %2, %2, #8 \n" // 8 processed per loop. … … 2149 2190 "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient 2150 2191 "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient 2151 "1: 2192 "1: \n" 2152 2193 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 2153 2194 "subs %2, %2, #8 \n" // 8 processed per loop. … … 2182 2223 "vmov.u8 d29, #98 \n" // BG coefficient 2183 2224 "vmov.u8 d30, #50 \n" // BR coefficient 2184 "1: 2225 "1: \n" 2185 2226 "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. 2186 2227 "subs %1, %1, #8 \n" // 8 processed per loop. … … 2218 2259 "vmovl.s8 q1, d5 \n" // R,A coefficients s16. 2219 2260 2220 "1: 2261 "1: \n" 2221 2262 "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. 2222 2263 "subs %2, %2, #8 \n" // 8 processed per loop. … … 2274 2315 asm volatile( 2275 2316 // 8 pixel loop. 2276 "1: 2317 "1: \n" 2277 2318 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 2278 2319 "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB 2279 // pixels.2280 2320 "subs %3, %3, #8 \n" // 8 processed per loop. 2281 2321 "vmull.u8 q0, d0, d1 \n" // multiply B … … 2289 2329 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 2290 2330 "bgt 1b \n" 2291 2292 2331 : "+r"(src_argb0), // %0 2293 2332 "+r"(src_argb1), // %1 … … 2305 2344 asm volatile( 2306 2345 // 8 pixel loop. 2307 "1: 2346 "1: \n" 2308 2347 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 2309 2348 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB 2310 // pixels.2311 2349 "subs %3, %3, #8 \n" // 8 processed per loop. 2312 2350 "vqadd.u8 q0, q0, q2 \n" // add B, G … … 2314 2352 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 2315 2353 "bgt 1b \n" 2316 2317 2354 : "+r"(src_argb0), // %0 2318 2355 "+r"(src_argb1), // %1 … … 2330 2367 asm volatile( 2331 2368 // 8 pixel loop. 2332 "1: 2369 "1: \n" 2333 2370 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 2334 2371 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB 2335 // pixels.2336 2372 "subs %3, %3, #8 \n" // 8 processed per loop. 2337 2373 "vqsub.u8 q0, q0, q2 \n" // subtract B, G … … 2339 2375 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 2340 2376 "bgt 1b \n" 2341 2342 2377 : "+r"(src_argb0), // %0 2343 2378 "+r"(src_argb1), // %1 … … 2360 2395 "vmov.u8 d3, #255 \n" // alpha 2361 2396 // 8 pixel loop. 2362 "1: 2397 "1: \n" 2363 2398 "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. 2364 2399 "vld1.8 {d1}, [%1]! \n" // load 8 sobely. … … 2384 2419 asm volatile( 2385 2420 // 16 pixel loop. 2386 "1: 2421 "1: \n" 2387 2422 "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. 2388 2423 "vld1.8 {q1}, [%1]! \n" // load 16 sobely. … … 2411 2446 "vmov.u8 d3, #255 \n" // alpha 2412 2447 // 8 pixel loop. 2413 "1: 2448 "1: \n" 2414 2449 "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. 2415 2450 "vld1.8 {d0}, [%1]! \n" // load 8 sobely. 
… … 2436 2471 int width) { 2437 2472 asm volatile( 2438 "1: 2473 "1: \n" 2439 2474 "vld1.8 {d0}, [%0],%5 \n" // top 2440 2475 "vld1.8 {d1}, [%0],%6 \n" … … 2474 2509 int width) { 2475 2510 asm volatile( 2476 "1: 2511 "1: \n" 2477 2512 "vld1.8 {d0}, [%0],%4 \n" // left 2478 2513 "vld1.8 {d1}, [%1],%4 \n" … … 2506 2541 "vdup.32 q0, %3 \n" 2507 2542 2508 "1: 2543 "1: \n" 2509 2544 "vld1.8 {q1}, [%0]! \n" // load 8 shorts 2510 2545 "subs %2, %2, #8 \n" // 8 pixels per loop … … 2531 2566 "vdup.32 q0, %3 \n" 2532 2567 2533 "1: 2568 "1: \n" 2534 2569 "vld1.8 {q1}, [%0]! \n" // load 8 shorts 2535 2570 "subs %2, %2, #8 \n" // 8 pixels per loop -
pjproject/trunk/third_party/yuv/source/row_neon64.cc
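Besides the SplitRGBRow_NEON / MergeRGBRow_NEON additions mirrored from the 32-bit file, the diff below reworks CopyRow_NEON to move 32 bytes per iteration with ldp/stp of q registers. A minimal C sketch of what that loop does (illustrative only; as the original comment notes, count is a multiple of 32, and uint8_t stands in for libyuv's uint8 typedef):

#include <stdint.h>
#include <string.h>

static void CopyRow_sketch(const uint8_t* src, uint8_t* dst, int count) {
  int i;
  for (i = 0; i < count; i += 32) {
    memcpy(dst + i, src + i, 32);  /* one 32-byte block per iteration */
  }
}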
r5633 r5699 274 274 asm volatile( 275 275 YUVTORGB_SETUP 276 "1: 276 "1: \n" READYUV422 YUVTORGB( 277 277 v22, v21, 278 278 v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 … … 311 311 YUVTORGB_SETUP 312 312 "movi v23.8b, #255 \n" 313 "1: 313 "1: \n" READYUV422 YUVTORGB( 314 314 v22, v21, 315 315 v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 … … 396 396 asm volatile( 397 397 "movi v23.8b, #255 \n" 398 "1: 398 "1: \n" 399 399 "ld1 {v20.8b}, [%0], #8 \n" 400 400 "orr v21.8b, v20.8b, v20.8b \n" … … 471 471 asm volatile( 472 472 YUVTORGB_SETUP 473 "1: 473 "1: \n" READNV12 YUVTORGB( 474 474 v22, v21, 475 475 v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 … … 545 545 int width) { 546 546 asm volatile( 547 "1: 547 "1: \n" 548 548 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV 549 549 "subs %w3, %w3, #16 \n" // 16 processed per loop … … 566 566 int width) { 567 567 asm volatile( 568 "1: 568 "1: \n" 569 569 "ld1 {v0.16b}, [%0], #16 \n" // load U 570 570 "ld1 {v1.16b}, [%1], #16 \n" // load V … … 581 581 } 582 582 583 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 583 // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. 584 void SplitRGBRow_NEON(const uint8* src_rgb, 585 uint8* dst_r, 586 uint8* dst_g, 587 uint8* dst_b, 588 int width) { 589 asm volatile( 590 "1: \n" 591 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB 592 "subs %w4, %w4, #16 \n" // 16 processed per loop 593 "st1 {v0.16b}, [%1], #16 \n" // store R 594 "st1 {v1.16b}, [%2], #16 \n" // store G 595 "st1 {v2.16b}, [%3], #16 \n" // store B 596 "b.gt 1b \n" 597 : "+r"(src_rgb), // %0 598 "+r"(dst_r), // %1 599 "+r"(dst_g), // %2 600 "+r"(dst_b), // %3 601 "+r"(width) // %4 602 : // Input registers 603 : "cc", "memory", "v0", "v1", "v2" // Clobber List 604 ); 605 } 606 607 // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time 608 void MergeRGBRow_NEON(const uint8* src_r, 609 const uint8* src_g, 610 const uint8* src_b, 611 uint8* dst_rgb, 612 int width) { 613 asm volatile( 614 "1: \n" 615 "ld1 {v0.16b}, [%0], #16 \n" // load R 616 "ld1 {v1.16b}, [%1], #16 \n" // load G 617 "ld1 {v2.16b}, [%2], #16 \n" // load B 618 "subs %w4, %w4, #16 \n" // 16 processed per loop 619 "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB 620 "b.gt 1b \n" 621 : "+r"(src_r), // %0 622 "+r"(src_g), // %1 623 "+r"(src_b), // %2 624 "+r"(dst_rgb), // %3 625 "+r"(width) // %4 626 : // Input registers 627 : "cc", "memory", "v0", "v1", "v2" // Clobber List 628 ); 629 } 630 631 // Copy multiple of 32. 
584 632 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { 585 633 asm volatile( 586 "1: 587 "ld 1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32634 "1: \n" 635 "ldp q0, q1, [%0], #32 \n" 588 636 "subs %w2, %w2, #32 \n" // 32 processed per loop 589 "st 1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32590 "b.gt 1b \n" 591 : "+r"(src), 592 "+r"(dst), 593 "+r"(count) 594 : 595 : "cc", "memory", "v0", "v1" , "v2", "v3"// Clobber List637 "stp q0, q1, [%1], #32 \n" 638 "b.gt 1b \n" 639 : "+r"(src), // %0 640 "+r"(dst), // %1 641 "+r"(count) // %2 // Output registers 642 : // Input registers 643 : "cc", "memory", "v0", "v1" // Clobber List 596 644 ); 597 645 } … … 601 649 asm volatile( 602 650 "dup v0.16b, %w2 \n" // duplicate 16 bytes 603 "1: 651 "1: \n" 604 652 "subs %w1, %w1, #16 \n" // 16 bytes per loop 605 653 "st1 {v0.16b}, [%0], #16 \n" // store … … 614 662 asm volatile( 615 663 "dup v0.4s, %w2 \n" // duplicate 4 ints 616 "1: 664 "1: \n" 617 665 "subs %w1, %w1, #4 \n" // 4 ints per loop 618 666 "st1 {v0.16b}, [%0], #16 \n" // store … … 629 677 "add %0, %0, %w2, sxtw \n" 630 678 "sub %0, %0, #16 \n" 631 "1: 679 "1: \n" 632 680 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 633 681 "subs %w2, %w2, #16 \n" // 16 pixels per loop. … … 651 699 "add %0, %0, %w3, sxtw #1 \n" 652 700 "sub %0, %0, #16 \n" 653 "1: 701 "1: \n" 654 702 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 655 703 "subs %w3, %w3, #8 \n" // 8 pixels per loop. … … 672 720 "add %0, %0, %w2, sxtw #2 \n" 673 721 "sub %0, %0, #16 \n" 674 "1: 722 "1: \n" 675 723 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 676 724 "subs %w2, %w2, #4 \n" // 4 pixels per loop. … … 689 737 asm volatile( 690 738 "movi v4.8b, #255 \n" // Alpha 691 "1: 739 "1: \n" 692 740 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. 693 741 "subs %w2, %w2, #8 \n" // 8 processed per loop. 694 742 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB 695 // pixels696 743 "b.gt 1b \n" 697 744 : "+r"(src_rgb24), // %0 … … 706 753 asm volatile( 707 754 "movi v5.8b, #255 \n" // Alpha 708 "1: 755 "1: \n" 709 756 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 710 757 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 723 770 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { 724 771 asm volatile( 725 "1: 772 "1: \n" 726 773 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 727 774 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 754 801 asm volatile( 755 802 "movi v3.8b, #255 \n" // Alpha 756 "1: 803 "1: \n" 757 804 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 758 805 "subs %w2, %w2, #8 \n" // 8 processed per loop. 759 806 RGB565TOARGB 760 807 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB 761 // pixels762 808 "b.gt 1b \n" 763 809 : "+r"(src_rgb565), // %0 … … 811 857 asm volatile( 812 858 "movi v3.8b, #255 \n" // Alpha 813 "1: 859 "1: \n" 814 860 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 815 861 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 842 888 int width) { 843 889 asm volatile
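For reference, the SplitRGBRow_NEON / MergeRGBRow_NEON kernels added by this changeset deinterleave packed RGB into planes and back, 16 pixels per iteration. A minimal portable sketch of the same per-pixel behaviour (illustrative only; uint8_t stands in for libyuv's uint8 typedef and the 16-pixel blocking is ignored):

#include <stdint.h>

/* Scalar counterpart of SplitRGBRow_NEON: split packed RGB into planes. */
static void SplitRGBRow_sketch(const uint8_t* src_rgb, uint8_t* dst_r,
                               uint8_t* dst_g, uint8_t* dst_b, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_r[x] = src_rgb[3 * x + 0];
    dst_g[x] = src_rgb[3 * x + 1];
    dst_b[x] = src_rgb[3 * x + 2];
  }
}

/* Scalar counterpart of MergeRGBRow_NEON: interleave planes back into RGB. */
static void MergeRGBRow_sketch(const uint8_t* src_r, const uint8_t* src_g,
                               const uint8_t* src_b, uint8_t* dst_rgb,
                               int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_rgb[3 * x + 0] = src_r[x];
    dst_rgb[3 * x + 1] = src_g[x];
    dst_rgb[3 * x + 2] = src_b[x];
  }
}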