Changeset 5699
- Timestamp: Nov 21, 2017 9:25:11 AM
- Location: pjproject/trunk/third_party
- Files: 39 edited
pjproject/trunk/third_party/build/yuv/Notes.txt
r5633 -> r5699

 Notes:

-* Source code for libyuv from https://chromium.googlesource.com/libyuv/libyuv/ dated 27 July 2017.
+* Source code for libyuv from https://chromium.googlesource.com/libyuv/libyuv/ dated 17 November 2017.

-* All code is compilable, except for compare_win.cc
-  - Use older version (https://chromium.googlesource.com/libyuv/libyuv/+/baf6a3c1bd385e7ffe6b7634560e71fb49e4f589%5E%21/)
-    Since there's a compiler error on:
-    ----------------------------------------
-    pmulld xmm0,xmm6
-    ----------------------------------------
-
-  - On VS2015, error C2024: 'alignas' attribute applies to variables, data members and tag types only
-    ----------------------------------------
-    __declspec(naked) __declspec(align(16))
-
-    Change to:
-
-    __declspec(naked)
-    ----------------------------------------
-
-* Added these lines to file include/libyuv/basic_types.h:
-  --
-  #if _MSC_VER==1400
-  # include <stdint.h>  // for uint8_t
-  #endif
-  ...
-  #if defined(_MSC_VER)
-  # pragma warning(disable:4996)  // This function or variable may be unsafe.
-  #endif
-  --
pjproject/trunk/third_party/yuv/include/libyuv/basic_types.h
r5633 -> r5699

 #if defined(_MSC_VER) && (_MSC_VER < 1600)
-#if _MSC_VER==1400
-# include <stdint.h>  // for uint8_t
-#endif
 #include <sys/types.h>  // for uintptr_t on x86
 #else
 #include <stdint.h>  // for uintptr_t
-#endif
-
-#if defined(_MSC_VER)
-# pragma warning(disable:4996)  // This function or variable may be unsafe.
 #endif
pjproject/trunk/third_party/yuv/include/libyuv/compare_row.h
r5633 r5699 20 20 21 21 #if defined(__pnacl__) || defined(__CLR_VER) || \ 22 (defined(__i386__) && !defined(__SSE 2__))22 (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) 23 23 #define LIBYUV_DISABLE_X86 24 24 #endif … … 43 43 #endif // __clang__ 44 44 45 // The following are available for Visual C: 45 46 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ 46 47 (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) … … 53 54 #define HAS_HASHDJB2_SSE41 54 55 #define HAS_SUMSQUAREERROR_SSE2 55 #define HAS_HAMMINGDISTANCE_ X8656 #define HAS_HAMMINGDISTANCE_SSE42 56 57 #endif 57 58 … … 63 64 #endif 64 65 66 // The following are available for GCC and clangcl 64 bit: 67 #if !defined(LIBYUV_DISABLE_X86) && \ 68 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) 69 #define HAS_HAMMINGDISTANCE_SSSE3 70 #endif 71 72 // The following are available for GCC and clangcl 64 bit: 73 #if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \ 74 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) 75 #define HAS_HAMMINGDISTANCE_AVX2 76 #endif 77 65 78 // The following are available for Neon: 66 79 #if !defined(LIBYUV_DISABLE_NEON) && \ … … 70 83 #endif 71 84 85 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 86 #define HAS_HAMMINGDISTANCE_MSA 87 #define HAS_SUMSQUAREERROR_MSA 88 #endif 89 72 90 uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count); 73 uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count); 91 uint32 HammingDistance_SSE42(const uint8* src_a, const uint8* src_b, int count); 92 uint32 HammingDistance_SSSE3(const uint8* src_a, const uint8* src_b, int count); 93 uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count); 74 94 uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count); 95 uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count); 75 96 76 97 uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); … … 78 99 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); 79 100 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); 101 uint32 SumSquareError_MSA(const uint8* src_a, const uint8* src_b, int count); 80 102 81 103 uint32 HashDjb2_C(const uint8* src, int count, uint32 seed); -
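Note on the compare_row.h hunk above: the single HammingDistance_X86 entry point is replaced by per-ISA kernels (SSE42, SSSE3, AVX2, NEON, MSA), each gated by a HAS_HAMMINGDISTANCE_* macro. As a rough sketch of how such kernels are selected at run time with TestCpuFlag (the helper name below is illustrative; the real selection lives in source/compare.cc, shown further down):

    #include "libyuv/compare_row.h"
    #include "libyuv/cpu_id.h"

    typedef uint32 (*HammingDistanceRow)(const uint8* src_a,
                                         const uint8* src_b, int count);

    // Pick the widest kernel the CPU supports, falling back to portable C.
    static HammingDistanceRow PickHammingDistance() {
      HammingDistanceRow fn = HammingDistance_C;
    #if defined(HAS_HAMMINGDISTANCE_SSSE3)
      if (TestCpuFlag(kCpuHasSSSE3)) fn = HammingDistance_SSSE3;
    #endif
    #if defined(HAS_HAMMINGDISTANCE_SSE42)
      if (TestCpuFlag(kCpuHasSSE42)) fn = HammingDistance_SSE42;
    #endif
    #if defined(HAS_HAMMINGDISTANCE_AVX2)
      if (TestCpuFlag(kCpuHasAVX2)) fn = HammingDistance_AVX2;
    #endif
    #if defined(HAS_HAMMINGDISTANCE_NEON)
      if (TestCpuFlag(kCpuHasNEON)) fn = HammingDistance_NEON;
    #endif
      return fn;
    }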
pjproject/trunk/third_party/yuv/include/libyuv/convert_from.h
r5633 r5699 179 179 LIBYUV_API 180 180 int I420ToRAW(const uint8* src_y, 181 int src_stride_y, 182 const uint8* src_u, 183 int src_stride_u, 184 const uint8* src_v, 185 int src_stride_v, 186 uint8* dst_frame, 187 int dst_stride_frame, 188 int width, 189 int height); 190 191 LIBYUV_API 192 int H420ToRGB24(const uint8* src_y, 193 int src_stride_y, 194 const uint8* src_u, 195 int src_stride_u, 196 const uint8* src_v, 197 int src_stride_v, 198 uint8* dst_frame, 199 int dst_stride_frame, 200 int width, 201 int height); 202 203 LIBYUV_API 204 int H420ToRAW(const uint8* src_y, 181 205 int src_stride_y, 182 206 const uint8* src_u, -
pjproject/trunk/third_party/yuv/include/libyuv/cpu_id.h
r5633 r5699 37 37 static const int kCpuHasERMS = 0x800; 38 38 static const int kCpuHasFMA3 = 0x1000; 39 static const int kCpuHasAVX3 = 0x2000; 40 static const int kCpuHasF16C = 0x4000; 41 42 // 0x8000 reserved for future X86 flags. 39 static const int kCpuHasF16C = 0x2000; 40 static const int kCpuHasGFNI = 0x4000; 41 static const int kCpuHasAVX512BW = 0x8000; 42 static const int kCpuHasAVX512VL = 0x10000; 43 static const int kCpuHasAVX512VBMI = 0x20000; 44 static const int kCpuHasAVX512VBMI2 = 0x40000; 45 static const int kCpuHasAVX512VBITALG = 0x80000; 46 static const int kCpuHasAVX512VPOPCNTDQ = 0x100000; 43 47 44 48 // These flags are only valid on MIPS processors. 45 static const int kCpuHasMIPS = 0x 10000;46 static const int kCpuHasDSPR2 = 0x 20000;47 static const int kCpuHasMSA = 0x 40000;49 static const int kCpuHasMIPS = 0x200000; 50 static const int kCpuHasDSPR2 = 0x400000; 51 static const int kCpuHasMSA = 0x800000; 48 52 49 53 // Optional init function. TestCpuFlag does an auto-init. -
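The cpu_id.h hunk drops the catch-all kCpuHasAVX3 bit and renumbers the flag space: F16C moves to 0x2000, GFNI and six individual AVX-512 feature bits (BW, VL, VBMI, VBMI2, VBITALG, VPOPCNTDQ) take 0x4000 through 0x100000, and the MIPS flags shift up accordingly. Each constant is a single bit in the value managed by TestCpuFlag, so features can be tested independently; a minimal sketch (the helper name is made up):

    #include "libyuv/cpu_id.h"

    // True when both AVX-512 BW and VL were detected (and the OS saves the
    // corresponding register state; see the cpu_id.cc hunk below).
    bool HasAvx512ByteAndVectorLength() {
      return TestCpuFlag(kCpuHasAVX512BW) && TestCpuFlag(kCpuHasAVX512VL);
    }

The cpu_id.cc hunk further down adds a matching LIBYUV_DISABLE_AVX512BW environment override and removes LIBYUV_DISABLE_AVX3.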
pjproject/trunk/third_party/yuv/include/libyuv/planar_functions.h
r5633 r5699 69 69 int width, 70 70 int height); 71 72 // Split interleaved RGB plane into separate R, G and B planes. 73 LIBYUV_API 74 void SplitRGBPlane(const uint8* src_rgb, 75 int src_stride_rgb, 76 uint8* dst_r, 77 int dst_stride_r, 78 uint8* dst_g, 79 int dst_stride_g, 80 uint8* dst_b, 81 int dst_stride_b, 82 int width, 83 int height); 84 85 // Merge separate R, G and B planes into one interleaved RGB plane. 86 LIBYUV_API 87 void MergeRGBPlane(const uint8* src_r, 88 int src_stride_r, 89 const uint8* src_g, 90 int src_stride_g, 91 const uint8* src_b, 92 int src_stride_b, 93 uint8* dst_rgb, 94 int dst_stride_rgb, 95 int width, 96 int height); 71 97 72 98 // Copy I400. Supports inverting. … … 721 747 722 748 #if defined(__pnacl__) || defined(__CLR_VER) || \ 723 (defined(__i386__) && !defined(__SSE 2__))749 (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) 724 750 #define LIBYUV_DISABLE_X86 725 751 #endif -
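planar_functions.h gains SplitRGBPlane and MergeRGBPlane for converting between an interleaved RGB24 buffer and separate R, G and B planes (implemented in source/planar_functions.cc below). A hedged usage sketch; the wrapper and buffer handling are illustrative only:

    #include "libyuv/planar_functions.h"

    // Split a packed RGB24 image into three caller-provided planes and
    // merge them back. Planes here are tightly packed (stride == width).
    void RoundTripRGB(const uint8* rgb, int rgb_stride,
                      uint8* plane_r, uint8* plane_g, uint8* plane_b,
                      uint8* rgb_out, int width, int height) {
      SplitRGBPlane(rgb, rgb_stride,
                    plane_r, width, plane_g, width, plane_b, width,
                    width, height);
      // ...filter or analyze the planes independently here...
      MergeRGBPlane(plane_r, width, plane_g, width, plane_b, width,
                    rgb_out, rgb_stride, width, height);
    }

As with the rest of the planar API, a negative height can be passed to flip the image vertically.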
pjproject/trunk/third_party/yuv/include/libyuv/rotate_row.h
r5633 r5699 20 20 21 21 #if defined(__pnacl__) || defined(__CLR_VER) || \ 22 (defined(__i386__) && !defined(__SSE 2__))22 (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) 23 23 #define LIBYUV_DISABLE_X86 24 24 #endif … … 30 30 #endif 31 31 // The following are available for Visual C and clangcl 32 bit: 32 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) 32 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 33 33 #define HAS_TRANSPOSEWX8_SSSE3 34 34 #define HAS_TRANSPOSEUVWX8_SSE2 -
pjproject/trunk/third_party/yuv/include/libyuv/row.h
r5633 r5699 32 32 33 33 #if defined(__pnacl__) || defined(__CLR_VER) || \ 34 (defined(__i386__) && !defined(__SSE 2__))34 (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) 35 35 #define LIBYUV_DISABLE_X86 36 36 #endif … … 265 265 #endif 266 266 267 // The following are available for gcc/clang x86 platforms: 268 // TODO(fbarchard): Port to Visual C 269 #if !defined(LIBYUV_DISABLE_X86) && \ 270 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) 271 #define HAS_MERGERGBROW_SSSE3 272 #define HAS_SPLITRGBROW_SSSE3 273 #endif 274 275 // The following are available for AVX2 gcc/clang x86 platforms: 276 // TODO(fbarchard): Port to Visual C 277 #if !defined(LIBYUV_DISABLE_X86) && \ 278 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ 279 (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) 280 #define HAS_MERGEUVROW_16_AVX2 281 #define HAS_MULTIPLYROW_16_AVX2 282 #endif 283 267 284 // The following are available on Neon platforms: 268 285 #if !defined(LIBYUV_DISABLE_NEON) && \ … … 324 341 #define HAS_RGBATOYROW_NEON 325 342 #define HAS_SETROW_NEON 343 #define HAS_SPLITRGBROW_NEON 326 344 #define HAS_SPLITUVROW_NEON 327 345 #define HAS_UYVYTOARGBROW_NEON … … 353 371 #define HAS_SOBELXYROW_NEON 354 372 #define HAS_SOBELYROW_NEON 373 #endif 374 375 // The following are available on AArch64 platforms: 376 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 377 #define HAS_SCALESUMSAMPLES_NEON 355 378 #endif 356 379 … … 386 409 387 410 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 411 #define HAS_ABGRTOUVROW_MSA 412 #define HAS_ABGRTOYROW_MSA 413 #define HAS_ARGB1555TOARGBROW_MSA 414 #define HAS_ARGB1555TOUVROW_MSA 415 #define HAS_ARGB1555TOYROW_MSA 416 #define HAS_ARGB4444TOARGBROW_MSA 417 #define HAS_ARGBADDROW_MSA 418 #define HAS_ARGBATTENUATEROW_MSA 419 #define HAS_ARGBBLENDROW_MSA 420 #define HAS_ARGBCOLORMATRIXROW_MSA 421 #define HAS_ARGBEXTRACTALPHAROW_MSA 422 #define HAS_ARGBGRAYROW_MSA 388 423 #define HAS_ARGBMIRRORROW_MSA 424 #define HAS_ARGBMULTIPLYROW_MSA 425 #define HAS_ARGBQUANTIZEROW_MSA 426 #define HAS_ARGBSEPIAROW_MSA 427 #define HAS_ARGBSETROW_MSA 428 #define HAS_ARGBSHADEROW_MSA 429 #define HAS_ARGBSHUFFLEROW_MSA 430 #define HAS_ARGBSUBTRACTROW_MSA 431 #define HAS_ARGBTOARGB1555ROW_MSA 432 #define HAS_ARGBTOARGB4444ROW_MSA 433 #define HAS_ARGBTORAWROW_MSA 434 #define HAS_ARGBTORGB24ROW_MSA 435 #define HAS_ARGBTORGB565DITHERROW_MSA 436 #define HAS_ARGBTORGB565ROW_MSA 437 #define HAS_ARGBTOUV444ROW_MSA 438 #define HAS_ARGBTOUVJROW_MSA 439 #define HAS_ARGBTOUVROW_MSA 440 #define HAS_ARGBTOYJROW_MSA 441 #define HAS_ARGBTOYROW_MSA 442 #define HAS_BGRATOUVROW_MSA 443 #define HAS_BGRATOYROW_MSA 444 #define HAS_HALFFLOATROW_MSA 445 #define HAS_I400TOARGBROW_MSA 446 #define HAS_I422ALPHATOARGBROW_MSA 447 #define HAS_I422TOARGBROW_MSA 448 #define HAS_I422TORGB24ROW_MSA 449 #define HAS_I422TORGBAROW_MSA 389 450 #define HAS_I422TOUYVYROW_MSA 390 451 #define HAS_I422TOYUY2ROW_MSA 452 #define HAS_I444TOARGBROW_MSA 453 #define HAS_INTERPOLATEROW_MSA 454 #define HAS_J400TOARGBROW_MSA 455 #define HAS_MERGEUVROW_MSA 391 456 #define HAS_MIRRORROW_MSA 457 #define HAS_MIRRORUVROW_MSA 458 #define HAS_NV12TOARGBROW_MSA 459 #define HAS_NV12TORGB565ROW_MSA 460 #define HAS_NV21TOARGBROW_MSA 461 #define HAS_RAWTOARGBROW_MSA 462 #define HAS_RAWTORGB24ROW_MSA 463 #define HAS_RAWTOUVROW_MSA 464 #define HAS_RAWTOYROW_MSA 465 #define HAS_RGB24TOARGBROW_MSA 466 #define HAS_RGB24TOUVROW_MSA 467 #define HAS_RGB24TOYROW_MSA 468 #define 
HAS_RGB565TOARGBROW_MSA 469 #define HAS_RGB565TOUVROW_MSA 470 #define HAS_RGB565TOYROW_MSA 471 #define HAS_RGBATOUVROW_MSA 472 #define HAS_RGBATOYROW_MSA 473 #define HAS_SETROW_MSA 474 #define HAS_SOBELROW_MSA 475 #define HAS_SOBELTOPLANEROW_MSA 476 #define HAS_SOBELXROW_MSA 477 #define HAS_SOBELXYROW_MSA 478 #define HAS_SOBELYROW_MSA 479 #define HAS_SPLITUVROW_MSA 480 #define HAS_UYVYTOARGBROW_MSA 392 481 #define HAS_UYVYTOUVROW_MSA 393 482 #define HAS_UYVYTOYROW_MSA 483 #define HAS_YUY2TOARGBROW_MSA 394 484 #define HAS_YUY2TOUV422ROW_MSA 395 485 #define HAS_YUY2TOUVROW_MSA 396 486 #define HAS_YUY2TOYROW_MSA 397 #define HAS_ARGB4444TOARGBROW_MSA398 #define HAS_ARGBTOYROW_MSA399 #define HAS_ARGBTOUVROW_MSA400 #define HAS_I422TOARGBROW_MSA401 #define HAS_I422TORGBAROW_MSA402 #define HAS_I422ALPHATOARGBROW_MSA403 #define HAS_I422TORGB24ROW_MSA404 #define HAS_ARGBTORGB24ROW_MSA405 #define HAS_ARGBTORAWROW_MSA406 #define HAS_ARGBTORGB565ROW_MSA407 #define HAS_ARGBTOARGB1555ROW_MSA408 #define HAS_ARGBTOARGB4444ROW_MSA409 #define HAS_ARGBTOUV444ROW_MSA410 #define HAS_ARGBMULTIPLYROW_MSA411 #define HAS_ARGBADDROW_MSA412 #define HAS_ARGBSUBTRACTROW_MSA413 #define HAS_ARGBATTENUATEROW_MSA414 #define HAS_ARGBTORGB565DITHERROW_MSA415 #define HAS_ARGBSHUFFLEROW_MSA416 #define HAS_ARGBSHADEROW_MSA417 #define HAS_ARGBGRAYROW_MSA418 #define HAS_ARGBSEPIAROW_MSA419 #define HAS_ARGB1555TOARGBROW_MSA420 #define HAS_RGB565TOARGBROW_MSA421 #define HAS_RGB24TOARGBROW_MSA422 #define HAS_RAWTOARGBROW_MSA423 #define HAS_ARGB1555TOYROW_MSA424 #define HAS_RGB565TOYROW_MSA425 #define HAS_RGB24TOYROW_MSA426 #define HAS_RAWTOYROW_MSA427 #define HAS_ARGB1555TOUVROW_MSA428 #define HAS_RGB565TOUVROW_MSA429 #define HAS_RGB24TOUVROW_MSA430 #define HAS_RAWTOUVROW_MSA431 #define HAS_NV12TOARGBROW_MSA432 #define HAS_NV12TORGB565ROW_MSA433 #define HAS_NV21TOARGBROW_MSA434 #define HAS_SOBELROW_MSA435 #define HAS_SOBELTOPLANEROW_MSA436 #define HAS_SOBELXYROW_MSA437 #define HAS_ARGBTOYJROW_MSA438 #define HAS_BGRATOYROW_MSA439 #define HAS_ABGRTOYROW_MSA440 #define HAS_RGBATOYROW_MSA441 #define HAS_ARGBTOUVJROW_MSA442 #define HAS_BGRATOUVROW_MSA443 #define HAS_ABGRTOUVROW_MSA444 #define HAS_RGBATOUVROW_MSA445 #define HAS_I444TOARGBROW_MSA446 #define HAS_I400TOARGBROW_MSA447 #define HAS_J400TOARGBROW_MSA448 #define HAS_YUY2TOARGBROW_MSA449 #define HAS_UYVYTOARGBROW_MSA450 #define HAS_INTERPOLATEROW_MSA451 #define HAS_ARGBSETROW_MSA452 #define HAS_RAWTORGB24ROW_MSA453 #define HAS_MERGEUVROW_MSA454 487 #endif 455 488 … … 1346 1379 uint8* dst_v, 1347 1380 int width); 1381 void MirrorUVRow_MSA(const uint8* src_uv, 1382 uint8* dst_u, 1383 uint8* dst_v, 1384 int width); 1348 1385 void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); 1349 1386 … … 1375 1412 uint8* dst_v, 1376 1413 int width); 1414 void SplitUVRow_MSA(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); 1377 1415 void SplitUVRow_Any_SSE2(const uint8* src_uv, 1378 1416 uint8* dst_u, … … 1391 1429 uint8* dst_v, 1392 1430 int width); 1431 void SplitUVRow_Any_MSA(const uint8* src_uv, 1432 uint8* dst_u, 1433 uint8* dst_v, 1434 int width); 1393 1435 1394 1436 void MergeUVRow_C(const uint8* src_u, … … 1429 1471 int width); 1430 1472 1473 void SplitRGBRow_C(const uint8* src_rgb, 1474 uint8* dst_r, 1475 uint8* dst_g, 1476 uint8* dst_b, 1477 int width); 1478 void SplitRGBRow_SSSE3(const uint8* src_rgb, 1479 uint8* dst_r, 1480 uint8* dst_g, 1481 uint8* dst_b, 1482 int width); 1483 void SplitRGBRow_NEON(const uint8* src_rgb, 1484 uint8* dst_r, 1485 
uint8* dst_g, 1486 uint8* dst_b, 1487 int width); 1488 void SplitRGBRow_Any_SSSE3(const uint8* src_rgb, 1489 uint8* dst_r, 1490 uint8* dst_g, 1491 uint8* dst_b, 1492 int width); 1493 void SplitRGBRow_Any_NEON(const uint8* src_rgb, 1494 uint8* dst_r, 1495 uint8* dst_g, 1496 uint8* dst_b, 1497 int width); 1498 1499 void MergeRGBRow_C(const uint8* src_r, 1500 const uint8* src_g, 1501 const uint8* src_b, 1502 uint8* dst_rgb, 1503 int width); 1504 void MergeRGBRow_SSSE3(const uint8* src_r, 1505 const uint8* src_g, 1506 const uint8* src_b, 1507 uint8* dst_rgb, 1508 int width); 1509 void MergeRGBRow_NEON(const uint8* src_r, 1510 const uint8* src_g, 1511 const uint8* src_b, 1512 uint8* dst_rgb, 1513 int width); 1514 void MergeRGBRow_Any_SSSE3(const uint8* src_r, 1515 const uint8* src_g, 1516 const uint8* src_b, 1517 uint8* dst_rgb, 1518 int width); 1519 void MergeRGBRow_Any_NEON(const uint8* src_r, 1520 const uint8* src_g, 1521 const uint8* src_b, 1522 uint8* dst_rgb, 1523 int width); 1524 1525 void MergeUVRow_16_C(const uint16* src_u, 1526 const uint16* src_v, 1527 uint16* dst_uv, 1528 int scale, /* 64 for 10 bit */ 1529 int width); 1530 void MergeUVRow_16_AVX2(const uint16* src_u, 1531 const uint16* src_v, 1532 uint16* dst_uv, 1533 int scale, 1534 int width); 1535 1536 void MultiplyRow_16_AVX2(const uint16* src_y, 1537 uint16* dst_y, 1538 int scale, 1539 int width); 1540 void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width); 1541 1431 1542 void CopyRow_SSE2(const uint8* src, uint8* dst, int count); 1432 1543 void CopyRow_AVX(const uint8* src, uint8* dst, int count); … … 1455 1566 void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width); 1456 1567 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width); 1568 void ARGBExtractAlphaRow_MSA(const uint8* src_argb, uint8* dst_a, int width); 1457 1569 void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, 1458 1570 uint8* dst_a, … … 1464 1576 uint8* dst_a, 1465 1577 int width); 1578 void ARGBExtractAlphaRow_Any_MSA(const uint8* src_argb, 1579 uint8* dst_a, 1580 int width); 1466 1581 1467 1582 void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); … … 1476 1591 1477 1592 void SetRow_C(uint8* dst, uint8 v8, int count); 1593 void SetRow_MSA(uint8* dst, uint8 v8, int count); 1478 1594 void SetRow_X86(uint8* dst, uint8 v8, int count); 1479 1595 void SetRow_ERMS(uint8* dst, uint8 v8, int count); … … 2123 2239 uint8* dst_argb, 2124 2240 int width); 2241 void ARGBBlendRow_MSA(const uint8* src_argb, 2242 const uint8* src_argb1, 2243 uint8* dst_argb, 2244 int width); 2125 2245 void ARGBBlendRow_C(const uint8* src_argb, 2126 2246 const uint8* src_argb1, … … 2836 2956 const int8* matrix_argb, 2837 2957 int width); 2958 void ARGBColorMatrixRow_MSA(const uint8* src_argb, 2959 uint8* dst_argb, 2960 const int8* matrix_argb, 2961 int width); 2838 2962 2839 2963 void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); … … 2858 2982 int interval_offset, 2859 2983 int width); 2984 void ARGBQuantizeRow_MSA(uint8* dst_argb, 2985 int scale, 2986 int interval_size, 2987 int interval_offset, 2988 int width); 2860 2989 2861 2990 void ARGBShadeRow_C(const uint8* src_argb, … … 2991 3120 uint8* dst_sobelx, 2992 3121 int width); 3122 void SobelXRow_MSA(const uint8* src_y0, 3123 const uint8* src_y1, 3124 const uint8* src_y2, 3125 uint8* dst_sobelx, 3126 int width); 2993 3127 void SobelYRow_C(const uint8* src_y0, 2994 3128 const uint8* src_y1, … … 3003 3137 uint8* 
dst_sobely, 3004 3138 int width); 3139 void SobelYRow_MSA(const uint8* src_y0, 3140 const uint8* src_y1, 3141 uint8* dst_sobely, 3142 int width); 3005 3143 void SobelRow_C(const uint8* src_sobelx, 3006 3144 const uint8* src_sobely, … … 3133 3271 float scale, 3134 3272 int width); 3273 void HalfFloatRow_MSA(const uint16* src, uint16* dst, float scale, int width); 3274 void HalfFloatRow_Any_MSA(const uint16* src, 3275 uint16* dst, 3276 float scale, 3277 int width); 3135 3278 3136 3279 void ARGBLumaColorTableRow_C(const uint8* src_argb, … … 3145 3288 uint32 lumacoeff); 3146 3289 3290 float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width); 3291 float ScaleMaxSamples_NEON(const float* src, 3292 float* dst, 3293 float scale, 3294 int width); 3295 float ScaleSumSamples_C(const float* src, float* dst, float scale, int width); 3296 float ScaleSumSamples_NEON(const float* src, 3297 float* dst, 3298 float scale, 3299 int width); 3300 void ScaleSamples_C(const float* src, float* dst, float scale, int width); 3301 void ScaleSamples_NEON(const float* src, float* dst, float scale, int width); 3302 3147 3303 #ifdef __cplusplus 3148 3304 } // extern "C" -
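The row.h additions above declare the per-row workers behind those planar helpers (SplitRGBRow_*, MergeRGBRow_*, with SSSE3, NEON and Any variants) as well as 16-bit MergeUVRow_16 and MultiplyRow_16 kernels. For orientation, a portable reference for what the RGB row pair computes; this is a sketch written for this note, not code copied from row_common.cc, so the exact channel ordering should be checked against the library:

    #include "libyuv/basic_types.h"

    // De-interleave one row of packed RGB24 into three planes.
    void SplitRGBRow_Reference(const uint8* src_rgb, uint8* dst_r,
                               uint8* dst_g, uint8* dst_b, int width) {
      for (int x = 0; x < width; ++x) {
        dst_r[x] = src_rgb[0];
        dst_g[x] = src_rgb[1];
        dst_b[x] = src_rgb[2];
        src_rgb += 3;
      }
    }

    // Interleave three planes back into one packed RGB24 row.
    void MergeRGBRow_Reference(const uint8* src_r, const uint8* src_g,
                               const uint8* src_b, uint8* dst_rgb, int width) {
      for (int x = 0; x < width; ++x) {
        dst_rgb[0] = src_r[x];
        dst_rgb[1] = src_g[x];
        dst_rgb[2] = src_b[x];
        dst_rgb += 3;
      }
    }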
pjproject/trunk/third_party/yuv/include/libyuv/scale_row.h
r5633 r5699 21 21 22 22 #if defined(__pnacl__) || defined(__CLR_VER) || \ 23 (defined(__i386__) && !defined(__SSE 2__))23 (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) 24 24 #define LIBYUV_DISABLE_X86 25 25 #endif … … 106 106 107 107 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 108 #define HAS_SCALEADDROW_MSA 109 #define HAS_SCALEARGBCOLS_MSA 110 #define HAS_SCALEARGBFILTERCOLS_MSA 108 111 #define HAS_SCALEARGBROWDOWN2_MSA 109 112 #define HAS_SCALEARGBROWDOWNEVEN_MSA 113 #define HAS_SCALEFILTERCOLS_MSA 110 114 #define HAS_SCALEROWDOWN2_MSA 115 #define HAS_SCALEROWDOWN34_MSA 116 #define HAS_SCALEROWDOWN38_MSA 111 117 #define HAS_SCALEROWDOWN4_MSA 112 #define HAS_SCALEROWDOWN38_MSA113 #define HAS_SCALEADDROW_MSA114 118 #endif 115 119 … … 547 551 int x, 548 552 int dx); 553 void ScaleARGBFilterCols_MSA(uint8* dst_argb, 554 const uint8* src_argb, 555 int dst_width, 556 int x, 557 int dx); 558 void ScaleARGBCols_MSA(uint8* dst_argb, 559 const uint8* src_argb, 560 int dst_width, 561 int x, 562 int dx); 563 void ScaleARGBFilterCols_Any_MSA(uint8* dst_argb, 564 const uint8* src_argb, 565 int dst_width, 566 int x, 567 int dx); 568 void ScaleARGBCols_Any_MSA(uint8* dst_argb, 569 const uint8* src_argb, 570 int dst_width, 571 int x, 572 int dx); 549 573 550 574 // ARGB Row functions … … 886 910 int dst_width); 887 911 void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); 912 void ScaleFilterCols_MSA(uint8* dst_ptr, 913 const uint8* src_ptr, 914 int dst_width, 915 int x, 916 int dx); 917 void ScaleRowDown34_MSA(const uint8* src_ptr, 918 ptrdiff_t src_stride, 919 uint8* dst_ptr, 920 int dst_width); 921 void ScaleRowDown34_0_Box_MSA(const uint8* src_ptr, 922 ptrdiff_t src_stride, 923 uint8* dst_ptr, 924 int dst_width); 925 void ScaleRowDown34_1_Box_MSA(const uint8* src_ptr, 926 ptrdiff_t src_stride, 927 uint8* dst_ptr, 928 int dst_width); 929 888 930 void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr, 889 931 ptrdiff_t src_stride, … … 921 963 uint16_t* dst_ptr, 922 964 int src_width); 965 void ScaleFilterCols_Any_MSA(uint8* dst_ptr, 966 const uint8* src_ptr, 967 int dst_width, 968 int x, 969 int dx); 970 void ScaleRowDown34_Any_MSA(const uint8* src_ptr, 971 ptrdiff_t src_stride, 972 uint8* dst_ptr, 973 int dst_width); 974 void ScaleRowDown34_0_Box_Any_MSA(const uint8* src_ptr, 975 ptrdiff_t src_stride, 976 uint8* dst_ptr, 977 int dst_width); 978 void ScaleRowDown34_1_Box_Any_MSA(const uint8* src_ptr, 979 ptrdiff_t src_stride, 980 uint8* dst_ptr, 981 int dst_width); 923 982 924 983 #ifdef __cplusplus -
pjproject/trunk/third_party/yuv/include/libyuv/version.h
r5633 -> r5699

 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1662
+#define LIBYUV_VERSION 1678

 #endif  // INCLUDE_LIBYUV_VERSION_H_
pjproject/trunk/third_party/yuv/source/compare.cc
r5633 r5699 111 111 } 112 112 113 // NEON version accumulates in 16 bit shorts which overflow at 65536 bytes. 114 // So actual maximum is 1 less loop, which is 64436 - 32 bytes. 115 113 116 LIBYUV_API 114 117 uint64 ComputeHammingDistance(const uint8* src_a, 115 118 const uint8* src_b, 116 119 int count) { 117 const int kBlockSize = 65536; 118 int remainder = count & (kBlockSize - 1) & ~31; 120 const int kBlockSize = 1 << 15; // 32768; 121 const int kSimdSize = 64; 122 // SIMD for multiple of 64, and C for remainder 123 int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1); 119 124 uint64 diff = 0; 120 125 int i; … … 126 131 } 127 132 #endif 128 #if defined(HAS_HAMMINGDISTANCE_X86) 129 if (TestCpuFlag(kCpuHasX86)) { 130 HammingDistance = HammingDistance_X86; 133 #if defined(HAS_HAMMINGDISTANCE_SSSE3) 134 if (TestCpuFlag(kCpuHasSSSE3)) { 135 HammingDistance = HammingDistance_SSSE3; 136 } 137 #endif 138 #if defined(HAS_HAMMINGDISTANCE_SSE42) 139 if (TestCpuFlag(kCpuHasSSE42)) { 140 HammingDistance = HammingDistance_SSE42; 131 141 } 132 142 #endif … … 136 146 } 137 147 #endif 148 #if defined(HAS_HAMMINGDISTANCE_MSA) 149 if (TestCpuFlag(kCpuHasMSA)) { 150 HammingDistance = HammingDistance_MSA; 151 } 152 #endif 138 153 #ifdef _OPENMP 139 154 #pragma omp parallel for reduction(+ : diff) … … 149 164 src_b += remainder; 150 165 } 151 remainder = count & 31;166 remainder = count & (kSimdSize - 1); 152 167 if (remainder) { 153 168 diff += HammingDistance_C(src_a, src_b, remainder); … … 185 200 // Note only used for multiples of 32 so count is not checked. 186 201 SumSquareError = SumSquareError_AVX2; 202 } 203 #endif 204 #if defined(HAS_SUMSQUAREERROR_MSA) 205 if (TestCpuFlag(kCpuHasMSA)) { 206 SumSquareError = SumSquareError_MSA; 187 207 } 188 208 #endif -
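The reworked ComputeHammingDistance now walks the buffers in 32 KB blocks (the comment added above explains why: the NEON kernel accumulates byte counts in 16-bit lanes, so a block must stay well under 65536 bytes), hands a 64-byte-aligned tail to the SIMD kernel, and finishes the last few bytes in C. A small self-contained example of the same index arithmetic, for an assumed count of 100000 bytes:

    #include <stdio.h>

    // Mirrors the index arithmetic in ComputeHammingDistance: full 32 KB
    // blocks and a 64-byte-aligned tail go to the SIMD kernel, the last
    // few bytes to HammingDistance_C.
    int main() {
      const int kBlockSize = 1 << 15;  // 32768
      const int kSimdSize = 64;
      int count = 100000;              // example length (assumed)

      int simd_tail = count & (kBlockSize - 1) & ~(kSimdSize - 1);  // 1664
      int c_tail = count & (kSimdSize - 1);                         // 32
      int full_blocks = (count - simd_tail - c_tail) / kBlockSize;  // 3

      printf("SIMD: %d full blocks + %d tail bytes, C: %d bytes\n",
             full_blocks, simd_tail, c_tail);
      return 0;
    }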
pjproject/trunk/third_party/yuv/source/compare_common.cc
r5633 r5699 19 19 20 20 #if ORIGINAL_OPT 21 uint32 HammingDistance_C (const uint8* src_a, const uint8* src_b, int count) {21 uint32 HammingDistance_C1(const uint8* src_a, const uint8* src_b, int count) { 22 22 uint32 diff = 0u; 23 23 … … 59 59 src_b += 4; 60 60 } 61 62 for (; i < count; ++i) { 63 uint32 x = *src_a ^ *src_b; 64 uint32 u = x - ((x >> 1) & 0x55); 65 u = ((u >> 2) & 0x33) + (u & 0x33); 66 diff += (u + (u >> 4)) & 0x0f; 67 src_a += 1; 68 src_b += 1; 69 } 70 61 71 return diff; 62 72 } -
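The tail loop added to HammingDistance_C counts bits one byte at a time with a SWAR fold: 1-bit counts are folded into 2-bit fields, then 4-bit fields, then a single byte count. A worked, runnable illustration of those three steps:

    #include <stdio.h>

    // Popcount of one byte using the same fold sequence as the C tail loop.
    // Worked for x = 0xB5 (10110101, five set bits):
    //   u = 0xB5 - ((0xB5 >> 1) & 0x55) = 0x65   -> 2-bit counts 1,2,1,1
    //   u = ((u >> 2) & 0x33) + (u & 0x33) = 0x32 -> nibble counts 3,2
    //   (u + (u >> 4)) & 0x0F = 5
    static unsigned PopcountByte(unsigned x) {
      unsigned u = x - ((x >> 1) & 0x55);
      u = ((u >> 2) & 0x33) + (u & 0x33);
      return (u + (u >> 4)) & 0x0F;
    }

    int main() {
      printf("%u\n", PopcountByte(0xB5));  // prints 5
      return 0;
    }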
pjproject/trunk/third_party/yuv/source/compare_gcc.cc
r5633 r5699 23 23 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) 24 24 25 uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) { 25 #if defined(__x86_64__) 26 uint32 HammingDistance_SSE42(const uint8* src_a, 27 const uint8* src_b, 28 int count) { 29 uint64 diff = 0u; 30 31 asm volatile( 32 "xor %3,%3 \n" 33 "xor %%r8,%%r8 \n" 34 "xor %%r9,%%r9 \n" 35 "xor %%r10,%%r10 \n" 36 37 // Process 32 bytes per loop. 38 LABELALIGN 39 "1: \n" 40 "mov (%0),%%rcx \n" 41 "mov 0x8(%0),%%rdx \n" 42 "xor (%1),%%rcx \n" 43 "xor 0x8(%1),%%rdx \n" 44 "popcnt %%rcx,%%rcx \n" 45 "popcnt %%rdx,%%rdx \n" 46 "mov 0x10(%0),%%rsi \n" 47 "mov 0x18(%0),%%rdi \n" 48 "xor 0x10(%1),%%rsi \n" 49 "xor 0x18(%1),%%rdi \n" 50 "popcnt %%rsi,%%rsi \n" 51 "popcnt %%rdi,%%rdi \n" 52 "add $0x20,%0 \n" 53 "add $0x20,%1 \n" 54 "add %%rcx,%3 \n" 55 "add %%rdx,%%r8 \n" 56 "add %%rsi,%%r9 \n" 57 "add %%rdi,%%r10 \n" 58 "sub $0x20,%2 \n" 59 "jg 1b \n" 60 61 "add %%r8, %3 \n" 62 "add %%r9, %3 \n" 63 "add %%r10, %3 \n" 64 : "+r"(src_a), // %0 65 "+r"(src_b), // %1 66 "+r"(count), // %2 67 "=r"(diff) // %3 68 : 69 : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); 70 71 return static_cast<uint32>(diff); 72 } 73 #else 74 uint32 HammingDistance_SSE42(const uint8* src_a, 75 const uint8* src_b, 76 int count) { 26 77 uint32 diff = 0u; 27 78 28 int i; 29 for (i = 0; i < count - 7; i += 8) { 30 uint64 x = *((uint64*)src_a) ^ *((uint64*)src_b); 31 src_a += 8; 32 src_b += 8; 33 diff += __builtin_popcountll(x); 34 } 79 asm volatile( 80 // Process 16 bytes per loop. 81 LABELALIGN 82 "1: \n" 83 "mov (%0),%%ecx \n" 84 "mov 0x4(%0),%%edx \n" 85 "xor (%1),%%ecx \n" 86 "xor 0x4(%1),%%edx \n" 87 "popcnt %%ecx,%%ecx \n" 88 "add %%ecx,%3 \n" 89 "popcnt %%edx,%%edx \n" 90 "add %%edx,%3 \n" 91 "mov 0x8(%0),%%ecx \n" 92 "mov 0xc(%0),%%edx \n" 93 "xor 0x8(%1),%%ecx \n" 94 "xor 0xc(%1),%%edx \n" 95 "popcnt %%ecx,%%ecx \n" 96 "add %%ecx,%3 \n" 97 "popcnt %%edx,%%edx \n" 98 "add %%edx,%3 \n" 99 "add $0x10,%0 \n" 100 "add $0x10,%1 \n" 101 "sub $0x10,%2 \n" 102 "jg 1b \n" 103 : "+r"(src_a), // %0 104 "+r"(src_b), // %1 105 "+r"(count), // %2 106 "+r"(diff) // %3 107 : 108 : "memory", "cc", "ecx", "edx"); 109 35 110 return diff; 36 111 } 112 #endif 113 114 static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, 115 15, 15, 15, 15, 15, 15, 15, 15}; 116 static vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; 117 118 uint32 HammingDistance_SSSE3(const uint8* src_a, 119 const uint8* src_b, 120 int count) { 121 uint32 diff = 0u; 122 123 asm volatile( 124 "movdqa %4,%%xmm2 \n" 125 "movdqa %5,%%xmm3 \n" 126 "pxor %%xmm0,%%xmm0 \n" 127 "pxor %%xmm1,%%xmm1 \n" 128 "sub %0,%1 \n" 129 130 LABELALIGN 131 "1: \n" 132 "movdqa (%0),%%xmm4 \n" 133 "movdqa 0x10(%0), %%xmm5 \n" 134 "pxor (%0,%1), %%xmm4 \n" 135 "movdqa %%xmm4,%%xmm6 \n" 136 "pand %%xmm2,%%xmm6 \n" 137 "psrlw $0x4,%%xmm4 \n" 138 "movdqa %%xmm3,%%xmm7 \n" 139 "pshufb %%xmm6,%%xmm7 \n" 140 "pand %%xmm2,%%xmm4 \n" 141 "movdqa %%xmm3,%%xmm6 \n" 142 "pshufb %%xmm4,%%xmm6 \n" 143 "paddb %%xmm7,%%xmm6 \n" 144 "pxor 0x10(%0,%1),%%xmm5 \n" 145 "add $0x20,%0 \n" 146 "movdqa %%xmm5,%%xmm4 \n" 147 "pand %%xmm2,%%xmm5 \n" 148 "psrlw $0x4,%%xmm4 \n" 149 "movdqa %%xmm3,%%xmm7 \n" 150 "pshufb %%xmm5,%%xmm7 \n" 151 "pand %%xmm2,%%xmm4 \n" 152 "movdqa %%xmm3,%%xmm5 \n" 153 "pshufb %%xmm4,%%xmm5 \n" 154 "paddb %%xmm7,%%xmm5 \n" 155 "paddb %%xmm5,%%xmm6 \n" 156 "psadbw %%xmm1,%%xmm6 \n" 157 "paddd %%xmm6,%%xmm0 \n" 158 "sub $0x20,%2 \n" 159 "jg 1b \n" 160 161 
"pshufd $0xaa,%%xmm0,%%xmm1 \n" 162 "paddd %%xmm1,%%xmm0 \n" 163 "movd %%xmm0, %3 \n" 164 : "+r"(src_a), // %0 165 "+r"(src_b), // %1 166 "+r"(count), // %2 167 "=r"(diff) // %3 168 : "m"(kNibbleMask), // %4 169 "m"(kBitCount) // %5 170 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", 171 "xmm7"); 172 173 return diff; 174 } 175 176 #ifdef HAS_HAMMINGDISTANCE_AVX2 177 uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) { 178 uint32 diff = 0u; 179 180 asm volatile( 181 "vbroadcastf128 %4,%%ymm2 \n" 182 "vbroadcastf128 %5,%%ymm3 \n" 183 "vpxor %%ymm0,%%ymm0,%%ymm0 \n" 184 "vpxor %%ymm1,%%ymm1,%%ymm1 \n" 185 "sub %0,%1 \n" 186 187 LABELALIGN 188 "1: \n" 189 "vmovdqa (%0),%%ymm4 \n" 190 "vmovdqa 0x20(%0), %%ymm5 \n" 191 "vpxor (%0,%1), %%ymm4, %%ymm4 \n" 192 "vpand %%ymm2,%%ymm4,%%ymm6 \n" 193 "vpsrlw $0x4,%%ymm4,%%ymm4 \n" 194 "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" 195 "vpand %%ymm2,%%ymm4,%%ymm4 \n" 196 "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" 197 "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" 198 "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" 199 "add $0x40,%0 \n" 200 "vpand %%ymm2,%%ymm4,%%ymm5 \n" 201 "vpsrlw $0x4,%%ymm4,%%ymm4 \n" 202 "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" 203 "vpand %%ymm2,%%ymm4,%%ymm4 \n" 204 "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" 205 "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" 206 "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" 207 "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" 208 "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" 209 "sub $0x40,%2 \n" 210 "jg 1b \n" 211 212 "vpermq $0xb1,%%ymm0,%%ymm1 \n" 213 "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" 214 "vpermq $0xaa,%%ymm0,%%ymm1 \n" 215 "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" 216 "vmovd %%xmm0, %3 \n" 217 "vzeroupper \n" 218 : "+r"(src_a), // %0 219 "+r"(src_b), // %1 220 "+r"(count), // %2 221 "=r"(diff) // %3 222 : "m"(kNibbleMask), // %4 223 "m"(kBitCount) // %5 224 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); 225 226 return diff; 227 } 228 #endif // HAS_HAMMINGDISTANCE_AVX2 37 229 38 230 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { -
pjproject/trunk/third_party/yuv/source/compare_neon.cc
r5633 r5699 27 27 uint32 diff; 28 28 29 asm volatile 30 "vmov.u16 q4, #0 \n" // accumulator29 asm volatile( 30 "vmov.u16 q4, #0 \n" // accumulator 31 31 32 "1:\n"33 "vld1.8 {q0, q1}, [%0]! \n"34 "vld1.8 {q2, q3}, [%1]! \n"35 "veor.32 q0, q0, q2 \n"36 "veor.32 q1, q1, q3 \n"37 "vcnt.i8 q0, q0 \n"38 "vcnt.i8 q1, q1 \n"39 "subs %2, %2, #32 \n"40 "vadd.u8 q0, q0, q1 \n" // 16 byte counts41 "vpadal.u8 q4, q0 \n" // 8 shorts42 "bgt 1b \n"32 "1: \n" 33 "vld1.8 {q0, q1}, [%0]! \n" 34 "vld1.8 {q2, q3}, [%1]! \n" 35 "veor.32 q0, q0, q2 \n" 36 "veor.32 q1, q1, q3 \n" 37 "vcnt.i8 q0, q0 \n" 38 "vcnt.i8 q1, q1 \n" 39 "subs %2, %2, #32 \n" 40 "vadd.u8 q0, q0, q1 \n" // 16 byte counts 41 "vpadal.u8 q4, q0 \n" // 8 shorts 42 "bgt 1b \n" 43 43 44 "vpaddl.u16 q0, q4 \n" // 4 ints 45 "vpadd.u32 d0, d0, d1 \n" 46 "vpadd.u32 d0, d0, d0 \n" 47 "vmov.32 %3, d0[0] \n" 48 49 : "+r"(src_a), 50 "+r"(src_b), 51 "+r"(count), 52 "=r"(diff) 53 : 54 : "cc", "q0", "q1", "q2", "q3", "q4"); 44 "vpaddl.u16 q0, q4 \n" // 4 ints 45 "vpadd.u32 d0, d0, d1 \n" 46 "vpadd.u32 d0, d0, d0 \n" 47 "vmov.32 %3, d0[0] \n" 48 49 : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) 50 : 51 : "cc", "q0", "q1", "q2", "q3", "q4"); 55 52 return diff; 56 53 } … … 58 55 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { 59 56 uint32 sse; 60 asm volatile 61 "vmov.u8 q8, #0 \n"62 "vmov.u8 q10, #0 \n"63 "vmov.u8 q9, #0 \n"64 "vmov.u8 q11, #0 \n"57 asm volatile( 58 "vmov.u8 q8, #0 \n" 59 "vmov.u8 q10, #0 \n" 60 "vmov.u8 q9, #0 \n" 61 "vmov.u8 q11, #0 \n" 65 62 66 "1:\n"67 "vld1.8 {q0}, [%0]! \n"68 "vld1.8 {q1}, [%1]! \n"69 "subs %2, %2, #16 \n"70 "vsubl.u8 q2, d0, d2 \n"71 "vsubl.u8 q3, d1, d3 \n"72 "vmlal.s16 q8, d4, d4 \n"73 "vmlal.s16 q9, d6, d6 \n"74 "vmlal.s16 q10, d5, d5 \n"75 "vmlal.s16 q11, d7, d7 \n"76 "bgt 1b \n"63 "1: \n" 64 "vld1.8 {q0}, [%0]! \n" 65 "vld1.8 {q1}, [%1]! \n" 66 "subs %2, %2, #16 \n" 67 "vsubl.u8 q2, d0, d2 \n" 68 "vsubl.u8 q3, d1, d3 \n" 69 "vmlal.s16 q8, d4, d4 \n" 70 "vmlal.s16 q9, d6, d6 \n" 71 "vmlal.s16 q10, d5, d5 \n" 72 "vmlal.s16 q11, d7, d7 \n" 73 "bgt 1b \n" 77 74 78 "vadd.u32 q8, q8, q9 \n" 79 "vadd.u32 q10, q10, q11 \n" 80 "vadd.u32 q11, q8, q10 \n" 81 "vpaddl.u32 q1, q11 \n" 82 "vadd.u64 d0, d2, d3 \n" 83 "vmov.32 %3, d0[0] \n" 84 : "+r"(src_a), 85 "+r"(src_b), 86 "+r"(count), 87 "=r"(sse) 88 : 89 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); 75 "vadd.u32 q8, q8, q9 \n" 76 "vadd.u32 q10, q10, q11 \n" 77 "vadd.u32 q11, q8, q10 \n" 78 "vpaddl.u32 q1, q11 \n" 79 "vadd.u64 d0, d2, d3 \n" 80 "vmov.32 %3, d0[0] \n" 81 : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) 82 : 83 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); 90 84 return sse; 91 85 } -
pjproject/trunk/third_party/yuv/source/compare_neon64.cc
r5633 r5699 25 25 uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) { 26 26 uint32 diff; 27 asm volatile 28 "movi v4.8h, #0 \n"27 asm volatile( 28 "movi v4.8h, #0 \n" 29 29 30 "1:\n"31 "ld1 {v0.16b, v1.16b}, [%0], #32 \n"32 "ld1 {v2.16b, v3.16b}, [%1], #32 \n"33 "eor v0.16b, v0.16b, v2.16b \n"34 "eor v1.16b, v1.16b, v3.16b \n"35 "cnt v0.16b, v0.16b \n"36 "cnt v1.16b, v1.16b \n"37 "subs %w2, %w2, #32 \n"38 "add v0.16b, v0.16b, v1.16b \n"39 "uadalp v4.8h, v0.16b \n"40 "b.gt 1b \n"30 "1: \n" 31 "ld1 {v0.16b, v1.16b}, [%0], #32 \n" 32 "ld1 {v2.16b, v3.16b}, [%1], #32 \n" 33 "eor v0.16b, v0.16b, v2.16b \n" 34 "eor v1.16b, v1.16b, v3.16b \n" 35 "cnt v0.16b, v0.16b \n" 36 "cnt v1.16b, v1.16b \n" 37 "subs %w2, %w2, #32 \n" 38 "add v0.16b, v0.16b, v1.16b \n" 39 "uadalp v4.8h, v0.16b \n" 40 "b.gt 1b \n" 41 41 42 "uaddlv s4, v4.8h \n" 43 "fmov %w3, s4 \n" 44 : "+r"(src_a), 45 "+r"(src_b), 46 "+r"(count), 47 "=r"(diff) 48 : 49 : "cc", "v0", "v1", "v2", "v3", "v4"); 42 "uaddlv s4, v4.8h \n" 43 "fmov %w3, s4 \n" 44 : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) 45 : 46 : "cc", "v0", "v1", "v2", "v3", "v4"); 50 47 return diff; 51 48 } … … 53 50 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { 54 51 uint32 sse; 55 asm volatile 56 "eor v16.16b, v16.16b, v16.16b \n"57 "eor v18.16b, v18.16b, v18.16b \n"58 "eor v17.16b, v17.16b, v17.16b \n"59 "eor v19.16b, v19.16b, v19.16b \n"52 asm volatile( 53 "eor v16.16b, v16.16b, v16.16b \n" 54 "eor v18.16b, v18.16b, v18.16b \n" 55 "eor v17.16b, v17.16b, v17.16b \n" 56 "eor v19.16b, v19.16b, v19.16b \n" 60 57 61 "1:\n"62 "ld1 {v0.16b}, [%0], #16 \n"63 "ld1 {v1.16b}, [%1], #16 \n"64 "subs %w2, %w2, #16 \n"65 "usubl v2.8h, v0.8b, v1.8b \n"66 "usubl2 v3.8h, v0.16b, v1.16b \n"67 "smlal v16.4s, v2.4h, v2.4h \n"68 "smlal v17.4s, v3.4h, v3.4h \n"69 "smlal2 v18.4s, v2.8h, v2.8h \n"70 "smlal2 v19.4s, v3.8h, v3.8h \n"71 "b.gt 1b \n"58 "1: \n" 59 "ld1 {v0.16b}, [%0], #16 \n" 60 "ld1 {v1.16b}, [%1], #16 \n" 61 "subs %w2, %w2, #16 \n" 62 "usubl v2.8h, v0.8b, v1.8b \n" 63 "usubl2 v3.8h, v0.16b, v1.16b \n" 64 "smlal v16.4s, v2.4h, v2.4h \n" 65 "smlal v17.4s, v3.4h, v3.4h \n" 66 "smlal2 v18.4s, v2.8h, v2.8h \n" 67 "smlal2 v19.4s, v3.8h, v3.8h \n" 68 "b.gt 1b \n" 72 69 73 "add v16.4s, v16.4s, v17.4s \n" 74 "add v18.4s, v18.4s, v19.4s \n" 75 "add v19.4s, v16.4s, v18.4s \n" 76 "addv s0, v19.4s \n" 77 "fmov %w3, s0 \n" 78 : "+r"(src_a), 79 "+r"(src_b), 80 "+r"(count), 81 "=r"(sse) 82 : 83 : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); 70 "add v16.4s, v16.4s, v17.4s \n" 71 "add v18.4s, v18.4s, v19.4s \n" 72 "add v19.4s, v16.4s, v18.4s \n" 73 "addv s0, v19.4s \n" 74 "fmov %w3, s0 \n" 75 : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) 76 : 77 : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); 84 78 return sse; 85 79 } -
pjproject/trunk/third_party/yuv/source/compare_win.cc
r5358 r5699 10 10 11 11 #include "libyuv/basic_types.h" 12 13 #include "libyuv/compare_row.h" 12 14 #include "libyuv/row.h" 15 16 #if defined(_MSC_VER) 17 #include <intrin.h> // For __popcnt 18 #endif 13 19 14 20 #ifdef __cplusplus … … 17 23 #endif 18 24 25 // This module is for 32 bit Visual C x86 and clangcl 19 26 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 20 #if (_MSC_VER >= 1900) 21 __declspec(naked) 22 #else 23 __declspec(naked) __declspec(align(16)) 24 #endif 25 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { 26 __asm { 27 mov eax, [esp + 4] // src_a 28 mov edx, [esp + 8] // src_b 29 mov ecx, [esp + 12] // count 27 28 uint32 HammingDistance_SSE42(const uint8* src_a, 29 const uint8* src_b, 30 int count) { 31 uint32 diff = 0u; 32 33 int i; 34 for (i = 0; i < count - 3; i += 4) { 35 uint32 x = *((uint32*)src_a) ^ *((uint32*)src_b); 36 src_a += 4; 37 src_b += 4; 38 diff += __popcnt(x); 39 } 40 return diff; 41 } 42 43 __declspec(naked) uint32 44 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { 45 __asm { 46 mov eax, [esp + 4] // src_a 47 mov edx, [esp + 8] // src_b 48 mov ecx, [esp + 12] // count 30 49 pxor xmm0, xmm0 31 50 pxor xmm5, xmm5 32 51 33 align 4 34 wloop: 35 movdqa xmm1, [eax] 52 wloop: 53 movdqu xmm1, [eax] 36 54 lea eax, [eax + 16] 37 movdq axmm2, [edx]55 movdqu xmm2, [edx] 38 56 lea edx, [edx + 16] 39 sub ecx, 1640 57 movdqa xmm3, xmm1 // abs trick 41 58 psubusb xmm1, xmm2 … … 49 66 paddd xmm0, xmm1 50 67 paddd xmm0, xmm2 68 sub ecx, 16 51 69 jg wloop 52 70 … … 63 81 #if _MSC_VER >= 1700 64 82 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. 65 #pragma warning(disable: 4752) 66 #if (_MSC_VER >= 1900) 67 __declspec(naked) 68 #else 69 __declspec(naked) __declspec(align(16)) 70 #endif 71 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { 72 __asm { 73 mov eax, [esp + 4] // src_a 74 mov edx, [esp + 8] // src_b 75 mov ecx, [esp + 12] // count 83 #pragma warning(disable : 4752) 84 __declspec(naked) uint32 85 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { 86 __asm { 87 mov eax, [esp + 4] // src_a 88 mov edx, [esp + 8] // src_b 89 mov ecx, [esp + 12] // count 76 90 vpxor ymm0, ymm0, ymm0 // sum 77 91 vpxor ymm5, ymm5, ymm5 // constant 0 for unpck 78 92 sub edx, eax 79 93 80 align 481 94 wloop: 82 95 vmovdqu ymm1, [eax] 83 96 vmovdqu ymm2, [eax + edx] 84 97 lea eax, [eax + 32] 85 sub ecx, 3286 98 vpsubusb ymm3, ymm1, ymm2 // abs difference trick 87 99 vpsubusb ymm2, ymm2, ymm1 … … 93 105 vpaddd ymm0, ymm0, ymm1 94 106 vpaddd ymm0, ymm0, ymm2 107 sub ecx, 32 95 108 jg wloop 96 109 … … 108 121 #endif // _MSC_VER >= 1700 109 122 110 #define HAS_HASHDJB2_SSE41 111 static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 112 static uvec32 kHashMul0 = { 113 0x0c3525e1, // 33 ^ 15 114 0xa3476dc1, // 33 ^ 14 115 0x3b4039a1, // 33 ^ 13 116 0x4f5f0981, // 33 ^ 12 117 }; 118 static uvec32 kHashMul1 = { 119 0x30f35d61, // 33 ^ 11 120 0x855cb541, // 33 ^ 10 121 0x040a9121, // 33 ^ 9 122 0x747c7101, // 33 ^ 8 123 }; 124 static uvec32 kHashMul2 = { 125 0xec41d4e1, // 33 ^ 7 126 0x4cfa3cc1, // 33 ^ 6 127 0x025528a1, // 33 ^ 5 128 0x00121881, // 33 ^ 4 129 }; 130 static uvec32 kHashMul3 = { 131 0x00008c61, // 33 ^ 3 132 0x00000441, // 33 ^ 2 133 0x00000021, // 33 ^ 1 134 0x00000001, // 33 ^ 0 135 }; 136 137 // 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 138 // 44: 66 0F 38 40 DD pmulld xmm3,xmm5 139 // 59: 66 0F 38 40 E5 pmulld 
xmm4,xmm5 140 // 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 141 // 83: 66 0F 38 40 CD pmulld xmm1,xmm5 142 #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ 143 _asm _emit 0x40 _asm _emit reg 144 145 #if (_MSC_VER >= 1900) 146 __declspec(naked) 147 #else 148 __declspec(naked) __declspec(align(16)) 149 #endif 150 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { 151 __asm { 152 mov eax, [esp + 4] // src 153 mov ecx, [esp + 8] // count 123 uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 124 uvec32 kHashMul0 = { 125 0x0c3525e1, // 33 ^ 15 126 0xa3476dc1, // 33 ^ 14 127 0x3b4039a1, // 33 ^ 13 128 0x4f5f0981, // 33 ^ 12 129 }; 130 uvec32 kHashMul1 = { 131 0x30f35d61, // 33 ^ 11 132 0x855cb541, // 33 ^ 10 133 0x040a9121, // 33 ^ 9 134 0x747c7101, // 33 ^ 8 135 }; 136 uvec32 kHashMul2 = { 137 0xec41d4e1, // 33 ^ 7 138 0x4cfa3cc1, // 33 ^ 6 139 0x025528a1, // 33 ^ 5 140 0x00121881, // 33 ^ 4 141 }; 142 uvec32 kHashMul3 = { 143 0x00008c61, // 33 ^ 3 144 0x00000441, // 33 ^ 2 145 0x00000021, // 33 ^ 1 146 0x00000001, // 33 ^ 0 147 }; 148 149 __declspec(naked) uint32 150 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { 151 __asm { 152 mov eax, [esp + 4] // src 153 mov ecx, [esp + 8] // count 154 154 movd xmm0, [esp + 12] // seed 155 155 156 pxor xmm7, xmm7 // constant 0 for unpck 157 movdqa xmm6, kHash16x33 158 159 align 4 160 wloop: 161 movdqu xmm1, [eax] // src[0-15] 156 pxor xmm7, xmm7 // constant 0 for unpck 157 movdqa xmm6, xmmword ptr kHash16x33 158 159 wloop: 160 movdqu xmm1, [eax] // src[0-15] 162 161 lea eax, [eax + 16] 163 pmulld (0xc6) // pmulld xmm0,xmm6hash *= 33 ^ 16164 movdqa xmm5, kHashMul0162 pmulld xmm0, xmm6 // hash *= 33 ^ 16 163 movdqa xmm5, xmmword ptr kHashMul0 165 164 movdqa xmm2, xmm1 166 punpcklbw xmm2, xmm7 165 punpcklbw xmm2, xmm7 // src[0-7] 167 166 movdqa xmm3, xmm2 168 punpcklwd xmm3, xmm7 169 pmulld (0xdd) // pmulldxmm3, xmm5170 movdqa xmm5, kHashMul1167 punpcklwd xmm3, xmm7 // src[0-3] 168 pmulld xmm3, xmm5 169 movdqa xmm5, xmmword ptr kHashMul1 171 170 movdqa xmm4, xmm2 172 punpckhwd xmm4, xmm7 173 pmulld (0xe5) // pmulldxmm4, xmm5174 movdqa xmm5, kHashMul2175 punpckhbw xmm1, xmm7 171 punpckhwd xmm4, xmm7 // src[4-7] 172 pmulld xmm4, xmm5 173 movdqa xmm5, xmmword ptr kHashMul2 174 punpckhbw xmm1, xmm7 // src[8-15] 176 175 movdqa xmm2, xmm1 177 punpcklwd xmm2, xmm7 178 pmulld (0xd5) // pmulldxmm2, xmm5179 movdqa xmm5, kHashMul3180 punpckhwd xmm1, xmm7 181 pmulld (0xcd) // pmulldxmm1, xmm5182 paddd xmm3, xmm4 176 punpcklwd xmm2, xmm7 // src[8-11] 177 pmulld xmm2, xmm5 178 movdqa xmm5, xmmword ptr kHashMul3 179 punpckhwd xmm1, xmm7 // src[12-15] 180 pmulld xmm1, xmm5 181 paddd xmm3, xmm4 // add 16 results 183 182 paddd xmm1, xmm2 184 sub ecx, 16185 183 paddd xmm1, xmm3 186 184 … … 190 188 paddd xmm1, xmm2 191 189 paddd xmm0, xmm1 192 jg wloop 193 194 movd eax, xmm0 // return hash 190 sub ecx, 16 191 jg wloop 192 193 movd eax, xmm0 // return hash 195 194 ret 196 195 } … … 199 198 // Visual C 2012 required for AVX2. 
200 199 #if _MSC_VER >= 1700 201 #if (_MSC_VER >= 1900) 202 __declspec(naked) 203 #else 204 __declspec(naked) __declspec(align(16)) 205 #endif 206 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { 207 __asm { 208 mov eax, [esp + 4] // src 209 mov ecx, [esp + 8] // count 210 movd xmm0, [esp + 12] // seed 211 movdqa xmm6, kHash16x33 212 213 align 4 214 wloop: 215 vpmovzxbd xmm3, dword ptr [eax] // src[0-3] 216 pmulld xmm0, xmm6 // hash *= 33 ^ 16 217 vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7] 218 pmulld xmm3, kHashMul0 219 vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11] 220 pmulld xmm4, kHashMul1 221 vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15] 222 pmulld xmm2, kHashMul2 200 __declspec(naked) uint32 201 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { 202 __asm { 203 mov eax, [esp + 4] // src 204 mov ecx, [esp + 8] // count 205 vmovd xmm0, [esp + 12] // seed 206 207 wloop: 208 vpmovzxbd xmm3, [eax] // src[0-3] 209 vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16 210 vpmovzxbd xmm4, [eax + 4] // src[4-7] 211 vpmulld xmm3, xmm3, xmmword ptr kHashMul0 212 vpmovzxbd xmm2, [eax + 8] // src[8-11] 213 vpmulld xmm4, xmm4, xmmword ptr kHashMul1 214 vpmovzxbd xmm1, [eax + 12] // src[12-15] 215 vpmulld xmm2, xmm2, xmmword ptr kHashMul2 223 216 lea eax, [eax + 16] 224 pmulld xmm1, kHashMul3 225 paddd xmm3, xmm4 // add 16 results 226 paddd xmm1, xmm2 217 vpmulld xmm1, xmm1, xmmword ptr kHashMul3 218 vpaddd xmm3, xmm3, xmm4 // add 16 results 219 vpaddd xmm1, xmm1, xmm2 220 vpaddd xmm1, xmm1, xmm3 221 vpshufd xmm2, xmm1, 0x0e // upper 2 dwords 222 vpaddd xmm1, xmm1,xmm2 223 vpshufd xmm2, xmm1, 0x01 224 vpaddd xmm1, xmm1, xmm2 225 vpaddd xmm0, xmm0, xmm1 227 226 sub ecx, 16 228 paddd xmm1, xmm3 229 pshufd xmm2, xmm1, 0x0e // upper 2 dwords 230 paddd xmm1, xmm2 231 pshufd xmm2, xmm1, 0x01 232 paddd xmm1, xmm2 233 paddd xmm0, xmm1 234 jg wloop 235 236 movd eax, xmm0 // return hash 227 jg wloop 228 229 vmovd eax, xmm0 // return hash 230 vzeroupper 237 231 ret 238 232 } … … 240 234 #endif // _MSC_VER >= 1700 241 235 242 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)236 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) 243 237 244 238 #ifdef __cplusplus -
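In compare_win.cc, the hand-emitted pmulld byte sequences are replaced by real pmulld/vpmulld instructions with xmmword ptr operands, and a __popcnt-based HammingDistance_SSE42 is added. The multiplier tables encode an unrolled djb2 hash: consuming 16 bytes at once is equivalent to hash = hash * 33^16 + sum(src[i] * 33^(15-i)), which is why kHash16x33 holds 33^16 and kHashMul0..kHashMul3 hold 33^15 down to 33^0. A scalar sketch of that identity (written for this note, not the library's HashDjb2_C):

    #include <stdint.h>

    // djb2 with a seed: hash = hash * 33 + byte, one byte at a time.
    static uint32_t HashDjb2Scalar(const uint8_t* src, int count,
                                   uint32_t seed) {
      uint32_t hash = seed;
      for (int i = 0; i < count; ++i) {
        hash = hash * 33u + src[i];
      }
      return hash;
    }

    // The same hash over one 16-byte block, grouped the way the SSE4.1/AVX2
    // code does it: one multiply of the running hash by 33^16 plus sixteen
    // independent multiply-adds that vectorize well.
    static uint32_t HashDjb2Block16(const uint8_t* src, uint32_t seed) {
      uint32_t pow = 1;   // becomes 33^16 (mod 2^32) after the loop
      uint32_t sum = 0;
      for (int i = 15; i >= 0; --i) {
        sum += src[i] * pow;  // src[15] weighted 33^0, src[0] weighted 33^15
        pow *= 33u;
      }
      return seed * pow + sum;  // equals HashDjb2Scalar(src, 16, seed)
    }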
pjproject/trunk/third_party/yuv/source/convert_from.cc
r5633 r5699 658 658 } 659 659 660 // Convert H420 to RGB24. 661 LIBYUV_API 662 int H420ToRGB24(const uint8* src_y, 663 int src_stride_y, 664 const uint8* src_u, 665 int src_stride_u, 666 const uint8* src_v, 667 int src_stride_v, 668 uint8* dst_rgb24, 669 int dst_stride_rgb24, 670 int width, 671 int height) { 672 return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, 673 src_stride_v, dst_rgb24, dst_stride_rgb24, 674 &kYuvH709Constants, width, height); 675 } 676 677 // Convert H420 to RAW. 678 LIBYUV_API 679 int H420ToRAW(const uint8* src_y, 680 int src_stride_y, 681 const uint8* src_u, 682 int src_stride_u, 683 const uint8* src_v, 684 int src_stride_v, 685 uint8* dst_raw, 686 int dst_stride_raw, 687 int width, 688 int height) { 689 return I420ToRGB24Matrix(src_y, src_stride_y, src_v, 690 src_stride_v, // Swap U and V 691 src_u, src_stride_u, dst_raw, dst_stride_raw, 692 &kYvuH709Constants, // Use Yvu matrix 693 width, height); 694 } 695 660 696 // Convert I420 to ARGB1555. 661 697 LIBYUV_API … … 1076 1112 I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); 1077 1113 ARGBToRGB565DitherRow(row_argb, dst_rgb565, 1078 *(uint32*)(dither4x4 + ((y & 3) << 2)), 1079 width); // NOLINT1114 *(uint32*)(dither4x4 + ((y & 3) << 2)), // NOLINT 1115 width); // NOLINT 1080 1116 dst_rgb565 += dst_stride_rgb565; 1081 1117 src_y += src_stride_y; -
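The new H420ToRGB24 and H420ToRAW converters are thin wrappers over I420ToRGB24Matrix with the BT.709 ("H") coefficient tables; the RAW variant reuses the same matrix path by swapping the U and V pointers and selecting the YVU table, which yields the reversed channel order without a dedicated kernel. A minimal usage sketch (plane pointers and strides are placeholders supplied by the caller):

    #include "libyuv/convert_from.h"

    // Convert one BT.709 (H420) frame to packed 24-bit RGB.
    int ConvertFrame(const uint8* y, int y_stride,
                     const uint8* u, int u_stride,
                     const uint8* v, int v_stride,
                     uint8* rgb24, int rgb24_stride,
                     int width, int height) {
      // Returns 0 on success, like the other libyuv converters.
      return H420ToRGB24(y, y_stride, u, u_stride, v, v_stride,
                         rgb24, rgb24_stride, width, height);
    }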
pjproject/trunk/third_party/yuv/source/cpu_id.cc
r5633 r5699 125 125 int xcr0 = 0; 126 126 #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) 127 xcr0 = _xgetbv(0); // VS2010 SP1 required.127 xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT 128 128 #elif defined(__i386__) || defined(__x86_64__) 129 129 asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); … … 243 243 // Detect AVX512bw 244 244 if ((GetXCR0() & 0xe0) == 0xe0) { 245 cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0; 246 } 247 } 248 245 cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; 246 cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; 247 cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; 248 cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; 249 cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; 250 cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; 251 cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; 252 } 253 } 254 255 // TODO(fbarchard): Consider moving these to gtest 249 256 // Environment variable overrides for testing. 250 257 if (TestEnv("LIBYUV_DISABLE_X86")) { … … 275 282 cpu_info &= ~kCpuHasFMA3; 276 283 } 277 if (TestEnv("LIBYUV_DISABLE_AVX3")) {278 cpu_info &= ~kCpuHasAVX3;279 }280 284 if (TestEnv("LIBYUV_DISABLE_F16C")) { 281 285 cpu_info &= ~kCpuHasF16C; 286 } 287 if (TestEnv("LIBYUV_DISABLE_AVX512BW")) { 288 cpu_info &= ~kCpuHasAVX512BW; 282 289 } 283 290 -
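cpu_id.cc now sets the individual AVX-512 bits from CPUID leaf 7, but only after the (GetXCR0() & 0xe0) == 0xe0 check confirms the OS saves the opmask and upper ZMM register state via XSAVE. A commented sketch of what that mask covers; the constant names are illustrative, not libyuv's:

    // XCR0 feature-state bits (per the Intel SDM); 0xe0 is bits 5..7.
    enum {
      kXcr0Sse      = 1 << 1,  // XMM state
      kXcr0Avx      = 1 << 2,  // YMM upper halves
      kXcr0Opmask   = 1 << 5,  // AVX-512 mask registers k0..k7
      kXcr0ZmmHi256 = 1 << 6,  // upper 256 bits of ZMM0..ZMM15
      kXcr0Hi16Zmm  = 1 << 7,  // ZMM16..ZMM31
    };

    // AVX requires bits 1-2 (0x6); AVX-512 additionally requires bits 5-7.
    bool OsSavesAvx512State(int xcr0) {
      return (xcr0 & (kXcr0Opmask | kXcr0ZmmHi256 | kXcr0Hi16Zmm)) == 0xe0;
    }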
pjproject/trunk/third_party/yuv/source/mjpeg_decoder.cc
r5633 r5699 13 13 #ifdef HAVE_JPEG 14 14 #include <assert.h> 15 16 #ifdef __cplusplus17 #include <new>18 #endif19 15 20 16 #if !defined(__pnacl__) && !defined(__CLR_VER) && \ -
pjproject/trunk/third_party/yuv/source/mjpeg_validate.cc
r5633 r5699 25 25 while (it < end) { 26 26 // TODO(fbarchard): scan for 0xd9 instead. 27 it = static_cast<const uint8*>(memchr(it, 0xff, end - it));27 it = (const uint8*)(memchr(it, 0xff, end - it)); 28 28 if (it == NULL) { 29 29 break; -
pjproject/trunk/third_party/yuv/source/planar_functions.cc
r5633 r5699 322 322 } 323 323 #endif 324 #if defined(HAS_SPLITUVROW_MSA) 325 if (TestCpuFlag(kCpuHasMSA)) { 326 SplitUVRow = SplitUVRow_Any_MSA; 327 if (IS_ALIGNED(width, 32)) { 328 SplitUVRow = SplitUVRow_MSA; 329 } 330 } 331 #endif 324 332 325 333 for (y = 0; y < height; ++y) { … … 397 405 src_v += src_stride_v; 398 406 dst_uv += dst_stride_uv; 407 } 408 } 409 410 // Support function for NV12 etc RGB channels. 411 // Width and height are plane sizes (typically half pixel width). 412 LIBYUV_API 413 void SplitRGBPlane(const uint8* src_rgb, 414 int src_stride_rgb, 415 uint8* dst_r, 416 int dst_stride_r, 417 uint8* dst_g, 418 int dst_stride_g, 419 uint8* dst_b, 420 int dst_stride_b, 421 int width, 422 int height) { 423 int y; 424 void (*SplitRGBRow)(const uint8* src_rgb, uint8* dst_r, uint8* dst_g, 425 uint8* dst_b, int width) = SplitRGBRow_C; 426 // Negative height means invert the image. 427 if (height < 0) { 428 height = -height; 429 dst_r = dst_r + (height - 1) * dst_stride_r; 430 dst_g = dst_g + (height - 1) * dst_stride_g; 431 dst_b = dst_b + (height - 1) * dst_stride_b; 432 dst_stride_r = -dst_stride_r; 433 dst_stride_g = -dst_stride_g; 434 dst_stride_b = -dst_stride_b; 435 } 436 // Coalesce rows. 437 if (src_stride_rgb == width * 3 && dst_stride_r == width && 438 dst_stride_g == width && dst_stride_b == width) { 439 width *= height; 440 height = 1; 441 src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0; 442 } 443 #if defined(HAS_SPLITRGBROW_SSSE3) 444 if (TestCpuFlag(kCpuHasSSSE3)) { 445 SplitRGBRow = SplitRGBRow_Any_SSSE3; 446 if (IS_ALIGNED(width, 16)) { 447 SplitRGBRow = SplitRGBRow_SSSE3; 448 } 449 } 450 #endif 451 #if defined(HAS_SPLITRGBROW_NEON) 452 if (TestCpuFlag(kCpuHasNEON)) { 453 SplitRGBRow = SplitRGBRow_Any_NEON; 454 if (IS_ALIGNED(width, 16)) { 455 SplitRGBRow = SplitRGBRow_NEON; 456 } 457 } 458 #endif 459 460 for (y = 0; y < height; ++y) { 461 // Copy a row of RGB. 462 SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width); 463 dst_r += dst_stride_r; 464 dst_g += dst_stride_g; 465 dst_b += dst_stride_b; 466 src_rgb += src_stride_rgb; 467 } 468 } 469 470 LIBYUV_API 471 void MergeRGBPlane(const uint8* src_r, 472 int src_stride_r, 473 const uint8* src_g, 474 int src_stride_g, 475 const uint8* src_b, 476 int src_stride_b, 477 uint8* dst_rgb, 478 int dst_stride_rgb, 479 int width, 480 int height) { 481 int y; 482 void (*MergeRGBRow)(const uint8* src_r, const uint8* src_g, 483 const uint8* src_b, uint8* dst_rgb, int width) = 484 MergeRGBRow_C; 485 // Coalesce rows. 486 // Negative height means invert the image. 487 if (height < 0) { 488 height = -height; 489 dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; 490 dst_stride_rgb = -dst_stride_rgb; 491 } 492 // Coalesce rows. 493 if (src_stride_r == width && src_stride_g == width && src_stride_b == width && 494 dst_stride_rgb == width * 3) { 495 width *= height; 496 height = 1; 497 src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0; 498 } 499 #if defined(HAS_MERGERGBROW_SSSE3) 500 if (TestCpuFlag(kCpuHasSSSE3)) { 501 MergeRGBRow = MergeRGBRow_Any_SSSE3; 502 if (IS_ALIGNED(width, 16)) { 503 MergeRGBRow = MergeRGBRow_SSSE3; 504 } 505 } 506 #endif 507 #if defined(HAS_MERGERGBROW_NEON) 508 if (TestCpuFlag(kCpuHasNEON)) { 509 MergeRGBRow = MergeRGBRow_Any_NEON; 510 if (IS_ALIGNED(width, 16)) { 511 MergeRGBRow = MergeRGBRow_NEON; 512 } 513 } 514 #endif 515 516 for (y = 0; y < height; ++y) { 517 // Merge a row of U and V into a row of RGB. 
518 MergeRGBRow(src_r, src_g, src_b, dst_rgb, width); 519 src_r += src_stride_r; 520 src_g += src_stride_g; 521 src_b += src_stride_b; 522 dst_rgb += dst_stride_rgb; 399 523 } 400 524 } … … 845 969 if (TestCpuFlag(kCpuHasNEON)) { 846 970 ARGBBlendRow = ARGBBlendRow_NEON; 971 } 972 #endif 973 #if defined(HAS_ARGBBLENDROW_MSA) 974 if (TestCpuFlag(kCpuHasMSA)) { 975 ARGBBlendRow = ARGBBlendRow_MSA; 847 976 } 848 977 #endif … … 1575 1704 } 1576 1705 #endif 1706 #if defined(HAS_SETROW_MSA) 1707 if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) { 1708 SetRow = SetRow_MSA; 1709 } 1710 #endif 1577 1711 1578 1712 // Set plane … … 1975 2109 } 1976 2110 #endif 2111 #if defined(HAS_ARGBCOLORMATRIXROW_MSA) 2112 if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { 2113 ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; 2114 } 2115 #endif 1977 2116 for (y = 0; y < height; ++y) { 1978 2117 ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); … … 2133 2272 if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { 2134 2273 ARGBQuantizeRow = ARGBQuantizeRow_NEON; 2274 } 2275 #endif 2276 #if defined(HAS_ARGBQUANTIZEROW_MSA) 2277 if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { 2278 ARGBQuantizeRow = ARGBQuantizeRow_MSA; 2135 2279 } 2136 2280 #endif … … 2620 2764 } 2621 2765 #endif 2766 #if defined(HAS_SOBELYROW_MSA) 2767 if (TestCpuFlag(kCpuHasMSA)) { 2768 SobelYRow = SobelYRow_MSA; 2769 } 2770 #endif 2622 2771 #if defined(HAS_SOBELXROW_SSE2) 2623 2772 if (TestCpuFlag(kCpuHasSSE2)) { … … 2628 2777 if (TestCpuFlag(kCpuHasNEON)) { 2629 2778 SobelXRow = SobelXRow_NEON; 2779 } 2780 #endif 2781 #if defined(HAS_SOBELXROW_MSA) 2782 if (TestCpuFlag(kCpuHasMSA)) { 2783 SobelXRow = SobelXRow_MSA; 2630 2784 } 2631 2785 #endif … … 2904 3058 } 2905 3059 #endif 3060 #if defined(HAS_HALFFLOATROW_MSA) 3061 if (TestCpuFlag(kCpuHasMSA)) { 3062 HalfFloatRow = HalfFloatRow_Any_MSA; 3063 if (IS_ALIGNED(width, 32)) { 3064 HalfFloatRow = HalfFloatRow_MSA; 3065 } 3066 } 3067 #endif 2906 3068 2907 3069 for (y = 0; y < height; ++y) { … … 3047 3209 ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON 3048 3210 : ARGBExtractAlphaRow_Any_NEON; 3211 } 3212 #endif 3213 #if defined(HAS_ARGBEXTRACTALPHAROW_MSA) 3214 if (TestCpuFlag(kCpuHasMSA)) { 3215 ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA 3216 : ARGBExtractAlphaRow_Any_MSA; 3049 3217 } 3050 3218 #endif … … 3158 3326 if (IS_ALIGNED(width, 16)) { 3159 3327 SplitUVRow = SplitUVRow_NEON; 3328 } 3329 } 3330 #endif 3331 #if defined(HAS_SPLITUVROW_MSA) 3332 if (TestCpuFlag(kCpuHasMSA)) { 3333 SplitUVRow = SplitUVRow_Any_MSA; 3334 if (IS_ALIGNED(width, 32)) { 3335 SplitUVRow = SplitUVRow_MSA; 3160 3336 } 3161 3337 } … … 3269 3445 } 3270 3446 #endif 3447 #if defined(HAS_SPLITUVROW_MSA) 3448 if (TestCpuFlag(kCpuHasMSA)) { 3449 SplitUVRow = SplitUVRow_Any_MSA; 3450 if (IS_ALIGNED(width, 32)) { 3451 SplitUVRow = SplitUVRow_MSA; 3452 } 3453 } 3454 #endif 3271 3455 #if defined(HAS_INTERPOLATEROW_SSSE3) 3272 3456 if (TestCpuFlag(kCpuHasSSSE3)) { -
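The SplitRGBPlane/MergeRGBPlane implementations follow the two idioms used throughout planar_functions.cc: a negative height flips the image by starting from the last row with negated strides, and padding-free planes are coalesced into one long row so the row kernel is dispatched once. A worked illustration of the coalescing condition (the helper name and numbers are made up):

    // Coalescing check used by SplitRGBPlane (and most planar helpers):
    // when no row padding exists, the whole image can be treated as one row.
    bool CanCoalesce(int src_stride_rgb, int dst_stride_r, int dst_stride_g,
                     int dst_stride_b, int width) {
      return src_stride_rgb == width * 3 && dst_stride_r == width &&
             dst_stride_g == width && dst_stride_b == width;
    }
    // Example: a 640x480 RGB24 image with stride 1920 (== 640 * 3) becomes
    // one SplitRGBRow call over 307200 pixels instead of 480 calls over 640.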
pjproject/trunk/third_party/yuv/source/rotate.cc
r5633 r5699 360 360 IS_ALIGNED(src_stride, 4)) { 361 361 MirrorUVRow = MirrorUVRow_DSPR2; 362 } 363 #endif 364 #if defined(HAS_MIRRORUVROW_MSA) 365 if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { 366 MirrorUVRow = MirrorUVRow_MSA; 362 367 } 363 368 #endif -
pjproject/trunk/third_party/yuv/source/rotate_neon64.cc
r5633 r5699 31 31 int width) { 32 32 const uint8* src_temp; 33 asm volatile 34 // loops are on blocks of 8. loop will stop when35 // counter gets to or below 0. starting the counter36 // at w-8 allow for this37 "sub %w3, %w3, #8 \n"38 39 // handle 8x8 blocks. this should be the majority of the plane40 "1: \n"33 asm volatile( 34 // loops are on blocks of 8. loop will stop when 35 // counter gets to or below 0. starting the counter 36 // at w-8 allow for this 37 "sub %w3, %w3, #8 \n" 38 39 // handle 8x8 blocks. this should be the majority of the plane 40 "1: \n" 41 41 "mov %0, %1 \n" 42 42 … … 93 93 "b.ge 1b \n" 94 94 95 // add 8 back to counter. if the result is 0 there are 96 // no residuals. 97 "adds %w3, %w3, #8 \n" 98 "b.eq 4f \n" 99 100 // some residual, so between 1 and 7 lines left to transpose 101 "cmp %w3, #2 \n" 102 "b.lt 3f \n" 103 104 "cmp %w3, #4 \n" 105 "b.lt 2f \n" 106 107 // 4x8 block 108 "mov %0, %1 \n" 109 "ld1 {v0.s}[0], [%0], %5 \n" 110 "ld1 {v0.s}[1], [%0], %5 \n" 111 "ld1 {v0.s}[2], [%0], %5 \n" 112 "ld1 {v0.s}[3], [%0], %5 \n" 113 "ld1 {v1.s}[0], [%0], %5 \n" 114 "ld1 {v1.s}[1], [%0], %5 \n" 115 "ld1 {v1.s}[2], [%0], %5 \n" 116 "ld1 {v1.s}[3], [%0] \n" 117 118 "mov %0, %2 \n" 119 120 "ld1 {v2.16b}, [%4] \n" 121 122 "tbl v3.16b, {v0.16b}, v2.16b \n" 123 "tbl v0.16b, {v1.16b}, v2.16b \n" 124 125 // TODO(frkoenig): Rework shuffle above to 126 // write out with 4 instead of 8 writes. 127 "st1 {v3.s}[0], [%0], %6 \n" 128 "st1 {v3.s}[1], [%0], %6 \n" 129 "st1 {v3.s}[2], [%0], %6 \n" 130 "st1 {v3.s}[3], [%0] \n" 131 132 "add %0, %2, #4 \n" 133 "st1 {v0.s}[0], [%0], %6 \n" 134 "st1 {v0.s}[1], [%0], %6 \n" 135 "st1 {v0.s}[2], [%0], %6 \n" 136 "st1 {v0.s}[3], [%0] \n" 137 138 "add %1, %1, #4 \n" // src += 4 139 "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride 140 "subs %w3, %w3, #4 \n" // w -= 4 141 "b.eq 4f \n" 142 143 // some residual, check to see if it includes a 2x8 block, 144 // or less 145 "cmp %w3, #2 \n" 146 "b.lt 3f \n" 147 148 // 2x8 block 149 "2: \n" 150 "mov %0, %1 \n" 151 "ld1 {v0.h}[0], [%0], %5 \n" 152 "ld1 {v1.h}[0], [%0], %5 \n" 153 "ld1 {v0.h}[1], [%0], %5 \n" 154 "ld1 {v1.h}[1], [%0], %5 \n" 155 "ld1 {v0.h}[2], [%0], %5 \n" 156 "ld1 {v1.h}[2], [%0], %5 \n" 157 "ld1 {v0.h}[3], [%0], %5 \n" 158 "ld1 {v1.h}[3], [%0] \n" 159 160 "trn2 v2.8b, v0.8b, v1.8b \n" 161 "trn1 v3.8b, v0.8b, v1.8b \n" 162 163 "mov %0, %2 \n" 164 165 "st1 {v3.8b}, [%0], %6 \n" 166 "st1 {v2.8b}, [%0] \n" 167 168 "add %1, %1, #2 \n" // src += 2 169 "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride 170 "subs %w3, %w3, #2 \n" // w -= 2 171 "b.eq 4f \n" 172 173 // 1x8 block 174 "3: \n" 175 "ld1 {v0.b}[0], [%1], %5 \n" 176 "ld1 {v0.b}[1], [%1], %5 \n" 177 "ld1 {v0.b}[2], [%1], %5 \n" 178 "ld1 {v0.b}[3], [%1], %5 \n" 179 "ld1 {v0.b}[4], [%1], %5 \n" 180 "ld1 {v0.b}[5], [%1], %5 \n" 181 "ld1 {v0.b}[6], [%1], %5 \n" 182 "ld1 {v0.b}[7], [%1] \n" 183 184 "st1 {v0.8b}, [%2] \n" 185 186 "4: \n" 187 188 : "=&r"(src_temp), // %0 189 "+r"(src), // %1 190 "+r"(dst), // %2 191 "+r"(width) // %3 192 : "r"(&kVTbl4x4Transpose), // %4 193 "r"(static_cast<ptrdiff_t>(src_stride)), // %5 194 "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 195 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", 196 "v17", "v18", "v19", "v20", "v21", "v22", "v23" 197 ); 95 // add 8 back to counter. if the result is 0 there are 96 // no residuals. 
97 "adds %w3, %w3, #8 \n" 98 "b.eq 4f \n" 99 100 // some residual, so between 1 and 7 lines left to transpose 101 "cmp %w3, #2 \n" 102 "b.lt 3f \n" 103 104 "cmp %w3, #4 \n" 105 "b.lt 2f \n" 106 107 // 4x8 block 108 "mov %0, %1 \n" 109 "ld1 {v0.s}[0], [%0], %5 \n" 110 "ld1 {v0.s}[1], [%0], %5 \n" 111 "ld1 {v0.s}[2], [%0], %5 \n" 112 "ld1 {v0.s}[3], [%0], %5 \n" 113 "ld1 {v1.s}[0], [%0], %5 \n" 114 "ld1 {v1.s}[1], [%0], %5 \n" 115 "ld1 {v1.s}[2], [%0], %5 \n" 116 "ld1 {v1.s}[3], [%0] \n" 117 118 "mov %0, %2 \n" 119 120 "ld1 {v2.16b}, [%4] \n" 121 122 "tbl v3.16b, {v0.16b}, v2.16b \n" 123 "tbl v0.16b, {v1.16b}, v2.16b \n" 124 125 // TODO(frkoenig): Rework shuffle above to 126 // write out with 4 instead of 8 writes. 127 "st1 {v3.s}[0], [%0], %6 \n" 128 "st1 {v3.s}[1], [%0], %6 \n" 129 "st1 {v3.s}[2], [%0], %6 \n" 130 "st1 {v3.s}[3], [%0] \n" 131 132 "add %0, %2, #4 \n" 133 "st1 {v0.s}[0], [%0], %6 \n" 134 "st1 {v0.s}[1], [%0], %6 \n" 135 "st1 {v0.s}[2], [%0], %6 \n" 136 "st1 {v0.s}[3], [%0] \n" 137 138 "add %1, %1, #4 \n" // src += 4 139 "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride 140 "subs %w3, %w3, #4 \n" // w -= 4 141 "b.eq 4f \n" 142 143 // some residual, check to see if it includes a 2x8 block, 144 // or less 145 "cmp %w3, #2 \n" 146 "b.lt 3f \n" 147 148 // 2x8 block 149 "2: \n" 150 "mov %0, %1 \n" 151 "ld1 {v0.h}[0], [%0], %5 \n" 152 "ld1 {v1.h}[0], [%0], %5 \n" 153 "ld1 {v0.h}[1], [%0], %5 \n" 154 "ld1 {v1.h}[1], [%0], %5 \n" 155 "ld1 {v0.h}[2], [%0], %5 \n" 156 "ld1 {v1.h}[2], [%0], %5 \n" 157 "ld1 {v0.h}[3], [%0], %5 \n" 158 "ld1 {v1.h}[3], [%0] \n" 159 160 "trn2 v2.8b, v0.8b, v1.8b \n" 161 "trn1 v3.8b, v0.8b, v1.8b \n" 162 163 "mov %0, %2 \n" 164 165 "st1 {v3.8b}, [%0], %6 \n" 166 "st1 {v2.8b}, [%0] \n" 167 168 "add %1, %1, #2 \n" // src += 2 169 "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride 170 "subs %w3, %w3, #2 \n" // w -= 2 171 "b.eq 4f \n" 172 173 // 1x8 block 174 "3: \n" 175 "ld1 {v0.b}[0], [%1], %5 \n" 176 "ld1 {v0.b}[1], [%1], %5 \n" 177 "ld1 {v0.b}[2], [%1], %5 \n" 178 "ld1 {v0.b}[3], [%1], %5 \n" 179 "ld1 {v0.b}[4], [%1], %5 \n" 180 "ld1 {v0.b}[5], [%1], %5 \n" 181 "ld1 {v0.b}[6], [%1], %5 \n" 182 "ld1 {v0.b}[7], [%1] \n" 183 184 "st1 {v0.8b}, [%2] \n" 185 186 "4: \n" 187 188 : "=&r"(src_temp), // %0 189 "+r"(src), // %1 190 "+r"(dst), // %2 191 "+r"(width) // %3 192 : "r"(&kVTbl4x4Transpose), // %4 193 "r"(static_cast<ptrdiff_t>(src_stride)), // %5 194 "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 195 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", 196 "v17", "v18", "v19", "v20", "v21", "v22", "v23"); 198 197 } 199 198 … … 210 209 int width) { 211 210 const uint8* src_temp; 212 asm volatile ( 213 // loops are on blocks of 8. loop will stop when 214 // counter gets to or below 0. starting the counter 215 // at w-8 allow for this 216 "sub %w4, %w4, #8 \n" 217 218 // handle 8x8 blocks. 
this should be the majority of the plane 219 "1: \n" 220 "mov %0, %1 \n" 221 222 "ld1 {v0.16b}, [%0], %5 \n" 223 "ld1 {v1.16b}, [%0], %5 \n" 224 "ld1 {v2.16b}, [%0], %5 \n" 225 "ld1 {v3.16b}, [%0], %5 \n" 226 "ld1 {v4.16b}, [%0], %5 \n" 227 "ld1 {v5.16b}, [%0], %5 \n" 228 "ld1 {v6.16b}, [%0], %5 \n" 229 "ld1 {v7.16b}, [%0] \n" 230 231 "trn1 v16.16b, v0.16b, v1.16b \n" 232 "trn2 v17.16b, v0.16b, v1.16b \n" 233 "trn1 v18.16b, v2.16b, v3.16b \n" 234 "trn2 v19.16b, v2.16b, v3.16b \n" 235 "trn1 v20.16b, v4.16b, v5.16b \n" 236 "trn2 v21.16b, v4.16b, v5.16b \n" 237 "trn1 v22.16b, v6.16b, v7.16b \n" 238 "trn2 v23.16b, v6.16b, v7.16b \n" 239 240 "trn1 v0.8h, v16.8h, v18.8h \n" 241 "trn2 v1.8h, v16.8h, v18.8h \n" 242 "trn1 v2.8h, v20.8h, v22.8h \n" 243 "trn2 v3.8h, v20.8h, v22.8h \n" 244 "trn1 v4.8h, v17.8h, v19.8h \n" 245 "trn2 v5.8h, v17.8h, v19.8h \n" 246 "trn1 v6.8h, v21.8h, v23.8h \n" 247 "trn2 v7.8h, v21.8h, v23.8h \n" 248 249 "trn1 v16.4s, v0.4s, v2.4s \n" 250 "trn2 v17.4s, v0.4s, v2.4s \n" 251 "trn1 v18.4s, v1.4s, v3.4s \n" 252 "trn2 v19.4s, v1.4s, v3.4s \n" 253 "trn1 v20.4s, v4.4s, v6.4s \n" 254 "trn2 v21.4s, v4.4s, v6.4s \n" 255 "trn1 v22.4s, v5.4s, v7.4s \n" 256 "trn2 v23.4s, v5.4s, v7.4s \n" 257 258 "mov %0, %2 \n" 259 260 "st1 {v16.d}[0], [%0], %6 \n" 261 "st1 {v18.d}[0], [%0], %6 \n" 262 "st1 {v17.d}[0], [%0], %6 \n" 263 "st1 {v19.d}[0], [%0], %6 \n" 264 "st1 {v16.d}[1], [%0], %6 \n" 265 "st1 {v18.d}[1], [%0], %6 \n" 266 "st1 {v17.d}[1], [%0], %6 \n" 267 "st1 {v19.d}[1], [%0] \n" 268 269 "mov %0, %3 \n" 270 271 "st1 {v20.d}[0], [%0], %7 \n" 272 "st1 {v22.d}[0], [%0], %7 \n" 273 "st1 {v21.d}[0], [%0], %7 \n" 274 "st1 {v23.d}[0], [%0], %7 \n" 275 "st1 {v20.d}[1], [%0], %7 \n" 276 "st1 {v22.d}[1], [%0], %7 \n" 277 "st1 {v21.d}[1], [%0], %7 \n" 278 "st1 {v23.d}[1], [%0] \n" 279 280 "add %1, %1, #16 \n" // src += 8*2 281 "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a 282 "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b 283 "subs %w4, %w4, #8 \n" // w -= 8 284 "b.ge 1b \n" 285 286 // add 8 back to counter. if the result is 0 there are 287 // no residuals. 
288 "adds %w4, %w4, #8 \n" 289 "b.eq 4f \n" 290 291 // some residual, so between 1 and 7 lines left to transpose 292 "cmp %w4, #2 \n" 293 "b.lt 3f \n" 294 295 "cmp %w4, #4 \n" 296 "b.lt 2f \n" 297 298 // TODO(frkoenig): Clean this up 299 // 4x8 block 300 "mov %0, %1 \n" 301 "ld1 {v0.8b}, [%0], %5 \n" 302 "ld1 {v1.8b}, [%0], %5 \n" 303 "ld1 {v2.8b}, [%0], %5 \n" 304 "ld1 {v3.8b}, [%0], %5 \n" 305 "ld1 {v4.8b}, [%0], %5 \n" 306 "ld1 {v5.8b}, [%0], %5 \n" 307 "ld1 {v6.8b}, [%0], %5 \n" 308 "ld1 {v7.8b}, [%0] \n" 309 310 "ld1 {v30.16b}, [%8], #16 \n" 311 "ld1 {v31.16b}, [%8] \n" 312 313 "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" 314 "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" 315 "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" 316 "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" 317 318 "mov %0, %2 \n" 319 320 "st1 {v16.s}[0], [%0], %6 \n" 321 "st1 {v16.s}[1], [%0], %6 \n" 322 "st1 {v16.s}[2], [%0], %6 \n" 323 "st1 {v16.s}[3], [%0], %6 \n" 324 325 "add %0, %2, #4 \n" 326 "st1 {v18.s}[0], [%0], %6 \n" 327 "st1 {v18.s}[1], [%0], %6 \n" 328 "st1 {v18.s}[2], [%0], %6 \n" 329 "st1 {v18.s}[3], [%0] \n" 330 331 "mov %0, %3 \n" 332 333 "st1 {v17.s}[0], [%0], %7 \n" 334 "st1 {v17.s}[1], [%0], %7 \n" 335 "st1 {v17.s}[2], [%0], %7 \n" 336 "st1 {v17.s}[3], [%0], %7 \n" 337 338 "add %0, %3, #4 \n" 339 "st1 {v19.s}[0], [%0], %7 \n" 340 "st1 {v19.s}[1], [%0], %7 \n" 341 "st1 {v19.s}[2], [%0], %7 \n" 342 "st1 {v19.s}[3], [%0] \n" 343 344 "add %1, %1, #8 \n" // src += 4 * 2 345 "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a 346 "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b 347 "subs %w4, %w4, #4 \n" // w -= 4 348 "b.eq 4f \n" 349 350 // some residual, check to see if it includes a 2x8 block, 351 // or less 352 "cmp %w4, #2 \n" 353 "b.lt 3f \n" 354 355 // 2x8 block 356 "2: \n" 357 "mov %0, %1 \n" 358 "ld2 {v0.h, v1.h}[0], [%0], %5 \n" 359 "ld2 {v2.h, v3.h}[0], [%0], %5 \n" 360 "ld2 {v0.h, v1.h}[1], [%0], %5 \n" 361 "ld2 {v2.h, v3.h}[1], [%0], %5 \n" 362 "ld2 {v0.h, v1.h}[2], [%0], %5 \n" 363 "ld2 {v2.h, v3.h}[2], [%0], %5 \n" 364 "ld2 {v0.h, v1.h}[3], [%0], %5 \n" 365 "ld2 {v2.h, v3.h}[3], [%0] \n" 366 367 "trn1 v4.8b, v0.8b, v2.8b \n" 368 "trn2 v5.8b, v0.8b, v2.8b \n" 369 "trn1 v6.8b, v1.8b, v3.8b \n" 370 "trn2 v7.8b, v1.8b, v3.8b \n" 371 372 "mov %0, %2 \n" 373 374 "st1 {v4.d}[0], [%0], %6 \n" 375 "st1 {v6.d}[0], [%0] \n" 376 377 "mov %0, %3 \n" 378 379 "st1 {v5.d}[0], [%0], %7 \n" 380 "st1 {v7.d}[0], [%0] \n" 381 382 "add %1, %1, #4 \n" // src += 2 * 2 383 "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a 384 "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b 385 "subs %w4, %w4, #2 \n" // w -= 2 386 "b.eq 4f \n" 387 388 // 1x8 block 389 "3: \n" 390 "ld2 {v0.b, v1.b}[0], [%1], %5 \n" 391 "ld2 {v0.b, v1.b}[1], [%1], %5 \n" 392 "ld2 {v0.b, v1.b}[2], [%1], %5 \n" 393 "ld2 {v0.b, v1.b}[3], [%1], %5 \n" 394 "ld2 {v0.b, v1.b}[4], [%1], %5 \n" 395 "ld2 {v0.b, v1.b}[5], [%1], %5 \n" 396 "ld2 {v0.b, v1.b}[6], [%1], %5 \n" 397 "ld2 {v0.b, v1.b}[7], [%1] \n" 398 399 "st1 {v0.d}[0], [%2] \n" 400 "st1 {v1.d}[0], [%3] \n" 401 402 "4: \n" 403 404 : "=&r"(src_temp), // %0 405 "+r"(src), // %1 406 "+r"(dst_a), // %2 407 "+r"(dst_b), // %3 408 "+r"(width) // %4 409 : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 410 "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 411 "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 412 "r"(&kVTbl4x4TransposeDi) // %8 413 : "memory", "cc", 414 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 415 "v16", 
"v17", "v18", "v19", "v20", "v21", "v22", "v23", 416 "v30", "v31" 417 ); 211 asm volatile( 212 // loops are on blocks of 8. loop will stop when 213 // counter gets to or below 0. starting the counter 214 // at w-8 allow for this 215 "sub %w4, %w4, #8 \n" 216 217 // handle 8x8 blocks. this should be the majority of the plane 218 "1: \n" 219 "mov %0, %1 \n" 220 221 "ld1 {v0.16b}, [%0], %5 \n" 222 "ld1 {v1.16b}, [%0], %5 \n" 223 "ld1 {v2.16b}, [%0], %5 \n" 224 "ld1 {v3.16b}, [%0], %5 \n" 225 "ld1 {v4.16b}, [%0], %5 \n" 226 "ld1 {v5.16b}, [%0], %5 \n" 227 "ld1 {v6.16b}, [%0], %5 \n" 228 "ld1 {v7.16b}, [%0] \n" 229 230 "trn1 v16.16b, v0.16b, v1.16b \n" 231 "trn2 v17.16b, v0.16b, v1.16b \n" 232 "trn1 v18.16b, v2.16b, v3.16b \n" 233 "trn2 v19.16b, v2.16b, v3.16b \n" 234 "trn1 v20.16b, v4.16b, v5.16b \n" 235 "trn2 v21.16b, v4.16b, v5.16b \n" 236 "trn1 v22.16b, v6.16b, v7.16b \n" 237 "trn2 v23.16b, v6.16b, v7.16b \n" 238 239 "trn1 v0.8h, v16.8h, v18.8h \n" 240 "trn2 v1.8h, v16.8h, v18.8h \n" 241 "trn1 v2.8h, v20.8h, v22.8h \n" 242 "trn2 v3.8h, v20.8h, v22.8h \n" 243 "trn1 v4.8h, v17.8h, v19.8h \n" 244 "trn2 v5.8h, v17.8h, v19.8h \n" 245 "trn1 v6.8h, v21.8h, v23.8h \n" 246 "trn2 v7.8h, v21.8h, v23.8h \n" 247 248 "trn1 v16.4s, v0.4s, v2.4s \n" 249 "trn2 v17.4s, v0.4s, v2.4s \n" 250 "trn1 v18.4s, v1.4s, v3.4s \n" 251 "trn2 v19.4s, v1.4s, v3.4s \n" 252 "trn1 v20.4s, v4.4s, v6.4s \n" 253 "trn2 v21.4s, v4.4s, v6.4s \n" 254 "trn1 v22.4s, v5.4s, v7.4s \n" 255 "trn2 v23.4s, v5.4s, v7.4s \n" 256 257 "mov %0, %2 \n" 258 259 "st1 {v16.d}[0], [%0], %6 \n" 260 "st1 {v18.d}[0], [%0], %6 \n" 261 "st1 {v17.d}[0], [%0], %6 \n" 262 "st1 {v19.d}[0], [%0], %6 \n" 263 "st1 {v16.d}[1], [%0], %6 \n" 264 "st1 {v18.d}[1], [%0], %6 \n" 265 "st1 {v17.d}[1], [%0], %6 \n" 266 "st1 {v19.d}[1], [%0] \n" 267 268 "mov %0, %3 \n" 269 270 "st1 {v20.d}[0], [%0], %7 \n" 271 "st1 {v22.d}[0], [%0], %7 \n" 272 "st1 {v21.d}[0], [%0], %7 \n" 273 "st1 {v23.d}[0], [%0], %7 \n" 274 "st1 {v20.d}[1], [%0], %7 \n" 275 "st1 {v22.d}[1], [%0], %7 \n" 276 "st1 {v21.d}[1], [%0], %7 \n" 277 "st1 {v23.d}[1], [%0] \n" 278 279 "add %1, %1, #16 \n" // src += 8*2 280 "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * 281 // dst_stride_a 282 "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * 283 // dst_stride_b 284 "subs %w4, %w4, #8 \n" // w -= 8 285 "b.ge 1b \n" 286 287 // add 8 back to counter. if the result is 0 there are 288 // no residuals. 
289 "adds %w4, %w4, #8 \n" 290 "b.eq 4f \n" 291 292 // some residual, so between 1 and 7 lines left to transpose 293 "cmp %w4, #2 \n" 294 "b.lt 3f \n" 295 296 "cmp %w4, #4 \n" 297 "b.lt 2f \n" 298 299 // TODO(frkoenig): Clean this up 300 // 4x8 block 301 "mov %0, %1 \n" 302 "ld1 {v0.8b}, [%0], %5 \n" 303 "ld1 {v1.8b}, [%0], %5 \n" 304 "ld1 {v2.8b}, [%0], %5 \n" 305 "ld1 {v3.8b}, [%0], %5 \n" 306 "ld1 {v4.8b}, [%0], %5 \n" 307 "ld1 {v5.8b}, [%0], %5 \n" 308 "ld1 {v6.8b}, [%0], %5 \n" 309 "ld1 {v7.8b}, [%0] \n" 310 311 "ld1 {v30.16b}, [%8], #16 \n" 312 "ld1 {v31.16b}, [%8] \n" 313 314 "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" 315 "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" 316 "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" 317 "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" 318 319 "mov %0, %2 \n" 320 321 "st1 {v16.s}[0], [%0], %6 \n" 322 "st1 {v16.s}[1], [%0], %6 \n" 323 "st1 {v16.s}[2], [%0], %6 \n" 324 "st1 {v16.s}[3], [%0], %6 \n" 325 326 "add %0, %2, #4 \n" 327 "st1 {v18.s}[0], [%0], %6 \n" 328 "st1 {v18.s}[1], [%0], %6 \n" 329 "st1 {v18.s}[2], [%0], %6 \n" 330 "st1 {v18.s}[3], [%0] \n" 331 332 "mov %0, %3 \n" 333 334 "st1 {v17.s}[0], [%0], %7 \n" 335 "st1 {v17.s}[1], [%0], %7 \n" 336 "st1 {v17.s}[2], [%0], %7 \n" 337 "st1 {v17.s}[3], [%0], %7 \n" 338 339 "add %0, %3, #4 \n" 340 "st1 {v19.s}[0], [%0], %7 \n" 341 "st1 {v19.s}[1], [%0], %7 \n" 342 "st1 {v19.s}[2], [%0], %7 \n" 343 "st1 {v19.s}[3], [%0] \n" 344 345 "add %1, %1, #8 \n" // src += 4 * 2 346 "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * 347 // dst_stride_a 348 "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * 349 // dst_stride_b 350 "subs %w4, %w4, #4 \n" // w -= 4 351 "b.eq 4f \n" 352 353 // some residual, check to see if it includes a 2x8 block, 354 // or less 355 "cmp %w4, #2 \n" 356 "b.lt 3f \n" 357 358 // 2x8 block 359 "2: \n" 360 "mov %0, %1 \n" 361 "ld2 {v0.h, v1.h}[0], [%0], %5 \n" 362 "ld2 {v2.h, v3.h}[0], [%0], %5 \n" 363 "ld2 {v0.h, v1.h}[1], [%0], %5 \n" 364 "ld2 {v2.h, v3.h}[1], [%0], %5 \n" 365 "ld2 {v0.h, v1.h}[2], [%0], %5 \n" 366 "ld2 {v2.h, v3.h}[2], [%0], %5 \n" 367 "ld2 {v0.h, v1.h}[3], [%0], %5 \n" 368 "ld2 {v2.h, v3.h}[3], [%0] \n" 369 370 "trn1 v4.8b, v0.8b, v2.8b \n" 371 "trn2 v5.8b, v0.8b, v2.8b \n" 372 "trn1 v6.8b, v1.8b, v3.8b \n" 373 "trn2 v7.8b, v1.8b, v3.8b \n" 374 375 "mov %0, %2 \n" 376 377 "st1 {v4.d}[0], [%0], %6 \n" 378 "st1 {v6.d}[0], [%0] \n" 379 380 "mov %0, %3 \n" 381 382 "st1 {v5.d}[0], [%0], %7 \n" 383 "st1 {v7.d}[0], [%0] \n" 384 385 "add %1, %1, #4 \n" // src += 2 * 2 386 "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * 387 // dst_stride_a 388 "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * 389 // dst_stride_b 390 "subs %w4, %w4, #2 \n" // w -= 2 391 "b.eq 4f \n" 392 393 // 1x8 block 394 "3: \n" 395 "ld2 {v0.b, v1.b}[0], [%1], %5 \n" 396 "ld2 {v0.b, v1.b}[1], [%1], %5 \n" 397 "ld2 {v0.b, v1.b}[2], [%1], %5 \n" 398 "ld2 {v0.b, v1.b}[3], [%1], %5 \n" 399 "ld2 {v0.b, v1.b}[4], [%1], %5 \n" 400 "ld2 {v0.b, v1.b}[5], [%1], %5 \n" 401 "ld2 {v0.b, v1.b}[6], [%1], %5 \n" 402 "ld2 {v0.b, v1.b}[7], [%1] \n" 403 404 "st1 {v0.d}[0], [%2] \n" 405 "st1 {v1.d}[0], [%3] \n" 406 407 "4: \n" 408 409 : "=&r"(src_temp), // %0 410 "+r"(src), // %1 411 "+r"(dst_a), // %2 412 "+r"(dst_b), // %3 413 "+r"(width) // %4 414 : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 415 "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 416 "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 417 "r"(&kVTbl4x4TransposeDi) // %8 418 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", 
"v6", "v7", "v16", 419 "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); 418 420 } 419 421 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -
pjproject/trunk/third_party/yuv/source/rotate_win.cc
r5633 r5699 18 18 19 19 // This module is for 32 bit Visual C x86 and clangcl 20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) 20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 21 21 22 22 __declspec(naked) void TransposeWx8_SSSE3(const uint8* src, … … 173 173 lea eax, [eax + 8 * edi + 16] 174 174 neg edi 175 // Second round of bit swap.175 // Second round of bit swap. 176 176 movdqa xmm5, xmm0 177 177 punpcklwd xmm0, xmm2 … … 193 193 movdqa xmm7, xmm6 194 194 195 // Third round of bit swap.196 // Write to the destination pointer.195 // Third round of bit swap. 196 // Write to the destination pointer. 197 197 movdqa xmm6, xmm0 198 198 punpckldq xmm0, xmm4 -
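* Note: the reformatted aarch64 transposes above keep the same structure: the plane is processed in 8x8 blocks, and a residual width of 1..7 is handled by dedicated 4-, 2- and 1-column tails. A scalar sketch of that decomposition (illustrative only; TransposeBlock/TransposeWx8_Ref are not libyuv functions):
--------------------------------------------------------------------------------------
#include <stdint.h>

/* Transpose a cols x 8 block: destination row index = source column index. */
static void TransposeBlock(const uint8_t* src, int src_stride,
                           uint8_t* dst, int dst_stride,
                           int cols, int rows) {
  for (int c = 0; c < cols; ++c) {
    for (int r = 0; r < rows; ++r) {
      dst[c * dst_stride + r] = src[r * src_stride + c];
    }
  }
}

static void TransposeWx8_Ref(const uint8_t* src, int src_stride,
                             uint8_t* dst, int dst_stride, int width) {
  int x = 0;
  for (; x + 8 <= width; x += 8) {               /* 8x8 blocks: the common case */
    TransposeBlock(src + x, src_stride, dst + x * dst_stride, dst_stride, 8, 8);
  }
  for (int step = 4; step >= 1; step /= 2) {     /* 4-, 2-, then 1-column residual */
    if (width - x >= step) {
      TransposeBlock(src + x, src_stride, dst + x * dst_stride, dst_stride,
                     step, 8);
      x += step;
    }
  }
}
--------------------------------------------------------------------------------------
  Separately, the rotate_win.cc guard now also requires _MSC_VER, so the __declspec(naked)/__asm TransposeWx8_SSSE3 is only compiled by MSVC-compatible compilers; other x86 toolchains presumably fall back to the portable or gcc-style implementations.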
pjproject/trunk/third_party/yuv/source/row_any.cc
r5633 r5699 85 85 SS(r, DUVSHIFT) * BPP); \ 86 86 } 87 88 // Merge functions. 89 #ifdef HAS_MERGERGBROW_SSSE3 90 ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) 91 #endif 92 #ifdef HAS_MERGERGBROW_NEON 93 ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) 94 #endif 87 95 #ifdef HAS_I422TOYUY2ROW_SSE2 88 96 ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) … … 622 630 ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) 623 631 #endif 632 #ifdef HAS_ARGBEXTRACTALPHAROW_MSA 633 ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) 634 #endif 624 635 #undef ANY11 625 636 … … 746 757 ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 2, 2, 7) 747 758 ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 2, 2, 7) 759 #endif 760 #ifdef HAS_HALFFLOATROW_MSA 761 ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, float, 2, 2, 31) 748 762 #endif 749 763 #undef ANY11P16 … … 912 926 ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15) 913 927 #endif 928 #ifdef HAS_SPLITUVROW_MSA 929 ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31) 930 #endif 914 931 #ifdef HAS_ARGBTOUV444ROW_SSSE3 915 932 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) … … 934 951 #endif 935 952 #undef ANY12 953 954 // Any 1 to 3. Outputs RGB planes. 955 #define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ 956 void NAMEANY(const uint8* src_ptr, uint8* dst_r, uint8* dst_g, uint8* dst_b, \ 957 int width) { \ 958 SIMD_ALIGNED(uint8 temp[16 * 6]); \ 959 memset(temp, 0, 16 * 3); /* for msan */ \ 960 int r = width & MASK; \ 961 int n = width & ~MASK; \ 962 if (n > 0) { \ 963 ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ 964 } \ 965 memcpy(temp, src_ptr + n * BPP, r * BPP); \ 966 ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ 967 memcpy(dst_r + n, temp + 16 * 3, r); \ 968 memcpy(dst_g + n, temp + 16 * 4, r); \ 969 memcpy(dst_b + n, temp + 16 * 5, r); \ 970 } 971 972 #ifdef HAS_SPLITRGBROW_SSSE3 973 ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) 974 #endif 975 #ifdef HAS_SPLITRGBROW_NEON 976 ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) 977 #endif 936 978 937 979 // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. -
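* Note: the new ANY31/ANY13 wrappers above follow the usual row_any.cc pattern: the SIMD kernel handles the width rounded down to its block size, and the remaining pixels are staged through a small zeroed temp buffer so the same kernel can be reused for the tail. A sketch of that idea for the 1-to-3 (split RGB) case, with SplitRGB16 standing in for a real SIMD kernel that requires the width to be a multiple of 16:
--------------------------------------------------------------------------------------
#include <stdint.h>
#include <string.h>

/* Stand-in for a SIMD kernel that only supports widths that are multiples of 16. */
static void SplitRGB16(const uint8_t* src, uint8_t* r, uint8_t* g, uint8_t* b,
                       int width) {
  for (int x = 0; x < width; ++x) {
    r[x] = src[3 * x + 0];
    g[x] = src[3 * x + 1];
    b[x] = src[3 * x + 2];
  }
}

static void SplitRGB_Any(const uint8_t* src, uint8_t* r, uint8_t* g,
                         uint8_t* b, int width) {
  enum { kMask = 15, kBpp = 3 };
  uint8_t temp[16 * 6];                        /* staging: input + 3 output planes */
  memset(temp, 0, 16 * 3);                     /* defined bytes for the tail run */
  int rem = width & kMask;
  int n = width & ~kMask;
  if (n > 0) {
    SplitRGB16(src, r, g, b, n);               /* bulk part, full blocks only */
  }
  if (rem > 0) {
    memcpy(temp, src + n * kBpp, rem * kBpp);  /* stage the tail pixels */
    SplitRGB16(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, kMask + 1);
    memcpy(r + n, temp + 16 * 3, rem);         /* copy back only the valid pixels */
    memcpy(g + n, temp + 16 * 4, rem);
    memcpy(b + n, temp + 16 * 5, rem);
  }
}
--------------------------------------------------------------------------------------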
pjproject/trunk/third_party/yuv/source/row_common.cc
r5633 r5699 1771 1771 } 1772 1772 1773 void SplitRGBRow_C(const uint8* src_rgb, 1774 uint8* dst_r, 1775 uint8* dst_g, 1776 uint8* dst_b, 1777 int width) { 1778 int x; 1779 for (x = 0; x < width; ++x) { 1780 dst_r[x] = src_rgb[0]; 1781 dst_g[x] = src_rgb[1]; 1782 dst_b[x] = src_rgb[2]; 1783 src_rgb += 3; 1784 } 1785 } 1786 1787 void MergeRGBRow_C(const uint8* src_r, 1788 const uint8* src_g, 1789 const uint8* src_b, 1790 uint8* dst_rgb, 1791 int width) { 1792 int x; 1793 for (x = 0; x < width; ++x) { 1794 dst_rgb[0] = src_r[x]; 1795 dst_rgb[1] = src_g[x]; 1796 dst_rgb[2] = src_b[x]; 1797 dst_rgb += 3; 1798 } 1799 } 1800 1801 void MergeUVRow_16_C(const uint16* src_u, 1802 const uint16* src_v, 1803 uint16* dst_uv, 1804 int scale, 1805 int width) { 1806 int x; 1807 for (x = 0; x < width - 1; x += 2) { 1808 dst_uv[0] = src_u[x] * scale; 1809 dst_uv[1] = src_v[x] * scale; 1810 dst_uv[2] = src_u[x + 1] * scale; 1811 dst_uv[3] = src_v[x + 1] * scale; 1812 dst_uv += 4; 1813 } 1814 if (width & 1) { 1815 dst_uv[0] = src_u[width - 1] * scale; 1816 dst_uv[1] = src_v[width - 1] * scale; 1817 } 1818 } 1819 1820 void MultiplyRow_16_C(const uint16* src_y, 1821 uint16* dst_y, 1822 int scale, 1823 int width) { 1824 int x; 1825 for (x = 0; x < width; ++x) { 1826 dst_y[x] = src_y[x] * scale; 1827 } 1828 } 1829 1773 1830 void CopyRow_C(const uint8* src, uint8* dst, int count) { 1774 1831 memcpy(dst, src, count); … … 2640 2697 #endif 2641 2698 2699 float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { 2700 float fsum = 0.f; 2701 int i; 2702 #if defined(__clang__) 2703 #pragma clang loop vectorize_width(4) 2704 #endif 2705 for (i = 0; i < width; ++i) { 2706 float v = *src++; 2707 fsum += v * v; 2708 *dst++ = v * scale; 2709 } 2710 return fsum; 2711 } 2712 2713 float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) { 2714 float fmax = 0.f; 2715 int i; 2716 for (i = 0; i < width; ++i) { 2717 float v = *src++; 2718 float vs = v * scale; 2719 fmax = (v > fmax) ? v : fmax; 2720 *dst++ = vs; 2721 } 2722 return fmax; 2723 } 2724 2725 void ScaleSamples_C(const float* src, float* dst, float scale, int width) { 2726 int i; 2727 for (i = 0; i < width; ++i) { 2728 *dst++ = *src++ * scale; 2729 } 2730 } 2731 2732 void GaussRow_C(const uint32* src, uint16* dst, int width) { 2733 int i; 2734 for (i = 0; i < width; ++i) { 2735 *dst++ = 2736 (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8; 2737 ++src; 2738 } 2739 } 2740 2741 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 2742 void GaussCol_C(const uint16* src0, 2743 const uint16* src1, 2744 const uint16* src2, 2745 const uint16* src3, 2746 const uint16* src4, 2747 uint32* dst, 2748 int width) { 2749 int i; 2750 for (i = 0; i < width; ++i) { 2751 *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; 2752 } 2753 } 2754 2642 2755 #ifdef __cplusplus 2643 2756 } // extern "C" -
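* Note on the new portable paths above: MergeUVRow_16_C and MultiplyRow_16_C take a caller-supplied scale that shifts least-significant-bit data up toward the most significant bits (for example 64 for 10-bit samples, or 1 to leave full 16-bit data unchanged), and GaussRow_C/GaussCol_C form a separable 5-tap [1 4 6 4 1] Gaussian: each pass has weight 16, so the combined 2D weight is 256, which is why the row pass adds 128 and shifts right by 8. A small worked example of that normalization (GaussCol1/GaussRow1 are local helpers, not libyuv functions):
--------------------------------------------------------------------------------------
#include <stdint.h>
#include <stdio.h>

/* One output of the column pass: weights 1 4 6 4 1 over five rows (sum 16). */
static uint32_t GaussCol1(const uint16_t* c) {
  return c[0] + c[1] * 4 + c[2] * 6 + c[3] * 4 + c[4];
}

/* One output of the row pass: another weight-16 pass, then round and >> 8. */
static uint16_t GaussRow1(const uint32_t* r) {
  return (uint16_t)((r[0] + r[1] * 4 + r[2] * 6 + r[3] * 4 + r[4] + 128) >> 8);
}

int main(void) {
  /* A flat patch of value 200: blurring a constant image returns the constant. */
  uint16_t col[5] = {200, 200, 200, 200, 200};
  uint32_t row[5];
  for (int i = 0; i < 5; ++i) {
    row[i] = GaussCol1(col);                   /* 200 * 16 = 3200 per column */
  }
  printf("%u\n", (unsigned)GaussRow1(row));    /* (3200 * 16 + 128) >> 8 = 200 */
  return 0;
}
--------------------------------------------------------------------------------------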
pjproject/trunk/third_party/yuv/source/row_gcc.cc
r5633 r5699 39 39 127, -84, -43, 0, 127, -84, -43, 0}; 40 40 41 static vec8 kARGBToV = { 42 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, 43 }; 41 static vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, 42 -18, -94, 112, 0, -18, -94, 112, 0}; 44 43 45 44 static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, … … 2754 2753 } 2755 2754 #endif // HAS_MERGEUVROW_SSE2 2755 2756 // Use scale to convert lsb formats to msb, depending how many bits there are: 2757 // 128 = 9 bits 2758 // 64 = 10 bits 2759 // 16 = 12 bits 2760 // 1 = 16 bits 2761 #ifdef HAS_MERGEUVROW_16_AVX2 2762 void MergeUVRow_16_AVX2(const uint16* src_u, 2763 const uint16* src_v, 2764 uint16* dst_uv, 2765 int scale, 2766 int width) { 2767 // clang-format off 2768 asm volatile ( 2769 "vmovd %4,%%xmm3 \n" 2770 "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" 2771 "vbroadcastss %%xmm3,%%ymm3 \n" 2772 "sub %0,%1 \n" 2773 2774 // 16 pixels per loop. 2775 LABELALIGN 2776 "1: \n" 2777 "vmovdqu (%0),%%ymm0 \n" 2778 "vmovdqu (%0,%1,1),%%ymm1 \n" 2779 "add $0x20,%0 \n" 2780 2781 "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" 2782 "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" 2783 "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates 2784 "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" 2785 "vextractf128 $0x0,%%ymm2,(%2) \n" 2786 "vextractf128 $0x0,%%ymm0,0x10(%2) \n" 2787 "vextractf128 $0x1,%%ymm2,0x20(%2) \n" 2788 "vextractf128 $0x1,%%ymm0,0x30(%2) \n" 2789 "add $0x40,%2 \n" 2790 "sub $0x10,%3 \n" 2791 "jg 1b \n" 2792 "vzeroupper \n" 2793 : "+r"(src_u), // %0 2794 "+r"(src_v), // %1 2795 "+r"(dst_uv), // %2 2796 "+r"(width) // %3 2797 : "r"(scale) // %4 2798 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); 2799 // clang-format on 2800 } 2801 #endif // HAS_MERGEUVROW_AVX2 2802 2803 #ifdef HAS_MULTIPLYROW_16_AVX2 2804 void MultiplyRow_16_AVX2(const uint16* src_y, 2805 uint16* dst_y, 2806 int scale, 2807 int width) { 2808 // clang-format off 2809 asm volatile ( 2810 "vmovd %3,%%xmm3 \n" 2811 "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" 2812 "vbroadcastss %%xmm3,%%ymm3 \n" 2813 "sub %0,%1 \n" 2814 2815 // 16 pixels per loop. 2816 LABELALIGN 2817 "1: \n" 2818 "vmovdqu (%0),%%ymm0 \n" 2819 "vmovdqu 0x20(%0),%%ymm1 \n" 2820 "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" 2821 "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" 2822 "vmovdqu %%ymm0,(%0,%1) \n" 2823 "vmovdqu %%ymm1,0x20(%0,%1) \n" 2824 "add $0x40,%0 \n" 2825 "sub $0x20,%2 \n" 2826 "jg 1b \n" 2827 "vzeroupper \n" 2828 : "+r"(src_y), // %0 2829 "+r"(dst_y), // %1 2830 "+r"(width) // %2 2831 : "r"(scale) // %3 2832 : "memory", "cc", "xmm0", "xmm1", "xmm3"); 2833 // clang-format on 2834 } 2835 #endif // HAS_MULTIPLYROW_16_AVX2 2836 2837 #ifdef HAS_SPLITRGBROW_SSSE3 2838 2839 // Shuffle table for converting RGB to Planar. 
2840 static uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u, 2841 128u, 128u, 128u, 128u, 128u, 128u, 2842 128u, 128u, 128u, 128u}; 2843 static uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u, 2844 2u, 5u, 8u, 11u, 14u, 128u, 2845 128u, 128u, 128u, 128u}; 2846 static uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u, 2847 128u, 128u, 128u, 128u, 128u, 1u, 2848 4u, 7u, 10u, 13u}; 2849 2850 static uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u, 2851 128u, 128u, 128u, 128u, 128u, 128u, 2852 128u, 128u, 128u, 128u}; 2853 static uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u, 2854 3u, 6u, 9u, 12u, 15u, 128u, 2855 128u, 128u, 128u, 128u}; 2856 static uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u, 2857 128u, 128u, 128u, 128u, 128u, 2u, 2858 5u, 8u, 11u, 14u}; 2859 2860 static uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u, 2861 128u, 128u, 128u, 128u, 128u, 128u, 2862 128u, 128u, 128u, 128u}; 2863 static uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u, 2864 4u, 7u, 10u, 13u, 128u, 128u, 2865 128u, 128u, 128u, 128u}; 2866 static uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u, 2867 128u, 128u, 128u, 128u, 0u, 3u, 2868 6u, 9u, 12u, 15u}; 2869 2870 void SplitRGBRow_SSSE3(const uint8* src_rgb, 2871 uint8* dst_r, 2872 uint8* dst_g, 2873 uint8* dst_b, 2874 int width) { 2875 asm volatile ( 2876 LABELALIGN 2877 "1: \n" 2878 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2879 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 2880 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 2881 "pshufb %5, %%xmm0 \n" 2882 "pshufb %6, %%xmm1 \n" 2883 "pshufb %7, %%xmm2 \n" 2884 "por %%xmm1,%%xmm0 \n" 2885 "por %%xmm2,%%xmm0 \n" 2886 "movdqu %%xmm0," MEMACCESS(1) " \n" 2887 "lea " MEMLEA(0x10,1) ",%1 \n" 2888 2889 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2890 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 2891 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 2892 "pshufb %8, %%xmm0 \n" 2893 "pshufb %9, %%xmm1 \n" 2894 "pshufb %10, %%xmm2 \n" 2895 "por %%xmm1,%%xmm0 \n" 2896 "por %%xmm2,%%xmm0 \n" 2897 "movdqu %%xmm0," MEMACCESS(2) " \n" 2898 "lea " MEMLEA(0x10,2) ",%2 \n" 2899 2900 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2901 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 2902 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 2903 "pshufb %11, %%xmm0 \n" 2904 "pshufb %12, %%xmm1 \n" 2905 "pshufb %13, %%xmm2 \n" 2906 "por %%xmm1,%%xmm0 \n" 2907 "por %%xmm2,%%xmm0 \n" 2908 "movdqu %%xmm0," MEMACCESS(3) " \n" 2909 "lea " MEMLEA(0x10,3) ",%3 \n" 2910 "lea " MEMLEA(0x30,0) ",%0 \n" 2911 "sub $0x10,%4 \n" 2912 "jg 1b \n" 2913 : "+r"(src_rgb), // %0 2914 "+r"(dst_r), // %1 2915 "+r"(dst_g), // %2 2916 "+r"(dst_b), // %3 2917 "+r"(width) // %4 2918 : "m"(kShuffleMaskRGBToR0), // %5 2919 "m"(kShuffleMaskRGBToR1), // %6 2920 "m"(kShuffleMaskRGBToR2), // %7 2921 "m"(kShuffleMaskRGBToG0), // %8 2922 "m"(kShuffleMaskRGBToG1), // %9 2923 "m"(kShuffleMaskRGBToG2), // %10 2924 "m"(kShuffleMaskRGBToB0), // %11 2925 "m"(kShuffleMaskRGBToB1), // %12 2926 "m"(kShuffleMaskRGBToB2) // %13 2927 : "memory", "cc", NACL_R14 2928 "xmm0", "xmm1", "xmm2" 2929 ); 2930 } 2931 #endif // HAS_SPLITRGBROW_SSSE3 2932 2933 #ifdef HAS_MERGERGBROW_SSSE3 2934 2935 // Shuffle table for converting RGB to Planar. 
2936 static uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u, 2937 2u, 128u, 128u, 3u, 128u, 128u, 2938 4u, 128u, 128u, 5u}; 2939 static uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u, 2940 128u, 2u, 128u, 128u, 3u, 128u, 2941 128u, 4u, 128u, 128u}; 2942 static uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u, 2943 128u, 128u, 2u, 128u, 128u, 3u, 2944 128u, 128u, 4u, 128u}; 2945 2946 static uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u, 2947 7u, 128u, 128u, 8u, 128u, 128u, 2948 9u, 128u, 128u, 10u}; 2949 static uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u, 2950 128u, 7u, 128u, 128u, 8u, 128u, 2951 128u, 9u, 128u, 128u}; 2952 static uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u, 2953 128u, 128u, 8u, 128u, 128u, 9u, 2954 128u, 128u, 10u, 128u}; 2955 2956 static uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u, 2957 12u, 128u, 128u, 13u, 128u, 128u, 2958 14u, 128u, 128u, 15u}; 2959 static uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u, 2960 128u, 13u, 128u, 128u, 14u, 128u, 2961 128u, 15u, 128u, 128u}; 2962 static uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u, 2963 128u, 128u, 13u, 128u, 128u, 14u, 2964 128u, 128u, 15u, 128u}; 2965 2966 void MergeRGBRow_SSSE3(const uint8* src_r, 2967 const uint8* src_g, 2968 const uint8* src_b, 2969 uint8* dst_rgb, 2970 int width) { 2971 asm volatile ( 2972 LABELALIGN 2973 "1: \n" 2974 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2975 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 2976 "movdqu " MEMACCESS(2) ",%%xmm2 \n" 2977 "pshufb %5, %%xmm0 \n" 2978 "pshufb %6, %%xmm1 \n" 2979 "pshufb %7, %%xmm2 \n" 2980 "por %%xmm1,%%xmm0 \n" 2981 "por %%xmm2,%%xmm0 \n" 2982 "movdqu %%xmm0," MEMACCESS(3) " \n" 2983 2984 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2985 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 2986 "movdqu " MEMACCESS(2) ",%%xmm2 \n" 2987 "pshufb %8, %%xmm0 \n" 2988 "pshufb %9, %%xmm1 \n" 2989 "pshufb %10, %%xmm2 \n" 2990 "por %%xmm1,%%xmm0 \n" 2991 "por %%xmm2,%%xmm0 \n" 2992 "movdqu %%xmm0," MEMACCESS2(16, 3) " \n" 2993 2994 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2995 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 2996 "movdqu " MEMACCESS(2) ",%%xmm2 \n" 2997 "pshufb %11, %%xmm0 \n" 2998 "pshufb %12, %%xmm1 \n" 2999 "pshufb %13, %%xmm2 \n" 3000 "por %%xmm1,%%xmm0 \n" 3001 "por %%xmm2,%%xmm0 \n" 3002 "movdqu %%xmm0," MEMACCESS2(32, 3) " \n" 3003 3004 "lea " MEMLEA(0x10,0) ",%0 \n" 3005 "lea " MEMLEA(0x10,1) ",%1 \n" 3006 "lea " MEMLEA(0x10,2) ",%2 \n" 3007 "lea " MEMLEA(0x30,3) ",%3 \n" 3008 "sub $0x10,%4 \n" 3009 "jg 1b \n" 3010 : "+r"(src_r), // %0 3011 "+r"(src_g), // %1 3012 "+r"(src_b), // %2 3013 "+r"(dst_rgb), // %3 3014 "+r"(width) // %4 3015 : "m"(kShuffleMaskRToRGB0), // %5 3016 "m"(kShuffleMaskGToRGB0), // %6 3017 "m"(kShuffleMaskBToRGB0), // %7 3018 "m"(kShuffleMaskRToRGB1), // %8 3019 "m"(kShuffleMaskGToRGB1), // %9 3020 "m"(kShuffleMaskBToRGB1), // %10 3021 "m"(kShuffleMaskRToRGB2), // %11 3022 "m"(kShuffleMaskGToRGB2), // %12 3023 "m"(kShuffleMaskBToRGB2) // %13 3024 : "memory", "cc", NACL_R14 3025 "xmm0", "xmm1", "xmm2" 3026 ); 3027 } 3028 #endif // HAS_MERGERGBROW_SSSE3 2756 3029 2757 3030 #ifdef HAS_COPYROW_SSE2 … … 5454 5727 static float kScaleBias = 1.9259299444e-34f; 5455 5728 void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) { 5729 scale *= kScaleBias; 5456 5730 asm volatile ( 5457 5731 "pshufd $0x0,%3,%%xmm4 \n" … … 5480 5754 "+r"(dst), // %1 5481 5755 "+r"(width) // %2 5482 : "x"(scale * kScaleBias) // %3 5756 #if 
defined(__x86_64__) 5757 : "x"(scale) // %3 5758 #else 5759 : "m"(scale) // %3 5760 #endif 5483 5761 : "memory", "cc", 5484 5762 "xmm2", "xmm3", "xmm4", "xmm5" … … 5489 5767 #ifdef HAS_HALFFLOATROW_AVX2 5490 5768 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { 5769 scale *= kScaleBias; 5491 5770 asm volatile ( 5492 5771 "vbroadcastss %3, %%ymm4 \n" … … 5516 5795 "+r"(dst), // %1 5517 5796 "+r"(width) // %2 5518 : "x"(scale * kScaleBias) // %3 5797 #if defined(__x86_64__) 5798 : "x"(scale) // %3 5799 #else 5800 : "m"(scale) // %3 5801 #endif 5519 5802 : "memory", "cc", 5520 5803 "xmm2", "xmm3", "xmm4", "xmm5" … … 5549 5832 "+r"(dst), // %1 5550 5833 "+r"(width) // %2 5834 #if defined(__x86_64__) 5551 5835 : "x"(scale) // %3 5836 #else 5837 : "m"(scale) // %3 5838 #endif 5552 5839 : "memory", "cc", 5553 5840 "xmm2", "xmm3", "xmm4" -
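* Note: the HalfFloatRow_SSE2/AVX2 changes above fold kScaleBias into scale before the asm runs, and pass it as an "x" register operand on x86_64 but as a memory operand on 32-bit builds. kScaleBias (1.9259299444e-34f) is 2^-112, which re-biases the single-precision exponent (bias 127) down to the half-precision bias (15); after that multiply, dropping the low 13 mantissa bits leaves the IEEE half-float bit pattern in the low 16 bits. A scalar sketch of the same conversion, assuming inputs that stay in normal half-float range (no rounding or NaN/Inf handling), with ToHalf as a local helper:
--------------------------------------------------------------------------------------
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Scalar form of the float-to-half trick used by the HalfFloatRow kernels. */
static uint16_t ToHalf(uint16_t src, float scale) {
  const float kScaleBias = 1.9259299444e-34f;   /* 2^-112: bias 127 -> bias 15 */
  float v = (float)src * (scale * kScaleBias);
  uint32_t bits;
  memcpy(&bits, &v, sizeof(bits));              /* reinterpret the float bits */
  return (uint16_t)(bits >> 13);                /* drop 13 mantissa bits */
}

int main(void) {
  /* 1024 scaled by 1/1024 is 1.0; its half-float encoding is 0x3C00. */
  printf("0x%04X\n", (unsigned)ToHalf(1024, 1.0f / 1024.0f));
  return 0;
}
--------------------------------------------------------------------------------------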
pjproject/trunk/third_party/yuv/source/row_msa.cc
r5633 r5699 2918 2918 void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) { 2919 2919 int x; 2920 v 16u8 dst0 = (v16u8)__msa_fill_w(v32);2920 v4i32 dst0 = __builtin_msa_fill_w(v32); 2921 2921 2922 2922 for (x = 0; x < width; x += 4) { … … 2970 2970 } 2971 2971 2972 void ARGBExtractAlphaRow_MSA(const uint8* src_argb, uint8* dst_a, int width) { 2973 int i; 2974 v16u8 src0, src1, src2, src3, vec0, vec1, dst0; 2975 2976 for (i = 0; i < width; i += 16) { 2977 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); 2978 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); 2979 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32); 2980 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48); 2981 vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); 2982 vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); 2983 dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); 2984 ST_UB(dst0, dst_a); 2985 src_argb += 64; 2986 dst_a += 16; 2987 } 2988 } 2989 2990 void ARGBBlendRow_MSA(const uint8* src_argb0, 2991 const uint8* src_argb1, 2992 uint8* dst_argb, 2993 int width) { 2994 int x; 2995 v16u8 src0, src1, src2, src3, dst0, dst1; 2996 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2997 v8u16 vec8, vec9, vec10, vec11, vec12, vec13; 2998 v8u16 const_256 = (v8u16)__msa_ldi_h(256); 2999 v16u8 const_255 = (v16u8)__msa_ldi_b(255); 3000 v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; 3001 v16i8 zero = {0}; 3002 3003 for (x = 0; x < width; x += 8) { 3004 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); 3005 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); 3006 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0); 3007 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16); 3008 vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); 3009 vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); 3010 vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); 3011 vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); 3012 vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); 3013 vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); 3014 vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3); 3015 vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3); 3016 vec8 = (v8u16)__msa_fill_h(vec0[3]); 3017 vec9 = (v8u16)__msa_fill_h(vec0[7]); 3018 vec10 = (v8u16)__msa_fill_h(vec1[3]); 3019 vec11 = (v8u16)__msa_fill_h(vec1[7]); 3020 vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); 3021 vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); 3022 vec10 = (v8u16)__msa_fill_h(vec2[3]); 3023 vec11 = (v8u16)__msa_fill_h(vec2[7]); 3024 vec12 = (v8u16)__msa_fill_h(vec3[3]); 3025 vec13 = (v8u16)__msa_fill_h(vec3[7]); 3026 vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); 3027 vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12); 3028 vec8 = const_256 - vec8; 3029 vec9 = const_256 - vec9; 3030 vec10 = const_256 - vec10; 3031 vec11 = const_256 - vec11; 3032 vec8 *= vec4; 3033 vec9 *= vec5; 3034 vec10 *= vec6; 3035 vec11 *= vec7; 3036 vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8); 3037 vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); 3038 vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); 3039 vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); 3040 vec0 += vec8; 3041 vec1 += vec9; 3042 vec2 += vec10; 3043 vec3 += vec11; 3044 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 3045 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); 3046 dst0 = __msa_bmnz_v(dst0, const_255, mask); 3047 dst1 = __msa_bmnz_v(dst1, const_255, mask); 3048 ST_UB2(dst0, dst1, dst_argb, 16); 3049 src_argb0 += 32; 3050 src_argb1 += 32; 3051 dst_argb += 32; 3052 } 3053 } 3054 3055 void ARGBQuantizeRow_MSA(uint8* dst_argb, 
3056 int scale, 3057 int interval_size, 3058 int interval_offset, 3059 int width) { 3060 int x; 3061 v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; 3062 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3063 v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3064 v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; 3065 v4i32 vec_scale = __msa_fill_w(scale); 3066 v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size); 3067 v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset); 3068 v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; 3069 v16i8 zero = {0}; 3070 3071 for (x = 0; x < width; x += 8) { 3072 src0 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 0); 3073 src1 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 16); 3074 src2 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 32); 3075 src3 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 48); 3076 vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0); 3077 vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0); 3078 vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); 3079 vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); 3080 vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); 3081 vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); 3082 vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3); 3083 vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3); 3084 tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); 3085 tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); 3086 tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); 3087 tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); 3088 tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2); 3089 tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2); 3090 tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3); 3091 tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3); 3092 tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4); 3093 tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4); 3094 tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5); 3095 tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5); 3096 tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6); 3097 tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6); 3098 tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7); 3099 tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7); 3100 tmp0 *= vec_scale; 3101 tmp1 *= vec_scale; 3102 tmp2 *= vec_scale; 3103 tmp3 *= vec_scale; 3104 tmp4 *= vec_scale; 3105 tmp5 *= vec_scale; 3106 tmp6 *= vec_scale; 3107 tmp7 *= vec_scale; 3108 tmp8 *= vec_scale; 3109 tmp9 *= vec_scale; 3110 tmp10 *= vec_scale; 3111 tmp11 *= vec_scale; 3112 tmp12 *= vec_scale; 3113 tmp13 *= vec_scale; 3114 tmp14 *= vec_scale; 3115 tmp15 *= vec_scale; 3116 tmp0 >>= 16; 3117 tmp1 >>= 16; 3118 tmp2 >>= 16; 3119 tmp3 >>= 16; 3120 tmp4 >>= 16; 3121 tmp5 >>= 16; 3122 tmp6 >>= 16; 3123 tmp7 >>= 16; 3124 tmp8 >>= 16; 3125 tmp9 >>= 16; 3126 tmp10 >>= 16; 3127 tmp11 >>= 16; 3128 tmp12 >>= 16; 3129 tmp13 >>= 16; 3130 tmp14 >>= 16; 3131 tmp15 >>= 16; 3132 vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); 3133 vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); 3134 vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); 3135 vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); 3136 vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); 3137 vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); 3138 vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); 3139 vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); 3140 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 3141 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); 3142 dst2 = 
(v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); 3143 dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); 3144 dst0 *= vec_int_sz; 3145 dst1 *= vec_int_sz; 3146 dst2 *= vec_int_sz; 3147 dst3 *= vec_int_sz; 3148 dst0 += vec_int_ofst; 3149 dst1 += vec_int_ofst; 3150 dst2 += vec_int_ofst; 3151 dst3 += vec_int_ofst; 3152 dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0); 3153 dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1); 3154 dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2); 3155 dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3); 3156 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); 3157 dst_argb += 64; 3158 } 3159 } 3160 3161 void ARGBColorMatrixRow_MSA(const uint8* src_argb, 3162 uint8* dst_argb, 3163 const int8* matrix_argb, 3164 int width) { 3165 int32 x; 3166 v16i8 src0; 3167 v16u8 src1, src2, dst0, dst1; 3168 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 3169 v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; 3170 v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3171 v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; 3172 v16i8 zero = {0}; 3173 v8i16 max = __msa_ldi_h(255); 3174 3175 src0 = __msa_ld_b((v16i8*)matrix_argb, 0); 3176 vec0 = (v8i16)__msa_ilvr_b(zero, src0); 3177 vec1 = (v8i16)__msa_ilvl_b(zero, src0); 3178 3179 for (x = 0; x < width; x += 8) { 3180 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); 3181 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); 3182 vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); 3183 vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); 3184 vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); 3185 vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); 3186 vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2); 3187 vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3); 3188 vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4); 3189 vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5); 3190 vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2); 3191 vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3); 3192 vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4); 3193 vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5); 3194 vec10 = vec2 * vec0; 3195 vec11 = vec2 * vec1; 3196 vec12 = vec6 * vec0; 3197 vec13 = vec6 * vec1; 3198 tmp0 = __msa_hadd_s_w(vec10, vec10); 3199 tmp1 = __msa_hadd_s_w(vec11, vec11); 3200 tmp2 = __msa_hadd_s_w(vec12, vec12); 3201 tmp3 = __msa_hadd_s_w(vec13, vec13); 3202 vec14 = vec3 * vec0; 3203 vec15 = vec3 * vec1; 3204 vec16 = vec7 * vec0; 3205 vec17 = vec7 * vec1; 3206 tmp4 = __msa_hadd_s_w(vec14, vec14); 3207 tmp5 = __msa_hadd_s_w(vec15, vec15); 3208 tmp6 = __msa_hadd_s_w(vec16, vec16); 3209 tmp7 = __msa_hadd_s_w(vec17, vec17); 3210 vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); 3211 vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); 3212 vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); 3213 vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); 3214 tmp0 = __msa_hadd_s_w(vec10, vec10); 3215 tmp1 = __msa_hadd_s_w(vec11, vec11); 3216 tmp2 = __msa_hadd_s_w(vec12, vec12); 3217 tmp3 = __msa_hadd_s_w(vec13, vec13); 3218 tmp0 = __msa_srai_w(tmp0, 6); 3219 tmp1 = __msa_srai_w(tmp1, 6); 3220 tmp2 = __msa_srai_w(tmp2, 6); 3221 tmp3 = __msa_srai_w(tmp3, 6); 3222 vec2 = vec4 * vec0; 3223 vec6 = vec4 * vec1; 3224 vec3 = vec8 * vec0; 3225 vec7 = vec8 * vec1; 3226 tmp8 = __msa_hadd_s_w(vec2, vec2); 3227 tmp9 = __msa_hadd_s_w(vec6, vec6); 3228 tmp10 = __msa_hadd_s_w(vec3, vec3); 3229 tmp11 = __msa_hadd_s_w(vec7, vec7); 3230 vec4 = vec5 * vec0; 3231 vec8 = vec5 * vec1; 3232 vec5 = vec9 
* vec0; 3233 vec9 = vec9 * vec1; 3234 tmp12 = __msa_hadd_s_w(vec4, vec4); 3235 tmp13 = __msa_hadd_s_w(vec8, vec8); 3236 tmp14 = __msa_hadd_s_w(vec5, vec5); 3237 tmp15 = __msa_hadd_s_w(vec9, vec9); 3238 vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); 3239 vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); 3240 vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); 3241 vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); 3242 tmp4 = __msa_hadd_s_w(vec14, vec14); 3243 tmp5 = __msa_hadd_s_w(vec15, vec15); 3244 tmp6 = __msa_hadd_s_w(vec16, vec16); 3245 tmp7 = __msa_hadd_s_w(vec17, vec17); 3246 tmp4 = __msa_srai_w(tmp4, 6); 3247 tmp5 = __msa_srai_w(tmp5, 6); 3248 tmp6 = __msa_srai_w(tmp6, 6); 3249 tmp7 = __msa_srai_w(tmp7, 6); 3250 vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); 3251 vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); 3252 vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); 3253 vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); 3254 vec10 = __msa_maxi_s_h(vec10, 0); 3255 vec11 = __msa_maxi_s_h(vec11, 0); 3256 vec12 = __msa_maxi_s_h(vec12, 0); 3257 vec13 = __msa_maxi_s_h(vec13, 0); 3258 vec10 = __msa_min_s_h(vec10, max); 3259 vec11 = __msa_min_s_h(vec11, max); 3260 vec12 = __msa_min_s_h(vec12, max); 3261 vec13 = __msa_min_s_h(vec13, max); 3262 dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); 3263 dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12); 3264 ST_UB2(dst0, dst1, dst_argb, 16); 3265 src_argb += 32; 3266 dst_argb += 32; 3267 } 3268 } 3269 3270 void SplitUVRow_MSA(const uint8* src_uv, 3271 uint8* dst_u, 3272 uint8* dst_v, 3273 int width) { 3274 int x; 3275 v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; 3276 3277 for (x = 0; x < width; x += 32) { 3278 src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0); 3279 src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16); 3280 src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32); 3281 src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48); 3282 dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); 3283 dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); 3284 dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); 3285 dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); 3286 ST_UB2(dst0, dst1, dst_u, 16); 3287 ST_UB2(dst2, dst3, dst_v, 16); 3288 src_uv += 64; 3289 dst_u += 32; 3290 dst_v += 32; 3291 } 3292 } 3293 3294 void SetRow_MSA(uint8* dst, uint8 v8, int width) { 3295 int x; 3296 v16u8 dst0 = (v16u8)__msa_fill_b(v8); 3297 3298 for (x = 0; x < width; x += 16) { 3299 ST_UB(dst0, dst); 3300 dst += 16; 3301 } 3302 } 3303 3304 void MirrorUVRow_MSA(const uint8* src_uv, 3305 uint8* dst_u, 3306 uint8* dst_v, 3307 int width) { 3308 int x; 3309 v16u8 src0, src1, src2, src3; 3310 v16u8 dst0, dst1, dst2, dst3; 3311 v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0}; 3312 v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1}; 3313 3314 src_uv += (2 * width); 3315 3316 for (x = 0; x < width; x += 32) { 3317 src_uv -= 64; 3318 src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0); 3319 src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16); 3320 src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32); 3321 src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48); 3322 dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); 3323 dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); 3324 dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); 3325 dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); 3326 ST_UB2(dst0, dst1, dst_v, 16); 3327 ST_UB2(dst2, dst3, dst_u, 16); 3328 dst_u += 32; 3329 dst_v += 32; 3330 } 3331 } 3332 3333 void 
SobelXRow_MSA(const uint8* src_y0, 3334 const uint8* src_y1, 3335 const uint8* src_y2, 3336 uint8* dst_sobelx, 3337 int32 width) { 3338 int x; 3339 v16u8 src0, src1, src2, src3, src4, src5, dst0; 3340 v8i16 vec0, vec1, vec2, vec3, vec4, vec5; 3341 v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9}; 3342 v16i8 tmp = __msa_ldi_b(8); 3343 v16i8 mask1 = mask0 + tmp; 3344 v8i16 zero = {0}; 3345 v8i16 max = __msa_ldi_h(255); 3346 3347 for (x = 0; x < width; x += 16) { 3348 src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0); 3349 src1 = (v16u8)__msa_ld_b((v16i8*)src_y0, 16); 3350 src2 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0); 3351 src3 = (v16u8)__msa_ld_b((v16i8*)src_y1, 16); 3352 src4 = (v16u8)__msa_ld_b((v16i8*)src_y2, 0); 3353 src5 = (v16u8)__msa_ld_b((v16i8*)src_y2, 16); 3354 vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); 3355 vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); 3356 vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); 3357 vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); 3358 vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4); 3359 vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); 3360 vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); 3361 vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); 3362 vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); 3363 vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); 3364 vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4); 3365 vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5); 3366 vec0 += vec2; 3367 vec1 += vec3; 3368 vec4 += vec2; 3369 vec5 += vec3; 3370 vec0 += vec4; 3371 vec1 += vec5; 3372 vec0 = __msa_add_a_h(zero, vec0); 3373 vec1 = __msa_add_a_h(zero, vec1); 3374 vec0 = __msa_maxi_s_h(vec0, 0); 3375 vec1 = __msa_maxi_s_h(vec1, 0); 3376 vec0 = __msa_min_s_h(max, vec0); 3377 vec1 = __msa_min_s_h(max, vec1); 3378 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 3379 ST_UB(dst0, dst_sobelx); 3380 src_y0 += 16; 3381 src_y1 += 16; 3382 src_y2 += 16; 3383 dst_sobelx += 16; 3384 } 3385 } 3386 3387 void SobelYRow_MSA(const uint8* src_y0, 3388 const uint8* src_y1, 3389 uint8* dst_sobely, 3390 int32 width) { 3391 int x; 3392 v16u8 src0, src1, dst0; 3393 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; 3394 v8i16 zero = {0}; 3395 v8i16 max = __msa_ldi_h(255); 3396 3397 for (x = 0; x < width; x += 16) { 3398 src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0); 3399 src1 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0); 3400 vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0); 3401 vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0); 3402 vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); 3403 vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); 3404 vec0 -= vec2; 3405 vec1 -= vec3; 3406 vec6[0] = src_y0[16] - src_y1[16]; 3407 vec6[1] = src_y0[17] - src_y1[17]; 3408 vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2); 3409 vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2); 3410 vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4); 3411 vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4); 3412 vec0 += vec2; 3413 vec1 += vec3; 3414 vec4 += vec2; 3415 vec5 += vec3; 3416 vec0 += vec4; 3417 vec1 += vec5; 3418 vec0 = __msa_add_a_h(zero, vec0); 3419 vec1 = __msa_add_a_h(zero, vec1); 3420 vec0 = __msa_maxi_s_h(vec0, 0); 3421 vec1 = __msa_maxi_s_h(vec1, 0); 3422 vec0 = __msa_min_s_h(max, vec0); 3423 vec1 = __msa_min_s_h(max, vec1); 3424 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 3425 ST_UB(dst0, dst_sobely); 3426 src_y0 += 16; 3427 src_y1 += 
16; 3428 dst_sobely += 16; 3429 } 3430 } 3431 3432 void HalfFloatRow_MSA(const uint16* src, uint16* dst, float scale, int width) { 3433 int i; 3434 v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3; 3435 v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 3436 v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7; 3437 v4f32 mult_vec; 3438 v8i16 zero = {0}; 3439 mult_vec[0] = 1.9259299444e-34f * scale; 3440 mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0); 3441 3442 for (i = 0; i < width; i += 32) { 3443 src0 = (v8u16)__msa_ld_h((v8i16*)src, 0); 3444 src1 = (v8u16)__msa_ld_h((v8i16*)src, 16); 3445 src2 = (v8u16)__msa_ld_h((v8i16*)src, 32); 3446 src3 = (v8u16)__msa_ld_h((v8i16*)src, 48); 3447 vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0); 3448 vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0); 3449 vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1); 3450 vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1); 3451 vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2); 3452 vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2); 3453 vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3); 3454 vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3); 3455 fvec0 = __msa_ffint_u_w(vec0); 3456 fvec1 = __msa_ffint_u_w(vec1); 3457 fvec2 = __msa_ffint_u_w(vec2); 3458 fvec3 = __msa_ffint_u_w(vec3); 3459 fvec4 = __msa_ffint_u_w(vec4); 3460 fvec5 = __msa_ffint_u_w(vec5); 3461 fvec6 = __msa_ffint_u_w(vec6); 3462 fvec7 = __msa_ffint_u_w(vec7); 3463 fvec0 *= mult_vec; 3464 fvec1 *= mult_vec; 3465 fvec2 *= mult_vec; 3466 fvec3 *= mult_vec; 3467 fvec4 *= mult_vec; 3468 fvec5 *= mult_vec; 3469 fvec6 *= mult_vec; 3470 fvec7 *= mult_vec; 3471 vec0 = ((v4u32)fvec0) >> 13; 3472 vec1 = ((v4u32)fvec1) >> 13; 3473 vec2 = ((v4u32)fvec2) >> 13; 3474 vec3 = ((v4u32)fvec3) >> 13; 3475 vec4 = ((v4u32)fvec4) >> 13; 3476 vec5 = ((v4u32)fvec5) >> 13; 3477 vec6 = ((v4u32)fvec6) >> 13; 3478 vec7 = ((v4u32)fvec7) >> 13; 3479 dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); 3480 dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2); 3481 dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); 3482 dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); 3483 ST_UH2(dst0, dst1, dst, 8); 3484 ST_UH2(dst2, dst3, dst + 16, 8); 3485 src += 32; 3486 dst += 32; 3487 } 3488 } 3489 2972 3490 #ifdef __cplusplus 2973 3491 } // extern "C" -
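* Note: ARGBQuantizeRow_MSA above vectorizes a per-channel posterize step: each of B, G and R becomes (v * scale >> 16) * interval_size + interval_offset, while the final vshf with the {0, 1, 2, 19, ...} mask carries the original alpha bytes over unchanged. A scalar sketch of the per-pixel math (QuantizeARGB is a local helper, and the example parameters are just one plausible choice):
--------------------------------------------------------------------------------------
#include <stdint.h>
#include <stdio.h>

/* v * scale >> 16 picks the interval index; the index maps back to a level. */
static void QuantizeARGB(uint8_t argb[4], int scale, int interval_size,
                         int interval_offset) {
  for (int c = 0; c < 3; ++c) {                 /* B, G, R only */
    int v = argb[c];
    argb[c] = (uint8_t)((v * scale >> 16) * interval_size + interval_offset);
  }
  /* argb[3] (alpha) is left as-is, matching the row function. */
}

int main(void) {
  uint8_t px[4] = {200, 100, 7, 255};           /* B, G, R, A */
  /* 65536/64 quantizes each channel to 4 levels of width 64, offset by 32. */
  QuantizeARGB(px, 65536 / 64, 64, 32);
  printf("%u %u %u %u\n", px[0], px[1], px[2], px[3]);   /* 224 96 32 255 */
  return 0;
}
--------------------------------------------------------------------------------------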
pjproject/trunk/third_party/yuv/source/row_neon.cc
r5633 r5699 116 116 YUVTORGB_SETUP 117 117 "vmov.u8 d23, #255 \n" 118 "1: 118 "1: \n" READYUV444 YUVTORGB 119 119 "subs %4, %4, #8 \n" 120 120 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" … … 142 142 YUVTORGB_SETUP 143 143 "vmov.u8 d23, #255 \n" 144 "1: 144 "1: \n" READYUV422 YUVTORGB 145 145 "subs %4, %4, #8 \n" 146 146 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" … … 168 168 asm volatile( 169 169 YUVTORGB_SETUP 170 "1: 170 "1: \n" READYUV422 YUVTORGB 171 171 "subs %5, %5, #8 \n" 172 172 "vld1.8 {d23}, [%3]! \n" … … 195 195 asm volatile( 196 196 YUVTORGB_SETUP 197 "1: 197 "1: \n" READYUV422 YUVTORGB 198 198 "subs %4, %4, #8 \n" 199 199 "vmov.u8 d19, #255 \n" // d19 modified by … … 222 222 asm volatile( 223 223 YUVTORGB_SETUP 224 "1: 224 "1: \n" READYUV422 YUVTORGB 225 225 "subs %4, %4, #8 \n" 226 226 "vst3.8 {d20, d21, d22}, [%3]! \n" … … 254 254 asm volatile( 255 255 YUVTORGB_SETUP 256 "1: 256 "1: \n" READYUV422 YUVTORGB 257 257 "subs %4, %4, #8 \n" ARGBTORGB565 258 258 "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. … … 288 288 asm volatile( 289 289 YUVTORGB_SETUP 290 "1: 290 "1: \n" READYUV422 YUVTORGB 291 291 "subs %4, %4, #8 \n" 292 292 "vmov.u8 d23, #255 \n" ARGBTOARGB1555 … … 326 326 "vmov.u8 d4, #0x0f \n" // bits to clear with 327 327 // vbic. 328 "1: 328 "1: \n" READYUV422 YUVTORGB 329 329 "subs %4, %4, #8 \n" 330 330 "vmov.u8 d23, #255 \n" ARGBTOARGB4444 … … 349 349 YUVTORGB_SETUP 350 350 "vmov.u8 d23, #255 \n" 351 "1: 351 "1: \n" READYUV400 YUVTORGB 352 352 "subs %2, %2, #8 \n" 353 353 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" … … 367 367 asm volatile( 368 368 "vmov.u8 d23, #255 \n" 369 "1: 369 "1: \n" 370 370 "vld1.8 {d20}, [%0]! \n" 371 371 "vmov d21, d20 \n" … … 386 386 const struct YuvConstants* yuvconstants, 387 387 int width) { 388 asm volatile( 389 YUVTORGB_SETUP 390 "vmov.u8 d23, #255 \n" 391 "1: \n" READNV12 YUVTORGB 392 "subs %3, %3, #8 \n" 393 "vst4.8 {d20, d21, d22, d23}, [%2]! \n" 394 "bgt 1b \n" 395 : "+r"(src_y), // %0 396 "+r"(src_uv), // %1 397 "+r"(dst_argb), // %2 398 "+r"(width) // %3 399 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 400 [kUVToG] "r"(&yuvconstants->kUVToG), 401 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 402 [kYToRgb] "r"(&yuvconstants->kYToRgb) 403 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", 404 "q12", "q13", "q14", "q15"); 388 asm volatile(YUVTORGB_SETUP 389 "vmov.u8 d23, #255 \n" 390 "1: \n" READNV12 YUVTORGB 391 "subs %3, %3, #8 \n" 392 "vst4.8 {d20, d21, d22, d23}, [%2]! \n" 393 "bgt 1b \n" 394 : "+r"(src_y), // %0 395 "+r"(src_uv), // %1 396 "+r"(dst_argb), // %2 397 "+r"(width) // %3 398 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 399 [kUVToG] "r"(&yuvconstants->kUVToG), 400 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 401 [kYToRgb] "r"(&yuvconstants->kYToRgb) 402 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", 403 "q10", "q11", "q12", "q13", "q14", "q15"); 405 404 } 406 405 … … 410 409 const struct YuvConstants* yuvconstants, 411 410 int width) { 412 asm volatile( 413 YUVTORGB_SETUP 414 "vmov.u8 d23, #255 \n" 415 "1: \n" READNV21 YUVTORGB 416 "subs %3, %3, #8 \n" 417 "vst4.8 {d20, d21, d22, d23}, [%2]! 
\n" 418 "bgt 1b \n" 419 : "+r"(src_y), // %0 420 "+r"(src_vu), // %1 421 "+r"(dst_argb), // %2 422 "+r"(width) // %3 423 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 424 [kUVToG] "r"(&yuvconstants->kUVToG), 425 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 426 [kYToRgb] "r"(&yuvconstants->kYToRgb) 427 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", 428 "q12", "q13", "q14", "q15"); 411 asm volatile(YUVTORGB_SETUP 412 "vmov.u8 d23, #255 \n" 413 "1: \n" READNV21 YUVTORGB 414 "subs %3, %3, #8 \n" 415 "vst4.8 {d20, d21, d22, d23}, [%2]! \n" 416 "bgt 1b \n" 417 : "+r"(src_y), // %0 418 "+r"(src_vu), // %1 419 "+r"(dst_argb), // %2 420 "+r"(width) // %3 421 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 422 [kUVToG] "r"(&yuvconstants->kUVToG), 423 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 424 [kYToRgb] "r"(&yuvconstants->kYToRgb) 425 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", 426 "q10", "q11", "q12", "q13", "q14", "q15"); 429 427 } 430 428 … … 436 434 asm volatile( 437 435 YUVTORGB_SETUP 438 "1: 436 "1: \n" READNV12 YUVTORGB 439 437 "subs %3, %3, #8 \n" ARGBTORGB565 440 438 "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. … … 456 454 const struct YuvConstants* yuvconstants, 457 455 int width) { 458 asm volatile( 459 YUVTORGB_SETUP 460 "vmov.u8 d23, #255 \n" 461 "1: \n" READYUY2 YUVTORGB 462 "subs %2, %2, #8 \n" 463 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" 464 "bgt 1b \n" 465 : "+r"(src_yuy2), // %0 466 "+r"(dst_argb), // %1 467 "+r"(width) // %2 468 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 469 [kUVToG] "r"(&yuvconstants->kUVToG), 470 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 471 [kYToRgb] "r"(&yuvconstants->kYToRgb) 472 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", 473 "q12", "q13", "q14", "q15"); 456 asm volatile(YUVTORGB_SETUP 457 "vmov.u8 d23, #255 \n" 458 "1: \n" READYUY2 YUVTORGB 459 "subs %2, %2, #8 \n" 460 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" 461 "bgt 1b \n" 462 : "+r"(src_yuy2), // %0 463 "+r"(dst_argb), // %1 464 "+r"(width) // %2 465 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 466 [kUVToG] "r"(&yuvconstants->kUVToG), 467 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 468 [kYToRgb] "r"(&yuvconstants->kYToRgb) 469 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", 470 "q10", "q11", "q12", "q13", "q14", "q15"); 474 471 } 475 472 … … 478 475 const struct YuvConstants* yuvconstants, 479 476 int width) { 480 asm volatile( 481 YUVTORGB_SETUP 482 "vmov.u8 d23, #255 \n" 483 "1: \n" READUYVY YUVTORGB 484 "subs %2, %2, #8 \n" 485 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" 486 "bgt 1b \n" 487 : "+r"(src_uyvy), // %0 488 "+r"(dst_argb), // %1 489 "+r"(width) // %2 490 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 491 [kUVToG] "r"(&yuvconstants->kUVToG), 492 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 493 [kYToRgb] "r"(&yuvconstants->kYToRgb) 494 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", 495 "q12", "q13", "q14", "q15"); 477 asm volatile(YUVTORGB_SETUP 478 "vmov.u8 d23, #255 \n" 479 "1: \n" READUYVY YUVTORGB 480 "subs %2, %2, #8 \n" 481 "vst4.8 {d20, d21, d22, d23}, [%1]! 
\n" 482 "bgt 1b \n" 483 : "+r"(src_uyvy), // %0 484 "+r"(dst_argb), // %1 485 "+r"(width) // %2 486 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 487 [kUVToG] "r"(&yuvconstants->kUVToG), 488 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 489 [kYToRgb] "r"(&yuvconstants->kYToRgb) 490 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", 491 "q10", "q11", "q12", "q13", "q14", "q15"); 496 492 } 497 493 … … 502 498 int width) { 503 499 asm volatile( 504 "1: 500 "1: \n" 505 501 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV 506 502 "subs %3, %3, #16 \n" // 16 processed per loop … … 523 519 int width) { 524 520 asm volatile( 525 "1: 521 "1: \n" 526 522 "vld1.8 {q0}, [%0]! \n" // load U 527 523 "vld1.8 {q1}, [%1]! \n" // load V 528 524 "subs %3, %3, #16 \n" // 16 processed per loop 529 "vst2. u8{q0, q1}, [%2]! \n" // store 16 pairs of UV525 "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV 530 526 "bgt 1b \n" 531 527 : "+r"(src_u), // %0 … … 538 534 } 539 535 536 // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. 537 void SplitRGBRow_NEON(const uint8* src_rgb, 538 uint8* dst_r, 539 uint8* dst_g, 540 uint8* dst_b, 541 int width) { 542 asm volatile( 543 "1: \n" 544 "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB 545 "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB 546 "subs %4, %4, #16 \n" // 16 processed per loop 547 "vst1.8 {q0}, [%1]! \n" // store R 548 "vst1.8 {q1}, [%2]! \n" // store G 549 "vst1.8 {q2}, [%3]! \n" // store B 550 "bgt 1b \n" 551 : "+r"(src_rgb), // %0 552 "+r"(dst_r), // %1 553 "+r"(dst_g), // %2 554 "+r"(dst_b), // %3 555 "+r"(width) // %4 556 : // Input registers 557 : "cc", "memory", "d0", "d1", "d2" // Clobber List 558 ); 559 } 560 561 // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time 562 void MergeRGBRow_NEON(const uint8* src_r, 563 const uint8* src_g, 564 const uint8* src_b, 565 uint8* dst_rgb, 566 int width) { 567 asm volatile( 568 "1: \n" 569 "vld1.8 {q0}, [%0]! \n" // load R 570 "vld1.8 {q1}, [%1]! \n" // load G 571 "vld1.8 {q2}, [%2]! \n" // load B 572 "subs %4, %4, #16 \n" // 16 processed per loop 573 "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB 574 "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB 575 "bgt 1b \n" 576 : "+r"(src_r), // %0 577 "+r"(src_g), // %1 578 "+r"(src_b), // %2 579 "+r"(dst_rgb), // %3 580 "+r"(width) // %4 581 : // Input registers 582 : "cc", "memory", "q0", "q1", "q2" // Clobber List 583 ); 584 } 585 540 586 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 541 587 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { 542 588 asm volatile( 543 "1: 589 "1: \n" 544 590 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 545 591 "subs %2, %2, #32 \n" // 32 processed per loop … … 558 604 asm volatile( 559 605 "vdup.8 q0, %2 \n" // duplicate 16 bytes 560 "1: 606 "1: \n" 561 607 "subs %1, %1, #16 \n" // 16 bytes per loop 562 608 "vst1.8 {q0}, [%0]! \n" // store … … 572 618 asm volatile( 573 619 "vdup.u32 q0, %2 \n" // duplicate 4 ints 574 "1: 620 "1: \n" 575 621 "subs %1, %1, #4 \n" // 4 pixels per loop 576 622 "vst1.8 {q0}, [%0]! \n" // store … … 589 635 "sub %0, #16 \n" 590 636 591 "1: 637 "1: \n" 592 638 "vld1.8 {q0}, [%0], r3 \n" // src -= 16 593 639 "subs %2, #16 \n" // 16 pixels per loop. … … 613 659 "sub %0, #16 \n" 614 660 615 "1: 661 "1: \n" 616 662 "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 617 663 "subs %3, #8 \n" // 8 pixels per loop. 
… … 635 681 "sub %0, #16 \n" 636 682 637 "1: 683 "1: \n" 638 684 "vld1.8 {q0}, [%0], r3 \n" // src -= 16 639 685 "subs %2, #4 \n" // 4 pixels per loop. … … 652 698 asm volatile( 653 699 "vmov.u8 d4, #255 \n" // Alpha 654 "1: 700 "1: \n" 655 701 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. 656 702 "subs %2, %2, #8 \n" // 8 processed per loop. … … 668 714 asm volatile( 669 715 "vmov.u8 d4, #255 \n" // Alpha 670 "1: 716 "1: \n" 671 717 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. 672 718 "subs %2, %2, #8 \n" // 8 processed per loop. … … 684 730 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { 685 731 asm volatile( 686 "1: 732 "1: \n" 687 733 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. 688 734 "subs %2, %2, #8 \n" // 8 processed per loop. … … 714 760 asm volatile( 715 761 "vmov.u8 d3, #255 \n" // Alpha 716 "1: 762 "1: \n" 717 763 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 718 764 "subs %2, %2, #8 \n" // 8 processed per loop. … … 760 806 asm volatile( 761 807 "vmov.u8 d3, #255 \n" // Alpha 762 "1: 808 "1: \n" 763 809 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. 764 810 "subs %2, %2, #8 \n" // 8 processed per loop. … … 789 835 asm volatile( 790 836 "vmov.u8 d3, #255 \n" // Alpha 791 "1: 837 "1: \n" 792 838 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. 793 839 "subs %2, %2, #8 \n" // 8 processed per loop. … … 805 851 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { 806 852 asm volatile( 807 "1: 853 "1: \n" 808 854 "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. 809 855 "subs %2, %2, #8 \n" // 8 processed per loop. … … 821 867 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { 822 868 asm volatile( 823 "1: 869 "1: \n" 824 870 "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. 825 871 "subs %2, %2, #8 \n" // 8 processed per loop. … … 837 883 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { 838 884 asm volatile( 839 "1: 885 "1: \n" 840 886 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. 841 887 "subs %2, %2, #16 \n" // 16 processed per loop. … … 852 898 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { 853 899 asm volatile( 854 "1: 900 "1: \n" 855 901 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. 856 902 "subs %2, %2, #16 \n" // 16 processed per loop. … … 870 916 int width) { 871 917 asm volatile( 872 "1: 918 "1: \n" 873 919 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. 874 920 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. … … 890 936 int width) { 891 937 asm volatile( 892 "1: 938 "1: \n" 893 939 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. 894 940 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. … … 912 958 asm volatile( 913 959 "add %1, %0, %1 \n" // stride + src_yuy2 914 "1: 960 "1: \n" 915 961 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. 916 962 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. … … 939 985 asm volatile( 940 986 "add %1, %0, %1 \n" // stride + src_uyvy 941 "1: 987 "1: \n" 942 988 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. 943 989 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. … … 966 1012 asm volatile( 967 1013 "vld1.8 {q2}, [%3] \n" // shuffler 968 "1: 1014 "1: \n" 969 1015 "vld1.8 {q0}, [%0]! \n" // load 4 pixels. 970 1016 "subs %2, %2, #4 \n" // 4 processed per loop … … 987 1033 int width) { 988 1034 asm volatile( 989 "1: 1035 "1: \n" 990 1036 "vld2.8 {d0, d2}, [%0]! 
\n" // load 16 Ys 991 1037 "vld1.8 {d1}, [%1]! \n" // load 8 Us … … 1009 1055 int width) { 1010 1056 asm volatile( 1011 "1: 1057 "1: \n" 1012 1058 "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys 1013 1059 "vld1.8 {d0}, [%1]! \n" // load 8 Us … … 1027 1073 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { 1028 1074 asm volatile( 1029 "1: 1075 "1: \n" 1030 1076 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. 1031 1077 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1046 1092 asm volatile( 1047 1093 "vdup.32 d2, %2 \n" // dither4 1048 "1: 1094 "1: \n" 1049 1095 "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. 1050 1096 "subs %3, %3, #8 \n" // 8 processed per loop. 1051 1097 "vqadd.u8 d20, d20, d2 \n" 1052 1098 "vqadd.u8 d21, d21, d2 \n" 1053 "vqadd.u8 d22, d22, d2 \n" ARGBTORGB565 1054 "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565. 1099 "vqadd.u8 d22, d22, d2 \n" // add for dither 1100 ARGBTORGB565 1101 "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. 1055 1102 "bgt 1b \n" 1056 1103 : "+r"(dst_rgb) // %0 … … 1065 1112 int width) { 1066 1113 asm volatile( 1067 "1: 1114 "1: \n" 1068 1115 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. 1069 1116 "subs %2, %2, #8 \n" // 8 processed per loop. 1070 1117 ARGBTOARGB1555 1071 "vst1.8 {q0}, [%1]! \n" // store 8 pixels 1072 // ARGB1555. 1118 "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. 1073 1119 "bgt 1b \n" 1074 1120 : "+r"(src_argb), // %0 … … 1085 1131 "vmov.u8 d4, #0x0f \n" // bits to clear with 1086 1132 // vbic. 1087 "1: 1133 "1: \n" 1088 1134 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. 1089 1135 "subs %2, %2, #8 \n" // 8 processed per loop. 1090 1136 ARGBTOARGB4444 1091 "vst1.8 {q0}, [%1]! \n" // store 8 pixels 1092 // ARGB4444. 1137 "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. 1093 1138 "bgt 1b \n" 1094 1139 : "+r"(src_argb), // %0 … … 1105 1150 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 1106 1151 "vmov.u8 d27, #16 \n" // Add 16 constant 1107 "1: 1152 "1: \n" 1108 1153 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 1109 1154 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1124 1169 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { 1125 1170 asm volatile( 1126 "1: 1171 "1: \n" 1127 1172 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels 1128 1173 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels … … 1143 1188 "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient 1144 1189 "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient 1145 "1: 1190 "1: \n" 1146 1191 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 1147 1192 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1172 1217 "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient 1173 1218 "vmov.u16 q15, #0x8080 \n" // 128.5 1174 "1: 1219 "1: \n" 1175 1220 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 1176 1221 "subs %3, %3, #8 \n" // 8 processed per loop. … … 1200 1245 } 1201 1246 1247 // clang-format off 1202 1248 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
1203 1249 #define RGBTOUV(QB, QG, QR) \ 1204 "vmul.s16 q8, " #QB \ 1205 ", q10 \n" /* B */ \ 1206 "vmls.s16 q8, " #QG \ 1207 ", q11 \n" /* G */ \ 1208 "vmls.s16 q8, " #QR \ 1209 ", q12 \n" /* R */ \ 1250 "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ 1251 "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ 1252 "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ 1210 1253 "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ 1211 "vmul.s16 q9, " #QR \ 1212 ", q10 \n" /* R */ \ 1213 "vmls.s16 q9, " #QG \ 1214 ", q14 \n" /* G */ \ 1215 "vmls.s16 q9, " #QB \ 1216 ", q13 \n" /* B */ \ 1254 "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ 1255 "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ 1256 "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ 1217 1257 "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ 1218 1258 "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ 1219 1259 "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ 1260 // clang-format on 1220 1261 1221 1262 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. … … 1233 1274 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1234 1275 "vmov.u16 q15, #0x8080 \n" // 128.5 1235 "1:\n"1276 "1: \n" 1236 1277 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 1237 1278 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. … … 1279 1320 "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient 1280 1321 "vmov.u16 q15, #0x8080 \n" // 128.5 1281 "1:\n"1322 "1: \n" 1282 1323 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 1283 1324 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. … … 1324 1365 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1325 1366 "vmov.u16 q15, #0x8080 \n" // 128.5 1326 "1:\n"1367 "1: \n" 1327 1368 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. 1328 1369 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. … … 1369 1410 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1370 1411 "vmov.u16 q15, #0x8080 \n" // 128.5 1371 "1:\n"1412 "1: \n" 1372 1413 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. 1373 1414 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. … … 1414 1455 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1415 1456 "vmov.u16 q15, #0x8080 \n" // 128.5 1416 "1:\n"1457 "1: \n" 1417 1458 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. 1418 1459 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. … … 1459 1500 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1460 1501 "vmov.u16 q15, #0x8080 \n" // 128.5 1461 "1:\n"1502 "1: \n" 1462 1503 "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. 1463 1504 "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. … … 1504 1545 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1505 1546 "vmov.u16 q15, #0x8080 \n" // 128.5 1506 "1:\n"1547 "1: \n" 1507 1548 "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. 1508 1549 "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. … … 1551 1592 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1552 1593 "vmov.u16 q15, #0x8080 \n" // 128.5 1553 "1: 1594 "1: \n" 1554 1595 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 1555 1596 RGB565TOARGB … … 1617 1658 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1618 1659 "vmov.u16 q15, #0x8080 \n" // 128.5 1619 "1: 1660 "1: \n" 1620 1661 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. 1621 1662 RGB555TOARGB … … 1683 1724 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1684 1725 "vmov.u16 q15, #0x8080 \n" // 128.5 1685 "1: 1726 "1: \n" 1686 1727 "vld1.8 {q0}, [%0]! 
\n" // load 8 ARGB4444 pixels. 1687 1728 ARGB4444TOARGB … … 1740 1781 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 1741 1782 "vmov.u8 d27, #16 \n" // Add 16 constant 1742 "1: 1783 "1: \n" 1743 1784 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 1744 1785 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1764 1805 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 1765 1806 "vmov.u8 d27, #16 \n" // Add 16 constant 1766 "1: 1807 "1: \n" 1767 1808 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. 1768 1809 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1788 1829 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 1789 1830 "vmov.u8 d27, #16 \n" // Add 16 constant 1790 "1: 1831 "1: \n" 1791 1832 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. 1792 1833 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1812 1853 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient 1813 1854 "vmov.u8 d7, #16 \n" // Add 16 constant 1814 "1: 1855 "1: \n" 1815 1856 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. 1816 1857 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1835 1876 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient 1836 1877 "vmov.u8 d7, #16 \n" // Add 16 constant 1837 "1: 1878 "1: \n" 1838 1879 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. 1839 1880 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1858 1899 "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient 1859 1900 "vmov.u8 d7, #16 \n" // Add 16 constant 1860 "1: 1901 "1: \n" 1861 1902 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. 1862 1903 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1881 1922 "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient 1882 1923 "vmov.u8 d7, #16 \n" // Add 16 constant 1883 "1: 1924 "1: \n" 1884 1925 "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. 1885 1926 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1904 1945 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient 1905 1946 "vmov.u8 d7, #16 \n" // Add 16 constant 1906 "1: 1947 "1: \n" 1907 1948 "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. 1908 1949 "subs %2, %2, #8 \n" // 8 processed per loop. … … 1939 1980 "vdup.8 d4, %4 \n" 1940 1981 // General purpose row blend. 1941 "1: 1982 "1: \n" 1942 1983 "vld1.8 {q0}, [%1]! \n" 1943 1984 "vld1.8 {q1}, [%2]! \n" … … 1954 1995 1955 1996 // Blend 50 / 50. 1956 "50: 1997 "50: \n" 1957 1998 "vld1.8 {q0}, [%1]! \n" 1958 1999 "vld1.8 {q1}, [%2]! \n" … … 1964 2005 1965 2006 // Blend 100 / 0 - Copy row unchanged. 1966 "100: 2007 "100: \n" 1967 2008 "vld1.8 {q0}, [%1]! \n" 1968 2009 "subs %3, %3, #16 \n" … … 1970 2011 "bgt 100b \n" 1971 2012 1972 "99: 2013 "99: \n" 1973 2014 : "+r"(dst_ptr), // %0 1974 2015 "+r"(src_ptr), // %1 … … 1989 2030 "blt 89f \n" 1990 2031 // Blend 8 pixels. 1991 "8: 2032 "8: \n" 1992 2033 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. 1993 2034 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. … … 2007 2048 "bge 8b \n" 2008 2049 2009 "89: 2050 "89: \n" 2010 2051 "adds %3, #8-1 \n" 2011 2052 "blt 99f \n" 2012 2053 2013 2054 // Blend 1 pixels. 2014 "1: 2055 "1: \n" 2015 2056 "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. 2016 2057 "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. … … 2044 2085 asm volatile( 2045 2086 // Attenuate 8 pixels. 2046 "1: 2087 "1: \n" 2047 2088 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. 2048 2089 "subs %2, %2, #8 \n" // 8 processed per loop. … … 2076 2117 2077 2118 // 8 pixel loop. 
2078 "1: 2119 "1: \n" 2079 2120 "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. 2080 2121 "subs %1, %1, #8 \n" // 8 processed per loop. … … 2117 2158 2118 2159 // 8 pixel loop. 2119 "1: 2160 "1: \n" 2120 2161 "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. 2121 2162 "subs %2, %2, #8 \n" // 8 processed per loop. … … 2149 2190 "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient 2150 2191 "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient 2151 "1: 2192 "1: \n" 2152 2193 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 2153 2194 "subs %2, %2, #8 \n" // 8 processed per loop. … … 2182 2223 "vmov.u8 d29, #98 \n" // BG coefficient 2183 2224 "vmov.u8 d30, #50 \n" // BR coefficient 2184 "1: 2225 "1: \n" 2185 2226 "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. 2186 2227 "subs %1, %1, #8 \n" // 8 processed per loop. … … 2218 2259 "vmovl.s8 q1, d5 \n" // R,A coefficients s16. 2219 2260 2220 "1: 2261 "1: \n" 2221 2262 "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. 2222 2263 "subs %2, %2, #8 \n" // 8 processed per loop. … … 2274 2315 asm volatile( 2275 2316 // 8 pixel loop. 2276 "1: 2317 "1: \n" 2277 2318 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 2278 2319 "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB 2279 // pixels.2280 2320 "subs %3, %3, #8 \n" // 8 processed per loop. 2281 2321 "vmull.u8 q0, d0, d1 \n" // multiply B … … 2289 2329 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 2290 2330 "bgt 1b \n" 2291 2292 2331 : "+r"(src_argb0), // %0 2293 2332 "+r"(src_argb1), // %1 … … 2305 2344 asm volatile( 2306 2345 // 8 pixel loop. 2307 "1: 2346 "1: \n" 2308 2347 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 2309 2348 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB 2310 // pixels.2311 2349 "subs %3, %3, #8 \n" // 8 processed per loop. 2312 2350 "vqadd.u8 q0, q0, q2 \n" // add B, G … … 2314 2352 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 2315 2353 "bgt 1b \n" 2316 2317 2354 : "+r"(src_argb0), // %0 2318 2355 "+r"(src_argb1), // %1 … … 2330 2367 asm volatile( 2331 2368 // 8 pixel loop. 2332 "1: 2369 "1: \n" 2333 2370 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 2334 2371 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB 2335 // pixels.2336 2372 "subs %3, %3, #8 \n" // 8 processed per loop. 2337 2373 "vqsub.u8 q0, q0, q2 \n" // subtract B, G … … 2339 2375 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 2340 2376 "bgt 1b \n" 2341 2342 2377 : "+r"(src_argb0), // %0 2343 2378 "+r"(src_argb1), // %1 … … 2360 2395 "vmov.u8 d3, #255 \n" // alpha 2361 2396 // 8 pixel loop. 2362 "1: 2397 "1: \n" 2363 2398 "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. 2364 2399 "vld1.8 {d1}, [%1]! \n" // load 8 sobely. … … 2384 2419 asm volatile( 2385 2420 // 16 pixel loop. 2386 "1: 2421 "1: \n" 2387 2422 "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. 2388 2423 "vld1.8 {q1}, [%1]! \n" // load 16 sobely. … … 2411 2446 "vmov.u8 d3, #255 \n" // alpha 2412 2447 // 8 pixel loop. 2413 "1: 2448 "1: \n" 2414 2449 "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. 2415 2450 "vld1.8 {d0}, [%1]! \n" // load 8 sobely. 
… … 2436 2471 int width) { 2437 2472 asm volatile( 2438 "1: 2473 "1: \n" 2439 2474 "vld1.8 {d0}, [%0],%5 \n" // top 2440 2475 "vld1.8 {d1}, [%0],%6 \n" … … 2474 2509 int width) { 2475 2510 asm volatile( 2476 "1: 2511 "1: \n" 2477 2512 "vld1.8 {d0}, [%0],%4 \n" // left 2478 2513 "vld1.8 {d1}, [%1],%4 \n" … … 2506 2541 "vdup.32 q0, %3 \n" 2507 2542 2508 "1: 2543 "1: \n" 2509 2544 "vld1.8 {q1}, [%0]! \n" // load 8 shorts 2510 2545 "subs %2, %2, #8 \n" // 8 pixels per loop … … 2531 2566 "vdup.32 q0, %3 \n" 2532 2567 2533 "1: 2568 "1: \n" 2534 2569 "vld1.8 {q1}, [%0]! \n" // load 8 shorts 2535 2570 "subs %2, %2, #8 \n" // 8 pixels per loop -
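Note on the SplitRGBRow_NEON / MergeRGBRow_NEON additions above: they use vld3/vst3 to deinterleave a packed RGB row into three planes and back, 16 pixels per iteration. As a reference for what the vector loops compute, here is a plain C sketch of the same per-row operation (an illustration only, not the library's own C fallback). It assumes width is the pixel count of the row and that the first byte of each triplet feeds dst_r, matching the register flow above.
--------------------------------------------------------------------------------
#include <stdint.h>

/* Deinterleave one packed RGB row into three planar rows (scalar sketch). */
static void SplitRGBRow_Sketch(const uint8_t* src_rgb,
                               uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b,
                               int width) {
  for (int x = 0; x < width; ++x) {
    dst_r[x] = src_rgb[0];  /* element 0 of each triplet (vld3 d0) */
    dst_g[x] = src_rgb[1];  /* element 1 (d2) */
    dst_b[x] = src_rgb[2];  /* element 2 (d4) */
    src_rgb += 3;
  }
}

/* Interleave three planar rows back into one packed RGB row (scalar sketch). */
static void MergeRGBRow_Sketch(const uint8_t* src_r, const uint8_t* src_g,
                               const uint8_t* src_b, uint8_t* dst_rgb,
                               int width) {
  for (int x = 0; x < width; ++x) {
    dst_rgb[0] = src_r[x];
    dst_rgb[1] = src_g[x];
    dst_rgb[2] = src_b[x];
    dst_rgb += 3;
  }
}
--------------------------------------------------------------------------------
The NEON versions process 16 pixels per loop, so widths that are not a multiple of 16 are expected to be finished by a wrapper or the C path.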
pjproject/trunk/third_party/yuv/source/row_neon64.cc
r5633 r5699 274 274 asm volatile( 275 275 YUVTORGB_SETUP 276 "1: 276 "1: \n" READYUV422 YUVTORGB( 277 277 v22, v21, 278 278 v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 … … 311 311 YUVTORGB_SETUP 312 312 "movi v23.8b, #255 \n" 313 "1: 313 "1: \n" READYUV422 YUVTORGB( 314 314 v22, v21, 315 315 v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 … … 396 396 asm volatile( 397 397 "movi v23.8b, #255 \n" 398 "1: 398 "1: \n" 399 399 "ld1 {v20.8b}, [%0], #8 \n" 400 400 "orr v21.8b, v20.8b, v20.8b \n" … … 471 471 asm volatile( 472 472 YUVTORGB_SETUP 473 "1: 473 "1: \n" READNV12 YUVTORGB( 474 474 v22, v21, 475 475 v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 … … 545 545 int width) { 546 546 asm volatile( 547 "1: 547 "1: \n" 548 548 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV 549 549 "subs %w3, %w3, #16 \n" // 16 processed per loop … … 566 566 int width) { 567 567 asm volatile( 568 "1: 568 "1: \n" 569 569 "ld1 {v0.16b}, [%0], #16 \n" // load U 570 570 "ld1 {v1.16b}, [%1], #16 \n" // load V … … 581 581 } 582 582 583 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 583 // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. 584 void SplitRGBRow_NEON(const uint8* src_rgb, 585 uint8* dst_r, 586 uint8* dst_g, 587 uint8* dst_b, 588 int width) { 589 asm volatile( 590 "1: \n" 591 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB 592 "subs %w4, %w4, #16 \n" // 16 processed per loop 593 "st1 {v0.16b}, [%1], #16 \n" // store R 594 "st1 {v1.16b}, [%2], #16 \n" // store G 595 "st1 {v2.16b}, [%3], #16 \n" // store B 596 "b.gt 1b \n" 597 : "+r"(src_rgb), // %0 598 "+r"(dst_r), // %1 599 "+r"(dst_g), // %2 600 "+r"(dst_b), // %3 601 "+r"(width) // %4 602 : // Input registers 603 : "cc", "memory", "v0", "v1", "v2" // Clobber List 604 ); 605 } 606 607 // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time 608 void MergeRGBRow_NEON(const uint8* src_r, 609 const uint8* src_g, 610 const uint8* src_b, 611 uint8* dst_rgb, 612 int width) { 613 asm volatile( 614 "1: \n" 615 "ld1 {v0.16b}, [%0], #16 \n" // load R 616 "ld1 {v1.16b}, [%1], #16 \n" // load G 617 "ld1 {v2.16b}, [%2], #16 \n" // load B 618 "subs %w4, %w4, #16 \n" // 16 processed per loop 619 "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB 620 "b.gt 1b \n" 621 : "+r"(src_r), // %0 622 "+r"(src_g), // %1 623 "+r"(src_b), // %2 624 "+r"(dst_rgb), // %3 625 "+r"(width) // %4 626 : // Input registers 627 : "cc", "memory", "v0", "v1", "v2" // Clobber List 628 ); 629 } 630 631 // Copy multiple of 32. 
584 632 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { 585 633 asm volatile( 586 "1: 587 "ld 1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32634 "1: \n" 635 "ldp q0, q1, [%0], #32 \n" 588 636 "subs %w2, %w2, #32 \n" // 32 processed per loop 589 "st 1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32590 "b.gt 1b \n" 591 : "+r"(src), 592 "+r"(dst), 593 "+r"(count) 594 : 595 : "cc", "memory", "v0", "v1" , "v2", "v3"// Clobber List637 "stp q0, q1, [%1], #32 \n" 638 "b.gt 1b \n" 639 : "+r"(src), // %0 640 "+r"(dst), // %1 641 "+r"(count) // %2 // Output registers 642 : // Input registers 643 : "cc", "memory", "v0", "v1" // Clobber List 596 644 ); 597 645 } … … 601 649 asm volatile( 602 650 "dup v0.16b, %w2 \n" // duplicate 16 bytes 603 "1: 651 "1: \n" 604 652 "subs %w1, %w1, #16 \n" // 16 bytes per loop 605 653 "st1 {v0.16b}, [%0], #16 \n" // store … … 614 662 asm volatile( 615 663 "dup v0.4s, %w2 \n" // duplicate 4 ints 616 "1: 664 "1: \n" 617 665 "subs %w1, %w1, #4 \n" // 4 ints per loop 618 666 "st1 {v0.16b}, [%0], #16 \n" // store … … 629 677 "add %0, %0, %w2, sxtw \n" 630 678 "sub %0, %0, #16 \n" 631 "1: 679 "1: \n" 632 680 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 633 681 "subs %w2, %w2, #16 \n" // 16 pixels per loop. … … 651 699 "add %0, %0, %w3, sxtw #1 \n" 652 700 "sub %0, %0, #16 \n" 653 "1: 701 "1: \n" 654 702 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 655 703 "subs %w3, %w3, #8 \n" // 8 pixels per loop. … … 672 720 "add %0, %0, %w2, sxtw #2 \n" 673 721 "sub %0, %0, #16 \n" 674 "1: 722 "1: \n" 675 723 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 676 724 "subs %w2, %w2, #4 \n" // 4 pixels per loop. … … 689 737 asm volatile( 690 738 "movi v4.8b, #255 \n" // Alpha 691 "1: 739 "1: \n" 692 740 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. 693 741 "subs %w2, %w2, #8 \n" // 8 processed per loop. 694 742 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB 695 // pixels696 743 "b.gt 1b \n" 697 744 : "+r"(src_rgb24), // %0 … … 706 753 asm volatile( 707 754 "movi v5.8b, #255 \n" // Alpha 708 "1: 755 "1: \n" 709 756 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 710 757 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 723 770 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { 724 771 asm volatile( 725 "1: 772 "1: \n" 726 773 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 727 774 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 754 801 asm volatile( 755 802 "movi v3.8b, #255 \n" // Alpha 756 "1: 803 "1: \n" 757 804 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 758 805 "subs %w2, %w2, #8 \n" // 8 processed per loop. 759 806 RGB565TOARGB 760 807 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB 761 // pixels762 808 "b.gt 1b \n" 763 809 : "+r"(src_rgb565), // %0 … … 811 857 asm volatile( 812 858 "movi v3.8b, #255 \n" // Alpha 813 "1: 859 "1: \n" 814 860 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 815 861 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 842 888 int width) { 843 889 asm volatile( 844 "1: 890 "1: \n" 845 891 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 846 892 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 859 905 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { 860 906 asm volatile( 861 "1: 907 "1: \n" 862 908 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB 863 // pixels864 909 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
865 910 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of … … 876 921 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { 877 922 asm volatile( 878 "1: 923 "1: \n" 879 924 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a 880 925 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 893 938 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { 894 939 asm volatile( 895 "1: 940 "1: \n" 896 941 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. 897 942 "subs %w2, %w2, #16 \n" // 16 processed per loop. … … 908 953 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { 909 954 asm volatile( 910 "1: 955 "1: \n" 911 956 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. 912 957 "subs %w2, %w2, #16 \n" // 16 processed per loop. … … 926 971 int width) { 927 972 asm volatile( 928 "1: 973 "1: \n" 929 974 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 930 // pixels931 975 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 932 976 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. … … 947 991 int width) { 948 992 asm volatile( 949 "1: 993 "1: \n" 950 994 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY 951 // pixels952 995 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 953 996 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. … … 970 1013 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; 971 1014 asm volatile( 972 "1: 1015 "1: \n" 973 1016 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 974 1017 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. … … 997 1040 const uint8* src_uyvyb = src_uyvy + stride_uyvy; 998 1041 asm volatile( 999 "1: 1042 "1: \n" 1000 1043 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1001 1044 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. … … 1024 1067 asm volatile( 1025 1068 "ld1 {v2.16b}, [%3] \n" // shuffler 1026 "1: 1069 "1: \n" 1027 1070 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 1028 1071 "subs %w2, %w2, #4 \n" // 4 processed per loop … … 1044 1087 int width) { 1045 1088 asm volatile( 1046 "1: 1089 "1: \n" 1047 1090 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys 1048 1091 "orr v2.8b, v1.8b, v1.8b \n" … … 1067 1110 int width) { 1068 1111 asm volatile( 1069 "1: 1112 "1: \n" 1070 1113 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys 1071 1114 "orr v3.8b, v2.8b, v2.8b \n" … … 1086 1129 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { 1087 1130 asm volatile( 1088 "1: 1131 "1: \n" 1089 1132 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1090 1133 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1105 1148 asm volatile( 1106 1149 "dup v1.4s, %w2 \n" // dither4 1107 "1: 1150 "1: \n" 1108 1151 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels 1109 1152 "subs %w3, %w3, #8 \n" // 8 processed per loop. … … 1124 1167 int width) { 1125 1168 asm volatile( 1126 "1: 1169 "1: \n" 1127 1170 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1128 1171 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1144 1187 "movi v4.16b, #0x0f \n" // bits to clear with 1145 1188 // vbic. 1146 "1: 1189 "1: \n" 1147 1190 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1148 1191 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
… … 1164 1207 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1165 1208 "movi v7.8b, #16 \n" // Add 16 constant 1166 "1: 1209 "1: \n" 1167 1210 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 1168 // pixels.1169 1211 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1170 1212 "umull v3.8h, v0.8b, v4.8b \n" // B … … 1184 1226 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { 1185 1227 asm volatile( 1186 "1: 1228 "1: \n" 1187 1229 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 1188 1230 // pixels … … 1203 1245 "movi v5.8b, #75 \n" // G * 0.58700 coefficient 1204 1246 "movi v6.8b, #38 \n" // R * 0.29900 coefficient 1205 "1: 1247 "1: \n" 1206 1248 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 1207 // pixels.1208 1249 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1209 1250 "umull v3.8h, v0.8b, v4.8b \n" // B … … 1233 1274 "movi v28.8b, #94 \n" // VG -0.7344 coefficient 1234 1275 "movi v29.16b,#0x80 \n" // 128.5 1235 "1: 1276 "1: \n" 1236 1277 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 1237 1278 // pixels. … … 1271 1312 1272 1313 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 1314 // clang-format off 1273 1315 #define RGBTOUV(QB, QG, QR) \ 1274 "mul v3.8h, " #QB \ 1275 ",v20.8h \n" /* B */ \ 1276 "mul v4.8h, " #QR \ 1277 ",v20.8h \n" /* R */ \ 1278 "mls v3.8h, " #QG \ 1279 ",v21.8h \n" /* G */ \ 1280 "mls v4.8h, " #QG \ 1281 ",v24.8h \n" /* G */ \ 1282 "mls v3.8h, " #QR \ 1283 ",v22.8h \n" /* R */ \ 1284 "mls v4.8h, " #QB \ 1285 ",v23.8h \n" /* B */ \ 1316 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ 1317 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ 1318 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ 1319 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ 1320 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ 1321 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ 1286 1322 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ 1287 1323 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ 1288 1324 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ 1289 1325 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ 1326 // clang-format on 1290 1327 1291 1328 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. … … 1579 1616 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 1580 1617 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 1581 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 1582 // 16-bit) 1583 "1: \n" 1618 "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit 1619 "1: \n" 1584 1620 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 1585 1621 RGB565TOARGB … … 1646 1682 asm volatile( 1647 1683 RGBTOUV_SETUP_REG 1648 "1: 1684 "1: \n" 1649 1685 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 1650 1686 RGB555TOARGB … … 1711 1747 asm volatile( 1712 1748 RGBTOUV_SETUP_REG 1713 "1: 1749 "1: \n" 1714 1750 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 1715 1751 ARGB4444TOARGB … … 1775 1811 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 1776 1812 "movi v27.8b, #16 \n" // Add 16 constant 1777 "1: 1813 "1: \n" 1778 1814 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 1779 1815 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1800 1836 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1801 1837 "movi v7.8b, #16 \n" // Add 16 constant 1802 "1: 1838 "1: \n" 1803 1839 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 1804 1840 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
… … 1824 1860 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 1825 1861 "movi v27.8b, #16 \n" // Add 16 constant 1826 "1: 1862 "1: \n" 1827 1863 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 1828 1864 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1848 1884 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 1849 1885 "movi v7.8b, #16 \n" // Add 16 constant 1850 "1: 1886 "1: \n" 1851 1887 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 1852 1888 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1871 1907 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 1872 1908 "movi v7.8b, #16 \n" // Add 16 constant 1873 "1: 1909 "1: \n" 1874 1910 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 1875 1911 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1894 1930 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1895 1931 "movi v7.8b, #16 \n" // Add 16 constant 1896 "1: 1932 "1: \n" 1897 1933 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 1898 1934 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1917 1953 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1918 1954 "movi v7.8b, #16 \n" // Add 16 constant 1919 "1: 1955 "1: \n" 1920 1956 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 1921 1957 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1940 1976 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 1941 1977 "movi v7.8b, #16 \n" // Add 16 constant 1942 "1: 1978 "1: \n" 1943 1979 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 1944 1980 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1975 2011 "dup v4.16b, %w5 \n" 1976 2012 // General purpose row blend. 1977 "1: 2013 "1: \n" 1978 2014 "ld1 {v0.16b}, [%1], #16 \n" 1979 2015 "ld1 {v1.16b}, [%2], #16 \n" … … 1990 2026 1991 2027 // Blend 50 / 50. 1992 "50: 2028 "50: \n" 1993 2029 "ld1 {v0.16b}, [%1], #16 \n" 1994 2030 "ld1 {v1.16b}, [%2], #16 \n" … … 2000 2036 2001 2037 // Blend 100 / 0 - Copy row unchanged. 2002 "100: 2038 "100: \n" 2003 2039 "ld1 {v0.16b}, [%1], #16 \n" 2004 2040 "subs %w3, %w3, #16 \n" … … 2006 2042 "b.gt 100b \n" 2007 2043 2008 "99: 2044 "99: \n" 2009 2045 : "+r"(dst_ptr), // %0 2010 2046 "+r"(src_ptr), // %1 … … 2026 2062 "b.lt 89f \n" 2027 2063 // Blend 8 pixels. 2028 "8: 2064 "8: \n" 2029 2065 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 2030 2066 // pixels … … 2049 2085 "b.ge 8b \n" 2050 2086 2051 "89: 2087 "89: \n" 2052 2088 "adds %w3, %w3, #8-1 \n" 2053 2089 "b.lt 99f \n" 2054 2090 2055 2091 // Blend 1 pixels. 2056 "1: 2092 "1: \n" 2057 2093 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. 2058 2094 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. … … 2074 2110 "b.ge 1b \n" 2075 2111 2076 "99: 2112 "99: \n" 2077 2113 2078 2114 : "+r"(src_argb0), // %0 … … 2089 2125 asm volatile( 2090 2126 // Attenuate 8 pixels. 2091 "1: 2127 "1: \n" 2092 2128 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2093 // pixels2094 2129 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2095 2130 "umull v4.8h, v0.8b, v3.8b \n" // b * a … … 2123 2158 2124 2159 // 8 pixel loop. 2125 "1: \n" 2126 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of 2127 // ARGB. 2160 "1: \n" 2161 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. 2128 2162 "subs %w1, %w1, #8 \n" // 8 processed per loop. 2129 2163 "uxtl v0.8h, v0.8b \n" // b (0 .. 
255) … … 2143 2177 "uqxtn v2.8b, v2.8h \n" 2144 2178 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB 2145 // pixels2146 2179 "b.gt 1b \n" 2147 2180 : "+r"(dst_argb), // %0 … … 2166 2199 2167 2200 // 8 pixel loop. 2168 "1: 2201 "1: \n" 2169 2202 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB 2170 // pixels.2171 2203 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2172 2204 "uxtl v4.8h, v4.8b \n" // b (0 .. 255) … … 2183 2215 "uqxtn v7.8b, v7.8h \n" 2184 2216 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB 2185 // pixels2186 2217 "b.gt 1b \n" 2187 2218 : "+r"(src_argb), // %0 … … 2200 2231 "movi v25.8b, #75 \n" // G * 0.58700 coefficient 2201 2232 "movi v26.8b, #38 \n" // R * 0.29900 coefficient 2202 "1: 2233 "1: \n" 2203 2234 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2204 // pixels.2205 2235 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2206 2236 "umull v4.8h, v0.8b, v24.8b \n" // B … … 2235 2265 "movi v29.8b, #98 \n" // BG coefficient 2236 2266 "movi v30.8b, #50 \n" // BR coefficient 2237 "1: 2267 "1: \n" 2238 2268 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. 2239 2269 "subs %w1, %w1, #8 \n" // 8 processed per loop. … … 2271 2301 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. 2272 2302 2273 "1: \n" 2274 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 2275 // pixels. 2303 "1: \n" 2304 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB 2276 2305 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2277 2306 "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit … … 2311 2340 "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R 2312 2341 "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A 2313 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 2314 // pixels. 2342 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB 2315 2343 "b.gt 1b \n" 2316 2344 : "+r"(src_argb), // %0 … … 2330 2358 asm volatile( 2331 2359 // 8 pixel loop. 2332 "1: 2360 "1: \n" 2333 2361 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2334 // pixels.2335 2362 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more 2336 // pixels.2337 2363 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2338 2364 "umull v0.8h, v0.8b, v4.8b \n" // multiply B … … 2345 2371 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A 2346 2372 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2347 // pixels 2348 "b.gt 1b \n" 2349 2373 "b.gt 1b \n" 2350 2374 : "+r"(src_argb0), // %0 2351 2375 "+r"(src_argb1), // %1 … … 2363 2387 asm volatile( 2364 2388 // 8 pixel loop. 2365 "1: 2389 "1: \n" 2366 2390 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2367 // pixels.2368 2391 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more 2369 // pixels.2370 2392 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2371 2393 "uqadd v0.8b, v0.8b, v4.8b \n" … … 2374 2396 "uqadd v3.8b, v3.8b, v7.8b \n" 2375 2397 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2376 // pixels 2377 "b.gt 1b \n" 2378 2398 "b.gt 1b \n" 2379 2399 : "+r"(src_argb0), // %0 2380 2400 "+r"(src_argb1), // %1 … … 2392 2412 asm volatile( 2393 2413 // 8 pixel loop. 2394 "1: 2414 "1: \n" 2395 2415 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2396 // pixels.2397 2416 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more 2398 // pixels.2399 2417 "subs %w3, %w3, #8 \n" // 8 processed per loop. 
2400 2418 "uqsub v0.8b, v0.8b, v4.8b \n" … … 2403 2421 "uqsub v3.8b, v3.8b, v7.8b \n" 2404 2422 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2405 // pixels 2406 "b.gt 1b \n" 2407 2423 "b.gt 1b \n" 2408 2424 : "+r"(src_argb0), // %0 2409 2425 "+r"(src_argb1), // %1 … … 2426 2442 "movi v3.8b, #255 \n" // alpha 2427 2443 // 8 pixel loop. 2428 "1: 2444 "1: \n" 2429 2445 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. 2430 2446 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. … … 2434 2450 "orr v2.8b, v0.8b, v0.8b \n" 2435 2451 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2436 // pixels2437 2452 "b.gt 1b \n" 2438 2453 : "+r"(src_sobelx), // %0 … … 2451 2466 asm volatile( 2452 2467 // 16 pixel loop. 2453 "1: 2468 "1: \n" 2454 2469 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. 2455 2470 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. … … 2478 2493 "movi v3.8b, #255 \n" // alpha 2479 2494 // 8 pixel loop. 2480 "1: 2495 "1: \n" 2481 2496 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. 2482 2497 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. … … 2484 2499 "uqadd v1.8b, v0.8b, v2.8b \n" // add 2485 2500 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2486 // pixels2487 2501 "b.gt 1b \n" 2488 2502 : "+r"(src_sobelx), // %0 … … 2504 2518 int width) { 2505 2519 asm volatile( 2506 "1: 2520 "1: \n" 2507 2521 "ld1 {v0.8b}, [%0],%5 \n" // top 2508 2522 "ld1 {v1.8b}, [%0],%6 \n" … … 2542 2556 int width) { 2543 2557 asm volatile( 2544 "1: 2558 "1: \n" 2545 2559 "ld1 {v0.8b}, [%0],%4 \n" // left 2546 2560 "ld1 {v1.8b}, [%1],%4 \n" … … 2573 2587 void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { 2574 2588 asm volatile( 2575 "1: 2589 "1: \n" 2576 2590 "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts 2577 2591 "subs %w2, %w2, #8 \n" // 8 pixels per loop … … 2593 2607 void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { 2594 2608 asm volatile( 2595 "1: 2609 "1: \n" 2596 2610 "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts 2597 2611 "subs %w2, %w2, #8 \n" // 8 pixels per loop … … 2613 2627 } 2614 2628 2629 float ScaleMaxSamples_NEON(const float* src, 2630 float* dst, 2631 float scale, 2632 int width) { 2633 float fmax; 2634 asm volatile( 2635 "movi v5.4s, #0 \n" // max 2636 "movi v6.4s, #0 \n" 2637 2638 "1: \n" 2639 "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples 2640 "subs %w2, %w2, #8 \n" // 8 processed per loop 2641 "fmul v3.4s, v1.4s, %4.s[0] \n" // scale 2642 "fmul v4.4s, v2.4s, %4.s[0] \n" // scale 2643 "fmax v5.4s, v5.4s, v1.4s \n" // max 2644 "fmax v6.4s, v6.4s, v2.4s \n" 2645 "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples 2646 "b.gt 1b \n" 2647 "fmax v5.4s, v5.4s, v6.4s \n" // max 2648 "fmaxv %s3, v5.4s \n" // signed max acculator 2649 : "+r"(src), // %0 2650 "+r"(dst), // %1 2651 "+r"(width), // %2 2652 "=w"(fmax) // %3 2653 : "w"(scale) // %4 2654 : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); 2655 return fmax; 2656 } 2657 2658 float ScaleSumSamples_NEON(const float* src, 2659 float* dst, 2660 float scale, 2661 int width) { 2662 float fsum; 2663 asm volatile( 2664 "movi v5.4s, #0 \n" // max 2665 "movi v6.4s, #0 \n" // max 2666 2667 "1: \n" 2668 "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples 2669 "subs %w2, %w2, #8 \n" // 8 processed per loop 2670 "fmul v3.4s, v1.4s, %4.s[0] \n" // scale 2671 "fmul v4.4s, v2.4s, %4.s[0] \n" 2672 "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares 2673 "fmla v6.4s, v2.4s, v2.4s \n" 2674 "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples 2675 "b.gt 1b \n" 2676 
"faddp v5.4s, v5.4s, v6.4s \n" 2677 "faddp v5.4s, v5.4s, v5.4s \n" 2678 "faddp %3.4s, v5.4s, v5.4s \n" // sum 2679 : "+r"(src), // %0 2680 "+r"(dst), // %1 2681 "+r"(width), // %2 2682 "=w"(fsum) // %3 2683 : "w"(scale) // %4 2684 : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); 2685 return fsum; 2686 } 2687 2688 void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { 2689 asm volatile( 2690 "1: \n" 2691 "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples 2692 "subs %w2, %w2, #8 \n" // 8 processed per loop 2693 "fmul v1.4s, v1.4s, %3.s[0] \n" // scale 2694 "fmul v2.4s, v2.4s, %3.s[0] \n" // scale 2695 "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples 2696 "b.gt 1b \n" 2697 : "+r"(src), // %0 2698 "+r"(dst), // %1 2699 "+r"(width) // %2 2700 : "w"(scale) // %3 2701 : "cc", "memory", "v1", "v2"); 2702 } 2703 2704 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 2705 void GaussCol_NEON(const uint16* src0, 2706 const uint16* src1, 2707 const uint16* src2, 2708 const uint16* src3, 2709 const uint16* src4, 2710 uint32* dst, 2711 int width) { 2712 asm volatile( 2713 "movi v6.8h, #4 \n" // constant 4 2714 "movi v7.8h, #6 \n" // constant 6 2715 2716 "1: \n" 2717 "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows 2718 "ld1 {v2.8h}, [%4], #16 \n" 2719 "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 2720 "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 2721 "ld1 {v2.8h}, [%1], #16 \n" 2722 "umlal v0.4s, v2.4h, v6.4h \n" // * 4 2723 "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 2724 "ld1 {v2.8h}, [%2], #16 \n" 2725 "umlal v0.4s, v2.4h, v7.4h \n" // * 6 2726 "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 2727 "ld1 {v2.8h}, [%3], #16 \n" 2728 "umlal v0.4s, v2.4h, v6.4h \n" // * 4 2729 "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 2730 "subs %w6, %w6, #8 \n" // 8 processed per loop 2731 "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples 2732 "b.gt 1b \n" 2733 : "+r"(src0), // %0 2734 "+r"(src1), // %1 2735 "+r"(src2), // %2 2736 "+r"(src3), // %3 2737 "+r"(src4), // %4 2738 "+r"(dst), // %5 2739 "+r"(width) // %6 2740 : 2741 : "cc", "memory", "v0", "v1", "v2", "v6", "v7"); 2742 } 2743 2744 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 2745 void GaussRow_NEON(const uint32* src, uint16* dst, int width) { 2746 const uint32* src1 = src + 1; 2747 const uint32* src2 = src + 2; 2748 const uint32* src3 = src + 3; 2749 asm volatile( 2750 "movi v6.4s, #4 \n" // constant 4 2751 "movi v7.4s, #6 \n" // constant 6 2752 2753 "1: \n" 2754 "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples 2755 "add v0.4s, v0.4s, v1.4s \n" // * 1 2756 "add v1.4s, v1.4s, v2.4s \n" // * 1 2757 "ld1 {v2.4s,v3.4s}, [%2], #32 \n" 2758 "mla v0.4s, v2.4s, v7.4s \n" // * 6 2759 "mla v1.4s, v3.4s, v7.4s \n" // * 6 2760 "ld1 {v2.4s,v3.4s}, [%1], #32 \n" 2761 "ld1 {v4.4s,v5.4s}, [%3], #32 \n" 2762 "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 2763 "add v3.4s, v3.4s, v5.4s \n" 2764 "mla v0.4s, v2.4s, v6.4s \n" // * 4 2765 "mla v1.4s, v3.4s, v6.4s \n" // * 4 2766 "subs %w5, %w5, #8 \n" // 8 processed per loop 2767 "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack 2768 "uqrshrn2 v0.8h, v1.4s, #8 \n" 2769 "st1 {v0.8h}, [%4], #16 \n" // store 8 samples 2770 "b.gt 1b \n" 2771 : "+r"(src), // %0 2772 "+r"(src1), // %1 2773 "+r"(src2), // %2 2774 "+r"(src3), // %3 2775 "+r"(dst), // %4 2776 "+r"(width) // %5 2777 : "r"(32LL) // %6 2778 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); 2779 } 2780 2615 2781 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 2616 2782 -
pjproject/trunk/third_party/yuv/source/row_win.cc
r5633 r5699 1411 1411 pavgb xmm2, xmm4 1412 1412 1413 // step 2 - convert to U and V1414 // from here down is very similar to Y code except1415 // instead of 16 different pixels, its 8 pixels of U and 8 of V1413 // step 2 - convert to U and V 1414 // from here down is very similar to Y code except 1415 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1416 1416 movdqa xmm1, xmm0 1417 1417 movdqa xmm3, xmm2 … … 1427 1427 paddb xmm0, xmm5 // -> unsigned 1428 1428 1429 // step 3 - store 8 U and 8 V values1429 // step 3 - store 8 U and 8 V values 1430 1430 movlps qword ptr [edx], xmm0 // U 1431 1431 movhps qword ptr [edx + edi], xmm0 // V … … 1483 1483 pavgb xmm2, xmm4 1484 1484 1485 // step 2 - convert to U and V1486 // from here down is very similar to Y code except1487 // instead of 16 different pixels, its 8 pixels of U and 8 of V1485 // step 2 - convert to U and V 1486 // from here down is very similar to Y code except 1487 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1488 1488 movdqa xmm1, xmm0 1489 1489 movdqa xmm3, xmm2 … … 1500 1500 packsswb xmm0, xmm1 1501 1501 1502 // step 3 - store 8 U and 8 V values1502 // step 3 - store 8 U and 8 V values 1503 1503 movlps qword ptr [edx], xmm0 // U 1504 1504 movhps qword ptr [edx + edi], xmm0 // V … … 1550 1550 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps 1551 1551 1552 // step 2 - convert to U and V1553 // from here down is very similar to Y code except1554 // instead of 32 different pixels, its 16 pixels of U and 16 of V1552 // step 2 - convert to U and V 1553 // from here down is very similar to Y code except 1554 // instead of 32 different pixels, its 16 pixels of U and 16 of V 1555 1555 vpmaddubsw ymm1, ymm0, ymm7 // U 1556 1556 vpmaddubsw ymm3, ymm2, ymm7 … … 1566 1566 vpaddb ymm0, ymm0, ymm5 // -> unsigned 1567 1567 1568 // step 3 - store 16 U and 16 V values1568 // step 3 - store 16 U and 16 V values 1569 1569 vextractf128 [edx], ymm0, 0 // U 1570 1570 vextractf128 [edx + edi], ymm0, 1 // V … … 1618 1618 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps 1619 1619 1620 // step 2 - convert to U and V1621 // from here down is very similar to Y code except1622 // instead of 32 different pixels, its 16 pixels of U and 16 of V1620 // step 2 - convert to U and V 1621 // from here down is very similar to Y code except 1622 // instead of 32 different pixels, its 16 pixels of U and 16 of V 1623 1623 vpmaddubsw ymm1, ymm0, ymm7 // U 1624 1624 vpmaddubsw ymm3, ymm2, ymm7 … … 1635 1635 vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw 1636 1636 1637 // step 3 - store 16 U and 16 V values1637 // step 3 - store 16 U and 16 V values 1638 1638 vextractf128 [edx], ymm0, 0 // U 1639 1639 vextractf128 [edx + edi], ymm0, 1 // V … … 1751 1751 pavgb xmm2, xmm4 1752 1752 1753 // step 2 - convert to U and V1754 // from here down is very similar to Y code except1755 // instead of 16 different pixels, its 8 pixels of U and 8 of V1753 // step 2 - convert to U and V 1754 // from here down is very similar to Y code except 1755 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1756 1756 movdqa xmm1, xmm0 1757 1757 movdqa xmm3, xmm2 … … 1767 1767 paddb xmm0, xmm5 // -> unsigned 1768 1768 1769 // step 3 - store 8 U and 8 V values1769 // step 3 - store 8 U and 8 V values 1770 1770 movlps qword ptr [edx], xmm0 // U 1771 1771 movhps qword ptr [edx + edi], xmm0 // V … … 1823 1823 pavgb xmm2, xmm4 1824 1824 1825 // step 2 - convert to U and V1826 // from here down is very similar to Y code except1827 // 
instead of 16 different pixels, its 8 pixels of U and 8 of V1825 // step 2 - convert to U and V 1826 // from here down is very similar to Y code except 1827 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1828 1828 movdqa xmm1, xmm0 1829 1829 movdqa xmm3, xmm2 … … 1839 1839 paddb xmm0, xmm5 // -> unsigned 1840 1840 1841 // step 3 - store 8 U and 8 V values1841 // step 3 - store 8 U and 8 V values 1842 1842 movlps qword ptr [edx], xmm0 // U 1843 1843 movhps qword ptr [edx + edi], xmm0 // V … … 1895 1895 pavgb xmm2, xmm4 1896 1896 1897 // step 2 - convert to U and V1898 // from here down is very similar to Y code except1899 // instead of 16 different pixels, its 8 pixels of U and 8 of V1897 // step 2 - convert to U and V 1898 // from here down is very similar to Y code except 1899 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1900 1900 movdqa xmm1, xmm0 1901 1901 movdqa xmm3, xmm2 … … 1911 1911 paddb xmm0, xmm5 // -> unsigned 1912 1912 1913 // step 3 - store 8 U and 8 V values1913 // step 3 - store 8 U and 8 V values 1914 1914 movlps qword ptr [edx], xmm0 // U 1915 1915 movhps qword ptr [edx + edi], xmm0 // V … … 2928 2928 packuswb xmm0, xmm0 // G 2929 2929 2930 // Step 2: Weave into ARGB2930 // Step 2: Weave into ARGB 2931 2931 punpcklbw xmm0, xmm0 // GG 2932 2932 movdqa xmm1, xmm0 … … 2976 2976 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 2977 2977 2978 // TODO(fbarchard): Weave alpha with unpack.2979 // Step 2: Weave into ARGB2978 // TODO(fbarchard): Weave alpha with unpack. 2979 // Step 2: Weave into ARGB 2980 2980 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates 2981 2981 vpermq ymm1, ymm1, 0xd8 … … 4068 4068 sub edi, esi 4069 4069 4070 // 8 pixel loop.4070 // 8 pixel loop. 4071 4071 convertloop8: 4072 4072 movq xmm0, qword ptr [esi] // alpha … … 4124 4124 sub edi, esi 4125 4125 4126 // 32 pixel loop.4126 // 32 pixel loop. 4127 4127 convertloop32: 4128 4128 vmovdqu ymm0, [esi] // alpha … … 4184 4184 jl convertloop4b // less than 4 pixels? 4185 4185 4186 // 4 pixel loop.4186 // 4 pixel loop. 4187 4187 convertloop4: 4188 4188 movdqu xmm3, [eax] // src argb … … 4213 4213 jl convertloop1b 4214 4214 4215 // 1 pixel loop.4215 // 1 pixel loop. 4216 4216 convertloop1: 4217 4217 movd xmm3, [eax] // src argb … … 5257 5257 packssdw xmm5, xmm5 // 16 bit shorts 5258 5258 5259 // 4 pixel loop small blocks.5259 // 4 pixel loop small blocks. 5260 5260 s4: 5261 5261 // top left … … 5299 5299 jmp l4b 5300 5300 5301 // 4 pixel loop5301 // 4 pixel loop 5302 5302 l4: 5303 5303 // top left … … 5351 5351 jl l1b 5352 5352 5353 // 1 pixel loop5353 // 1 pixel loop 5354 5354 l1: 5355 5355 movdqu xmm0, [eax] … … 5393 5393 jne l4b 5394 5394 5395 // 4 pixel loop5395 // 4 pixel loop 5396 5396 l4: 5397 5397 movdqu xmm2, [eax] // 4 argb pixels 16 bytes. … … 5439 5439 jl l1b 5440 5440 5441 // 1 pixel loop5441 // 1 pixel loop 5442 5442 l1: 5443 5443 movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. … … 5482 5482 jl l4b 5483 5483 5484 // setup for 4 pixel loop5484 // setup for 4 pixel loop 5485 5485 pshufd xmm7, xmm7, 0x44 // dup dudv 5486 5486 pshufd xmm5, xmm5, 0 // dup 4, stride … … 5494 5494 addps xmm4, xmm4 // dudv *= 4 5495 5495 5496 // 4 pixel loop5496 // 4 pixel loop 5497 5497 l4: 5498 5498 cvttps2dq xmm0, xmm2 // x, y float to int first 2 … … 5525 5525 jl l1b 5526 5526 5527 // 1 pixel loop5527 // 1 pixel loop 5528 5528 l1: 5529 5529 cvttps2dq xmm0, xmm2 // x, y float to int … … 5599 5599 jmp xloop99 5600 5600 5601 // Blend 50 / 50.5601 // Blend 50 / 50. 
5602 5602 xloop50: 5603 5603 vmovdqu ymm0, [esi] … … 5609 5609 jmp xloop99 5610 5610 5611 // Blend 100 / 0 - Copy row unchanged.5611 // Blend 100 / 0 - Copy row unchanged. 5612 5612 xloop100: 5613 5613 rep movsb … … 5639 5639 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 5640 5640 sub edi, esi 5641 // Dispatch to specialized filters if applicable.5641 // Dispatch to specialized filters if applicable. 5642 5642 cmp eax, 0 5643 5643 je xloop100 // 0 /256. Blend 100 / 0. … … 5679 5679 jmp xloop99 5680 5680 5681 // Blend 50 / 50.5681 // Blend 50 / 50. 5682 5682 xloop50: 5683 5683 movdqu xmm0, [esi] … … 5690 5690 jmp xloop99 5691 5691 5692 // Blend 100 / 0 - Copy row unchanged.5692 // Blend 100 / 0 - Copy row unchanged. 5693 5693 xloop100: 5694 5694 movdqu xmm0, [esi] … … 5785 5785 je shuf_2103 5786 5786 5787 // TODO(fbarchard): Use one source pointer and 3 offsets.5787 // TODO(fbarchard): Use one source pointer and 3 offsets. 5788 5788 shuf_any1: 5789 5789 movzx ebx, byte ptr [esi] … … 5972 5972 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. 5973 5973 5974 // 2 pixel loop.5974 // 2 pixel loop. 5975 5975 convertloop: 5976 5976 // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel … … 6073 6073 sub edx, eax 6074 6074 6075 // 8 pixel loop.6075 // 8 pixel loop. 6076 6076 convertloop: 6077 6077 movdqu xmm2, xmmword ptr [eax] // 8 shorts … … 6111 6111 sub edx, eax 6112 6112 6113 // 16 pixel loop.6113 // 16 pixel loop. 6114 6114 convertloop: 6115 6115 vmovdqu ymm2, [eax] // 16 shorts … … 6145 6145 sub edx, eax 6146 6146 6147 // 16 pixel loop.6147 // 16 pixel loop. 6148 6148 convertloop: 6149 6149 vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints … … 6253 6253 pxor xmm5, xmm5 6254 6254 6255 // 4 pixel loop.6255 // 4 pixel loop. 6256 6256 convertloop: 6257 6257 movdqu xmm0, xmmword ptr [eax] // generate luma ptr -
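Note on the row_win.cc hunks above: they only re-indent the comments inside the SSE2/AVX2 ARGB-to-UV and related loops, but the three numbered steps those comments mention are easy to lose in the assembly. The scalar sketch below shows the same structure for one output pair: average a 2x2 block, apply the U and V matrices, bias into unsigned range. The coefficients are symbolic placeholders (not the library's exact constants), the byte order assumed is libyuv's in-memory B,G,R,A, the rounding is simplified compared with the double pavgb in the vector code, and no clamping is shown (the SIMD code saturates when packing).
--------------------------------------------------------------------------------
#include <stdint.h>

/* One 2x2 block of ARGB (memory order B,G,R,A) -> one U and one V sample.
   ub..vr are fixed-point coefficients scaled by 256 (placeholders here). */
static void ARGBToUVRow_Sketch(const uint8_t* row0, const uint8_t* row1,
                               uint8_t* dst_u, uint8_t* dst_v, int width,
                               int ub, int ug, int ur,
                               int vb, int vg, int vr) {
  for (int x = 0; x + 1 < width; x += 2) {
    const uint8_t* p0 = row0 + x * 4;
    const uint8_t* p1 = row1 + x * 4;
    /* Step 1: average the 2x2 block (two rows, two columns). */
    int b = (p0[0] + p0[4] + p1[0] + p1[4] + 2) >> 2;
    int g = (p0[1] + p0[5] + p1[1] + p1[5] + 2) >> 2;
    int r = (p0[2] + p0[6] + p1[2] + p1[6] + 2) >> 2;
    /* Step 2: convert to U and V (fixed point, /256). */
    int u = (ub * b + ug * g + ur * r) >> 8;
    int v = (vb * b + vg * g + vr * r) >> 8;
    /* Step 3: bias into unsigned range; one U and one V per 2x2 block. */
    dst_u[x >> 1] = (uint8_t)(u + 128);
    dst_v[x >> 1] = (uint8_t)(v + 128);
  }
}
--------------------------------------------------------------------------------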
pjproject/trunk/third_party/yuv/source/scale.cc
r5633 r5699 372 372 } 373 373 #endif 374 #if defined(HAS_SCALEROWDOWN34_MSA) 375 if (TestCpuFlag(kCpuHasMSA)) { 376 if (!filtering) { 377 ScaleRowDown34_0 = ScaleRowDown34_Any_MSA; 378 ScaleRowDown34_1 = ScaleRowDown34_Any_MSA; 379 } else { 380 ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA; 381 ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA; 382 } 383 if (dst_width % 48 == 0) { 384 if (!filtering) { 385 ScaleRowDown34_0 = ScaleRowDown34_MSA; 386 ScaleRowDown34_1 = ScaleRowDown34_MSA; 387 } else { 388 ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA; 389 ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA; 390 } 391 } 392 } 393 #endif 374 394 #if defined(HAS_SCALEROWDOWN34_SSSE3) 375 395 if (TestCpuFlag(kCpuHasSSSE3)) { … … 803 823 int boxheight, 804 824 int x, 805 int ,825 int dx, 806 826 const uint16* src_ptr, 807 827 uint8* dst_ptr) { 808 828 int scaleval = 65536 / boxheight; 809 829 int i; 830 (void)dx; 810 831 src_ptr += (x >> 16); 811 832 for (i = 0; i < dst_width; ++i) { … … 1080 1101 } 1081 1102 #endif 1103 #if defined(HAS_SCALEFILTERCOLS_MSA) 1104 if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) { 1105 ScaleFilterCols = ScaleFilterCols_Any_MSA; 1106 if (IS_ALIGNED(dst_width, 16)) { 1107 ScaleFilterCols = ScaleFilterCols_MSA; 1108 } 1109 } 1110 #endif 1082 1111 if (y > max_y) { 1083 1112 y = max_y; … … 1278 1307 } 1279 1308 #endif 1309 #if defined(HAS_SCALEFILTERCOLS_MSA) 1310 if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) { 1311 ScaleFilterCols = ScaleFilterCols_Any_MSA; 1312 if (IS_ALIGNED(dst_width, 16)) { 1313 ScaleFilterCols = ScaleFilterCols_MSA; 1314 } 1315 } 1316 #endif 1280 1317 if (!filtering && src_width * 2 == dst_width && x < 0x8000) { 1281 1318 ScaleFilterCols = ScaleColsUp2_C; … … 1664 1701 return; 1665 1702 } 1666 if (dst_width == src_width ) {1703 if (dst_width == src_width && filtering != kFilterBox) { 1667 1704 int dy = FixedDiv(src_height, dst_height); 1668 1705 // Arbitrary scale vertically, but unscaled vertically. … … 1693 1730 } 1694 1731 if (4 * dst_width == src_width && 4 * dst_height == src_height && 1695 filtering != kFilterBilinear) {1732 (filtering == kFilterBox || filtering == kFilterNone)) { 1696 1733 // optimized, 1/4 1697 1734 ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, -
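Note on the new MSA blocks above: they follow the same two-level dispatch used for the other instruction sets. If the CPU flag is present, the _Any_ variant is selected first (SIMD body plus C tail for odd widths), then upgraded to the full-SIMD variant only when the output width is an exact multiple of the kernel's block size (16 for ScaleFilterCols_MSA, 48 for ScaleRowDown34_MSA). A generic sketch of that selection follows; the function-pointer type and names are illustrative, and the block size of 16 is a placeholder that varies per kernel.
--------------------------------------------------------------------------------
#include <stdint.h>

typedef void (*ScaleColsFn)(uint8_t* dst_ptr, const uint8_t* src_ptr,
                            int dst_width, int x, int dx);

/* Pick the C fallback, the "Any" variant, or the full-SIMD variant. */
static ScaleColsFn PickScaleCols(int cpu_has_msa, int dst_width,
                                 ScaleColsFn c_fn,
                                 ScaleColsFn any_fn,
                                 ScaleColsFn msa_fn) {
  ScaleColsFn fn = c_fn;        /* portable default */
  if (cpu_has_msa) {
    fn = any_fn;                /* SIMD body + C remainder, any width */
    if (dst_width % 16 == 0) {  /* block size is kernel-specific */
      fn = msa_fn;              /* pure SIMD path */
    }
  }
  return fn;
}
--------------------------------------------------------------------------------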
pjproject/trunk/third_party/yuv/source/scale_any.cc
r5633 r5699 34 34 CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) 35 35 #endif 36 #ifdef HAS_SCALEFILTERCOLS_MSA 37 CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) 38 #endif 36 39 #ifdef HAS_SCALEARGBCOLS_NEON 37 40 CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) 41 #endif 42 #ifdef HAS_SCALEARGBCOLS_MSA 43 CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) 38 44 #endif 39 45 #ifdef HAS_SCALEARGBFILTERCOLS_NEON … … 43 49 4, 44 50 3) 51 #endif 52 #ifdef HAS_SCALEARGBFILTERCOLS_MSA 53 CANY(ScaleARGBFilterCols_Any_MSA, 54 ScaleARGBFilterCols_MSA, 55 ScaleARGBFilterCols_C, 56 4, 57 7) 45 58 #endif 46 59 #undef CANY … … 229 242 23) 230 243 #endif 244 #ifdef HAS_SCALEROWDOWN34_MSA 245 SDANY(ScaleRowDown34_Any_MSA, 246 ScaleRowDown34_MSA, 247 ScaleRowDown34_C, 248 4 / 3, 249 1, 250 47) 251 SDANY(ScaleRowDown34_0_Box_Any_MSA, 252 ScaleRowDown34_0_Box_MSA, 253 ScaleRowDown34_0_Box_C, 254 4 / 3, 255 1, 256 47) 257 SDANY(ScaleRowDown34_1_Box_Any_MSA, 258 ScaleRowDown34_1_Box_MSA, 259 ScaleRowDown34_1_Box_C, 260 4 / 3, 261 1, 262 47) 263 #endif 231 264 #ifdef HAS_SCALEROWDOWN38_SSSE3 232 265 SDANY(ScaleRowDown38_Any_SSSE3, -
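For context, the CANY/SDANY wrappers instantiated above split a row into a SIMD-sized bulk plus a scalar tail. A simplified standalone version of what a 1-byte-per-pixel column wrapper such as ScaleFilterCols_Any_MSA amounts to (the function name, the explicit 16-pixel mask, and passing the kernels as pointers are illustrative):

#include <stdint.h>

typedef void (*ColsFn)(uint8_t* dst, const uint8_t* src, int dst_width,
                       int x, int dx);

/* Run the SIMD kernel on the largest multiple of the 16-pixel tile,
 * then finish the remaining 0..15 pixels with the C version, advancing
 * the 16.16 fixed-point x position past the pixels already produced. */
static void ScaleColsAnySketch(uint8_t* dst, const uint8_t* src, int dst_width,
                               int x, int dx, ColsFn simd_fn, ColsFn c_fn) {
  int r = dst_width & 15;        /* scalar remainder */
  int n = dst_width & ~15;       /* SIMD bulk */
  if (n > 0) {
    simd_fn(dst, src, n, x, dx);
  }
  c_fn(dst + n, src, r, x + n * dx, dx);
}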
pjproject/trunk/third_party/yuv/source/scale_argb.cc
r5633 r5699 337 337 } 338 338 #endif 339 #if defined(HAS_SCALEARGBFILTERCOLS_MSA) 340 if (TestCpuFlag(kCpuHasMSA)) { 341 ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; 342 if (IS_ALIGNED(dst_width, 8)) { 343 ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; 344 } 345 } 346 #endif 339 347 // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. 340 348 // Allocate a row of ARGB. … … 443 451 } 444 452 #endif 453 #if defined(HAS_SCALEARGBFILTERCOLS_MSA) 454 if (filtering && TestCpuFlag(kCpuHasMSA)) { 455 ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; 456 if (IS_ALIGNED(dst_width, 8)) { 457 ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; 458 } 459 } 460 #endif 445 461 #if defined(HAS_SCALEARGBCOLS_SSE2) 446 462 if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { … … 453 469 if (IS_ALIGNED(dst_width, 8)) { 454 470 ScaleARGBFilterCols = ScaleARGBCols_NEON; 471 } 472 } 473 #endif 474 #if defined(HAS_SCALEARGBCOLS_MSA) 475 if (!filtering && TestCpuFlag(kCpuHasMSA)) { 476 ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; 477 if (IS_ALIGNED(dst_width, 4)) { 478 ScaleARGBFilterCols = ScaleARGBCols_MSA; 455 479 } 456 480 } … … 644 668 } 645 669 #endif 670 #if defined(HAS_SCALEARGBFILTERCOLS_MSA) 671 if (filtering && TestCpuFlag(kCpuHasMSA)) { 672 ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; 673 if (IS_ALIGNED(dst_width, 8)) { 674 ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; 675 } 676 } 677 #endif 646 678 #if defined(HAS_SCALEARGBCOLS_SSE2) 647 679 if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { … … 654 686 if (IS_ALIGNED(dst_width, 8)) { 655 687 ScaleARGBFilterCols = ScaleARGBCols_NEON; 688 } 689 } 690 #endif 691 #if defined(HAS_SCALEARGBCOLS_MSA) 692 if (!filtering && TestCpuFlag(kCpuHasMSA)) { 693 ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; 694 if (IS_ALIGNED(dst_width, 4)) { 695 ScaleARGBFilterCols = ScaleARGBCols_MSA; 656 696 } 657 697 } … … 777 817 if (IS_ALIGNED(dst_width, 8)) { 778 818 ScaleARGBCols = ScaleARGBCols_NEON; 819 } 820 } 821 #endif 822 #if defined(HAS_SCALEARGBCOLS_MSA) 823 if (TestCpuFlag(kCpuHasMSA)) { 824 ScaleARGBCols = ScaleARGBCols_Any_MSA; 825 if (IS_ALIGNED(dst_width, 4)) { 826 ScaleARGBCols = ScaleARGBCols_MSA; 779 827 } 780 828 } -
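The ScaleARGBCols_MSA kernel hooked up above vectorizes plain nearest-neighbour column sampling of 32-bit ARGB words. Its scalar equivalent, matching the LOAD_INDEXED_DATA gathers of src[x >> 16] in scale_msa.cc (the function name here is illustrative):

#include <stdint.h>

/* Each destination ARGB pixel is the 32-bit source word at the 16.16
 * fixed-point position x >> 16; x steps by dx per output pixel.
 * Assumes ARGB rows are 4-byte aligned, as libyuv buffers are. */
static void ScaleARGBColsSketch(uint8_t* dst_argb, const uint8_t* src_argb,
                                int dst_width, int x, int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}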
pjproject/trunk/third_party/yuv/source/scale_common.cc
r5633 r5699 1307 1307 #undef CENTERSTART 1308 1308 1309 // Read 8x2 upsample with filtering and write 16x1. 1310 // actually reads an extra pixel, so 9x2. 1311 void ScaleRowUp2_16_C(const uint16* src_ptr, 1312 ptrdiff_t src_stride, 1313 uint16* dst, 1314 int dst_width) { 1315 const uint16* src2 = src_ptr + src_stride; 1316 1317 int x; 1318 for (x = 0; x < dst_width - 1; x += 2) { 1319 uint16 p0 = src_ptr[0]; 1320 uint16 p1 = src_ptr[1]; 1321 uint16 p2 = src2[0]; 1322 uint16 p3 = src2[1]; 1323 dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; 1324 dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4; 1325 ++src_ptr; 1326 ++src2; 1327 dst += 2; 1328 } 1329 if (dst_width & 1) { 1330 uint16 p0 = src_ptr[0]; 1331 uint16 p1 = src_ptr[1]; 1332 uint16 p2 = src2[0]; 1333 uint16 p3 = src2[1]; 1334 dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; 1335 } 1336 } 1337 1309 1338 #ifdef __cplusplus 1310 1339 } // extern "C" -
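A quick sanity check on the new ScaleRowUp2_16_C above: the two tap sets (9,3,3,1) and (3,9,1,3) both sum to 16, so after the +8 rounding and >>4 a flat input stays flat and each output is pulled toward its nearest source sample. Small standalone example with arbitrary sample values:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* One 2x2 neighbourhood: source row = {100, 200}, next row = {100, 200}. */
  uint16_t p0 = 100, p1 = 200, p2 = 100, p3 = 200;
  uint16_t d0 = (uint16_t)((p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4);
  uint16_t d1 = (uint16_t)((p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4);
  printf("%u %u\n", d0, d1); /* prints "125 175": the left output leans toward
                                p0/p2, the right output toward p1/p3 */
  return 0;
}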
pjproject/trunk/third_party/yuv/source/scale_msa.cc
r5633 r5699 21 21 extern "C" { 22 22 #endif 23 24 #define LOAD_INDEXED_DATA(srcp, indx0, out0) \ 25 { \ 26 out0[0] = srcp[indx0[0]]; \ 27 out0[1] = srcp[indx0[1]]; \ 28 out0[2] = srcp[indx0[2]]; \ 29 out0[3] = srcp[indx0[3]]; \ 30 } 23 31 24 32 void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, … … 546 554 } 547 555 556 void ScaleFilterCols_MSA(uint8* dst_ptr, 557 const uint8* src_ptr, 558 int dst_width, 559 int x, 560 int dx) { 561 int j; 562 v4i32 vec_x = __msa_fill_w(x); 563 v4i32 vec_dx = __msa_fill_w(dx); 564 v4i32 vec_const = {0, 1, 2, 3}; 565 v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 566 v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 567 v8u16 reg0, reg1; 568 v16u8 dst0; 569 v4i32 const_0xFFFF = __msa_fill_w(0xFFFF); 570 v4i32 const_0x40 = __msa_fill_w(0x40); 571 572 vec0 = vec_dx * vec_const; 573 vec1 = vec_dx * 4; 574 vec_x += vec0; 575 576 for (j = 0; j < dst_width - 1; j += 16) { 577 vec2 = vec_x >> 16; 578 vec6 = vec_x & const_0xFFFF; 579 vec_x += vec1; 580 vec3 = vec_x >> 16; 581 vec7 = vec_x & const_0xFFFF; 582 vec_x += vec1; 583 vec4 = vec_x >> 16; 584 vec8 = vec_x & const_0xFFFF; 585 vec_x += vec1; 586 vec5 = vec_x >> 16; 587 vec9 = vec_x & const_0xFFFF; 588 vec_x += vec1; 589 vec6 >>= 9; 590 vec7 >>= 9; 591 vec8 >>= 9; 592 vec9 >>= 9; 593 LOAD_INDEXED_DATA(src_ptr, vec2, tmp0); 594 LOAD_INDEXED_DATA(src_ptr, vec3, tmp1); 595 LOAD_INDEXED_DATA(src_ptr, vec4, tmp2); 596 LOAD_INDEXED_DATA(src_ptr, vec5, tmp3); 597 vec2 += 1; 598 vec3 += 1; 599 vec4 += 1; 600 vec5 += 1; 601 LOAD_INDEXED_DATA(src_ptr, vec2, tmp4); 602 LOAD_INDEXED_DATA(src_ptr, vec3, tmp5); 603 LOAD_INDEXED_DATA(src_ptr, vec4, tmp6); 604 LOAD_INDEXED_DATA(src_ptr, vec5, tmp7); 605 tmp4 -= tmp0; 606 tmp5 -= tmp1; 607 tmp6 -= tmp2; 608 tmp7 -= tmp3; 609 tmp4 *= vec6; 610 tmp5 *= vec7; 611 tmp6 *= vec8; 612 tmp7 *= vec9; 613 tmp4 += const_0x40; 614 tmp5 += const_0x40; 615 tmp6 += const_0x40; 616 tmp7 += const_0x40; 617 tmp4 >>= 7; 618 tmp5 >>= 7; 619 tmp6 >>= 7; 620 tmp7 >>= 7; 621 tmp0 += tmp4; 622 tmp1 += tmp5; 623 tmp2 += tmp6; 624 tmp3 += tmp7; 625 reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); 626 reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); 627 dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); 628 __msa_st_b(dst0, dst_ptr, 0); 629 dst_ptr += 16; 630 } 631 } 632 633 void ScaleARGBCols_MSA(uint8* dst_argb, 634 const uint8* src_argb, 635 int dst_width, 636 int x, 637 int dx) { 638 const uint32* src = (const uint32*)(src_argb); 639 uint32* dst = (uint32*)(dst_argb); 640 int j; 641 v4i32 x_vec = __msa_fill_w(x); 642 v4i32 dx_vec = __msa_fill_w(dx); 643 v4i32 const_vec = {0, 1, 2, 3}; 644 v4i32 vec0, vec1, vec2; 645 v4i32 dst0; 646 647 vec0 = dx_vec * const_vec; 648 vec1 = dx_vec * 4; 649 x_vec += vec0; 650 651 for (j = 0; j < dst_width; j += 4) { 652 vec2 = x_vec >> 16; 653 x_vec += vec1; 654 LOAD_INDEXED_DATA(src, vec2, dst0); 655 __msa_st_w(dst0, dst, 0); 656 dst += 4; 657 } 658 } 659 660 void ScaleARGBFilterCols_MSA(uint8* dst_argb, 661 const uint8* src_argb, 662 int dst_width, 663 int x, 664 int dx) { 665 const uint32* src = (const uint32*)(src_argb); 666 int j; 667 v4u32 src0, src1, src2, src3; 668 v4u32 vec0, vec1, vec2, vec3; 669 v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; 670 v16u8 mult0, mult1, mult2, mult3; 671 v8u16 tmp0, tmp1, tmp2, tmp3; 672 v16u8 dst0, dst1; 673 v4u32 vec_x = (v4u32)__msa_fill_w(x); 674 v4u32 vec_dx = (v4u32)__msa_fill_w(dx); 675 v4u32 vec_const = {0, 1, 2, 3}; 676 v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f); 677 678 
vec0 = vec_dx * vec_const; 679 vec1 = vec_dx * 4; 680 vec_x += vec0; 681 682 for (j = 0; j < dst_width - 1; j += 8) { 683 vec2 = vec_x >> 16; 684 reg0 = (v16u8)(vec_x >> 9); 685 vec_x += vec1; 686 vec3 = vec_x >> 16; 687 reg1 = (v16u8)(vec_x >> 9); 688 vec_x += vec1; 689 reg0 = reg0 & const_0x7f; 690 reg1 = reg1 & const_0x7f; 691 reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0); 692 reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0); 693 reg2 = reg0 ^ const_0x7f; 694 reg3 = reg1 ^ const_0x7f; 695 mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2); 696 mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2); 697 mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3); 698 mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3); 699 LOAD_INDEXED_DATA(src, vec2, src0); 700 LOAD_INDEXED_DATA(src, vec3, src1); 701 vec2 += 1; 702 vec3 += 1; 703 LOAD_INDEXED_DATA(src, vec2, src2); 704 LOAD_INDEXED_DATA(src, vec3, src3); 705 reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); 706 reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); 707 reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); 708 reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); 709 tmp0 = __msa_dotp_u_h(reg4, mult0); 710 tmp1 = __msa_dotp_u_h(reg5, mult1); 711 tmp2 = __msa_dotp_u_h(reg6, mult2); 712 tmp3 = __msa_dotp_u_h(reg7, mult3); 713 tmp0 >>= 7; 714 tmp1 >>= 7; 715 tmp2 >>= 7; 716 tmp3 >>= 7; 717 dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 718 dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); 719 __msa_st_b(dst0, dst_argb, 0); 720 __msa_st_b(dst1, dst_argb, 16); 721 dst_argb += 32; 722 } 723 } 724 725 void ScaleRowDown34_MSA(const uint8* src_ptr, 726 ptrdiff_t src_stride, 727 uint8* dst, 728 int dst_width) { 729 int x; 730 (void)src_stride; 731 v16u8 src0, src1, src2, src3; 732 v16u8 vec0, vec1, vec2; 733 v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20}; 734 v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25}; 735 v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20, 736 21, 23, 24, 25, 27, 28, 29, 31}; 737 738 assert((dst_width % 3 == 0) && (dst_width > 0)); 739 740 for (x = 0; x < dst_width; x += 48) { 741 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); 742 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); 743 src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); 744 src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); 745 vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); 746 vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1); 747 vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2); 748 __msa_st_b((v16i8)vec0, dst, 0); 749 __msa_st_b((v16i8)vec1, dst, 16); 750 __msa_st_b((v16i8)vec2, dst, 32); 751 src_ptr += 64; 752 dst += 48; 753 } 754 } 755 756 void ScaleRowDown34_0_Box_MSA(const uint8* src_ptr, 757 ptrdiff_t src_stride, 758 uint8* d, 759 int dst_width) { 760 const uint8* s = src_ptr; 761 const uint8* t = src_ptr + src_stride; 762 int x; 763 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; 764 v16u8 vec0, vec1, vec2, vec3, vec4, vec5; 765 v16u8 vec6, vec7, vec8, vec9, vec10, vec11; 766 v8i16 reg0, reg1, reg2, reg3, reg4, reg5; 767 v8i16 reg6, reg7, reg8, reg9, reg10, reg11; 768 v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; 769 v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; 770 v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; 771 v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; 772 v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, 773 16, 17, 17, 18, 18, 19, 20, 21}; 774 v16i8 mask2 = 
{5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; 775 v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; 776 v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; 777 v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; 778 779 assert((dst_width % 3 == 0) && (dst_width > 0)); 780 781 for (x = 0; x < dst_width; x += 48) { 782 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); 783 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); 784 src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); 785 src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); 786 src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); 787 src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); 788 src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); 789 src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); 790 vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); 791 vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); 792 vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); 793 vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); 794 vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); 795 vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); 796 vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); 797 vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); 798 vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); 799 vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); 800 vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); 801 vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); 802 reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); 803 reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); 804 reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); 805 reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); 806 reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); 807 reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); 808 reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); 809 reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); 810 reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); 811 reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); 812 reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); 813 reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); 814 reg0 = __msa_srar_h(reg0, shft0); 815 reg1 = __msa_srar_h(reg1, shft1); 816 reg2 = __msa_srar_h(reg2, shft2); 817 reg3 = __msa_srar_h(reg3, shft0); 818 reg4 = __msa_srar_h(reg4, shft1); 819 reg5 = __msa_srar_h(reg5, shft2); 820 reg6 = __msa_srar_h(reg6, shft0); 821 reg7 = __msa_srar_h(reg7, shft1); 822 reg8 = __msa_srar_h(reg8, shft2); 823 reg9 = __msa_srar_h(reg9, shft0); 824 reg10 = __msa_srar_h(reg10, shft1); 825 reg11 = __msa_srar_h(reg11, shft2); 826 reg0 = reg0 * 3 + reg6; 827 reg1 = reg1 * 3 + reg7; 828 reg2 = reg2 * 3 + reg8; 829 reg3 = reg3 * 3 + reg9; 830 reg4 = reg4 * 3 + reg10; 831 reg5 = reg5 * 3 + reg11; 832 reg0 = __msa_srari_h(reg0, 2); 833 reg1 = __msa_srari_h(reg1, 2); 834 reg2 = __msa_srari_h(reg2, 2); 835 reg3 = __msa_srari_h(reg3, 2); 836 reg4 = __msa_srari_h(reg4, 2); 837 reg5 = __msa_srari_h(reg5, 2); 838 dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); 839 dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); 840 dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); 841 __msa_st_b((v16i8)dst0, d, 0); 842 __msa_st_b((v16i8)dst1, d, 16); 843 __msa_st_b((v16i8)dst2, d, 32); 844 s += 64; 845 t += 64; 846 d += 48; 847 } 848 } 849 850 void ScaleRowDown34_1_Box_MSA(const uint8* src_ptr, 851 ptrdiff_t src_stride, 852 uint8* d, 853 int dst_width) { 854 const uint8* s = src_ptr; 855 const uint8* t = src_ptr + src_stride; 856 int x; 857 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; 858 v16u8 vec0, vec1, vec2, vec3, vec4, vec5; 859 v16u8 vec6, vec7, vec8, vec9, 
vec10, vec11; 860 v8i16 reg0, reg1, reg2, reg3, reg4, reg5; 861 v8i16 reg6, reg7, reg8, reg9, reg10, reg11; 862 v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; 863 v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; 864 v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; 865 v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; 866 v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, 867 16, 17, 17, 18, 18, 19, 20, 21}; 868 v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; 869 v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; 870 v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; 871 v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; 872 873 assert((dst_width % 3 == 0) && (dst_width > 0)); 874 875 for (x = 0; x < dst_width; x += 48) { 876 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); 877 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); 878 src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); 879 src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); 880 src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); 881 src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); 882 src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); 883 src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); 884 vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); 885 vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); 886 vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); 887 vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); 888 vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); 889 vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); 890 vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); 891 vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); 892 vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); 893 vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); 894 vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); 895 vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); 896 reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); 897 reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); 898 reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); 899 reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); 900 reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); 901 reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); 902 reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); 903 reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); 904 reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); 905 reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); 906 reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); 907 reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); 908 reg0 = __msa_srar_h(reg0, shft0); 909 reg1 = __msa_srar_h(reg1, shft1); 910 reg2 = __msa_srar_h(reg2, shft2); 911 reg3 = __msa_srar_h(reg3, shft0); 912 reg4 = __msa_srar_h(reg4, shft1); 913 reg5 = __msa_srar_h(reg5, shft2); 914 reg6 = __msa_srar_h(reg6, shft0); 915 reg7 = __msa_srar_h(reg7, shft1); 916 reg8 = __msa_srar_h(reg8, shft2); 917 reg9 = __msa_srar_h(reg9, shft0); 918 reg10 = __msa_srar_h(reg10, shft1); 919 reg11 = __msa_srar_h(reg11, shft2); 920 reg0 += reg6; 921 reg1 += reg7; 922 reg2 += reg8; 923 reg3 += reg9; 924 reg4 += reg10; 925 reg5 += reg11; 926 reg0 = __msa_srari_h(reg0, 1); 927 reg1 = __msa_srari_h(reg1, 1); 928 reg2 = __msa_srari_h(reg2, 1); 929 reg3 = __msa_srari_h(reg3, 1); 930 reg4 = __msa_srari_h(reg4, 1); 931 reg5 = __msa_srari_h(reg5, 1); 932 dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); 933 dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); 934 dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); 935 __msa_st_b((v16i8)dst0, d, 0); 936 __msa_st_b((v16i8)dst1, d, 16); 937 
__msa_st_b((v16i8)dst2, d, 32); 938 s += 64; 939 t += 64; 940 d += 48; 941 } 942 } 943 548 944 #ifdef __cplusplus 549 945 } // extern "C" -
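Scalar view of the fixed-point math that ScaleFilterCols_MSA above vectorizes 16 pixels at a time: x is a 16.16 position, the fraction is reduced to 7 bits (>> 9), and each output interpolates between two neighbouring source bytes with +0x40 rounding. Sketch only; the final-pixel boundary handling, which the real code does outside the vector loop, is omitted, and the function name is illustrative:

#include <stdint.h>

static void ScaleFilterColsSketch(uint8_t* dst, const uint8_t* src,
                                  int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;
    int frac = (x & 0xffff) >> 9;   /* 7-bit fraction, 0..127 */
    int a = src[xi];
    int b = src[xi + 1];            /* reads one past xi; see boundary note */
    dst[j] = (uint8_t)(a + (((b - a) * frac + 0x40) >> 7));
    x += dx;
  }
}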
pjproject/trunk/third_party/yuv/source/scale_neon.cc
r5633 r5699 30 30 (void)src_stride; 31 31 asm volatile( 32 "1: 32 "1: \n" 33 33 // load even pixels into q0, odd into q1 34 34 "vld2.8 {q0, q1}, [%0]! \n" … … 51 51 (void)src_stride; 52 52 asm volatile( 53 "1: \n" 54 "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post 55 // inc 53 "1: \n" 54 "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels 56 55 "subs %2, %2, #16 \n" // 16 processed per loop 57 "vpaddl.u8 q0, q0 \n" // add adjacent 58 "vpaddl.u8 q1, q1 \n" 59 "vrshrn.u16 d0, q0, #1 \n" // downshift, round and 60 // pack 61 "vrshrn.u16 d1, q1, #1 \n" 56 "vrhadd.u8 q0, q0, q1 \n" // rounding half add 62 57 "vst1.8 {q0}, [%1]! \n" 63 58 "bgt 1b \n" … … 78 73 // change the stride to row 2 pointer 79 74 "add %1, %0 \n" 80 "1: 75 "1: \n" 81 76 "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc 82 77 "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc … … 107 102 (void)src_stride; 108 103 asm volatile( 109 "1: 104 "1: \n" 110 105 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 111 106 "subs %2, %2, #8 \n" // 8 processed per loop … … 127 122 const uint8* src_ptr3 = src_ptr + src_stride * 3; 128 123 asm volatile( 129 "1: 124 "1: \n" 130 125 "vld1.8 {q0}, [%0]! \n" // load up 16x4 131 126 "vld1.8 {q1}, [%3]! \n" … … 161 156 (void)src_stride; 162 157 asm volatile( 163 "1: 164 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0165 "subs %2, %2, #24 \n"166 "vmov d2, d3 \n" // order d0, d1, d2167 "vst3.8 {d0, d1, d2}, [%1]! \n"168 "bgt 1b \n"158 "1: \n" 159 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 160 "subs %2, %2, #24 \n" 161 "vmov d2, d3 \n" // order d0, d1, d2 162 "vst3.8 {d0, d1, d2}, [%1]! \n" 163 "bgt 1b \n" 169 164 : "+r"(src_ptr), // %0 170 165 "+r"(dst_ptr), // %1 … … 181 176 "vmov.u8 d24, #3 \n" 182 177 "add %3, %0 \n" 183 "1: 178 "1: \n" 184 179 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 185 180 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 … … 238 233 "vmov.u8 d24, #3 \n" 239 234 "add %3, %0 \n" 240 "1: 235 "1: \n" 241 236 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 242 237 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 … … 286 281 asm volatile( 287 282 "vld1.8 {q3}, [%3] \n" 288 "1: 283 "1: \n" 289 284 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" 290 285 "subs %2, %2, #12 \n" … … 313 308 "vld1.8 {q15}, [%7] \n" 314 309 "add %3, %0 \n" 315 "1: 310 "1: \n" 316 311 317 312 // d0 = 00 40 01 41 02 42 03 43 … … 422 417 "vld1.8 {q14}, [%5] \n" 423 418 "add %3, %0 \n" 424 "1: 419 "1: \n" 425 420 426 421 // d0 = 00 40 01 41 02 42 03 43 … … 514 509 const uint8* src_tmp; 515 510 asm volatile( 516 "1: 511 "1: \n" 517 512 "mov %0, %1 \n" 518 513 "mov r12, %5 \n" 519 514 "veor q2, q2, q2 \n" 520 515 "veor q3, q3, q3 \n" 521 "2: 516 "2: \n" 522 517 // load 16 pixels into q0 523 518 "vld1.8 {q0}, [%0], %3 \n" … … 541 536 } 542 537 543 // clang-format off544 538 // TODO(Yang Zhang): Investigate less load instructions for 545 539 // the x/dx stepping 546 #define LOAD2_DATA8_LANE(n) \ 547 "lsr %5, %3, #16 \n" \ 548 "add %6, %1, %5 \n" \ 549 "add %3, %3, %4 \n" \ 550 "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" 551 // clang-format on 540 #define LOAD2_DATA8_LANE(n) \ 541 "lsr %5, %3, #16 \n" \ 542 "add %6, %1, %5 \n" \ 543 "add %3, %3, %4 \n" \ 544 "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n" 552 545 553 546 // The NEON version mimics this formula (from row_common.cc): … … 640 633 "vdup.8 d4, %4 \n" 641 634 // General purpose row blend. 642 "1: 635 "1: \n" 643 636 "vld1.8 {q0}, [%1]! \n" 644 637 "vld1.8 {q1}, [%2]! \n" … … 655 648 656 649 // Blend 25 / 75. 
657 "25: 650 "25: \n" 658 651 "vld1.8 {q0}, [%1]! \n" 659 652 "vld1.8 {q1}, [%2]! \n" … … 666 659 667 660 // Blend 50 / 50. 668 "50: 661 "50: \n" 669 662 "vld1.8 {q0}, [%1]! \n" 670 663 "vld1.8 {q1}, [%2]! \n" … … 676 669 677 670 // Blend 75 / 25. 678 "75: 671 "75: \n" 679 672 "vld1.8 {q1}, [%1]! \n" 680 673 "vld1.8 {q0}, [%2]! \n" … … 687 680 688 681 // Blend 100 / 0 - Copy row unchanged. 689 "100: 682 "100: \n" 690 683 "vld1.8 {q0}, [%1]! \n" 691 684 "subs %3, %3, #16 \n" … … 693 686 "bgt 100b \n" 694 687 695 "99: 688 "99: \n" 696 689 "vst1.8 {d1[7]}, [%0] \n" 697 690 : "+r"(dst_ptr), // %0 … … 710 703 (void)src_stride; 711 704 asm volatile( 712 "1: \n" 713 // load even pixels into q0, odd into q1 714 "vld2.32 {q0, q1}, [%0]! \n" 715 "vld2.32 {q2, q3}, [%0]! \n" 705 "1: \n" 706 "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 707 "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB 716 708 "subs %2, %2, #8 \n" // 8 processed per loop 717 "v st1.8 {q1}, [%1]! \n" // store odd pixels718 "vst 1.8 {q3}, [%1]! \n"709 "vmov q2, q1 \n" // load next 8 ARGB 710 "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels 719 711 "bgt 1b \n" 720 712 : "+r"(src_ptr), // %0 … … 725 717 ); 726 718 } 719 720 // 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! 721 // 4a: 3e04 subs r6, #4 722 // 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! 723 // 50: ef64 21f4 vorr q9, q10, q10 724 // 54: f942 038d vst2.32 {d16-d19}, [r2]! 725 // 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46> 727 726 728 727 void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, … … 732 731 (void)src_stride; 733 732 asm volatile( 734 "1: \n" 735 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 736 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB 737 // pixels. 733 "1: \n" 734 "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 735 "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB 738 736 "subs %2, %2, #8 \n" // 8 processed per loop 739 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 740 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 741 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. 742 "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. 743 "vrshrn.u16 d0, q0, #1 \n" // downshift, round and 744 // pack 745 "vrshrn.u16 d1, q1, #1 \n" 746 "vrshrn.u16 d2, q2, #1 \n" 747 "vrshrn.u16 d3, q3, #1 \n" 748 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" 737 "vrhadd.u8 q0, q0, q1 \n" // rounding half add 738 "vrhadd.u8 q1, q2, q3 \n" // rounding half add 739 "vst2.32 {q0, q1}, [%1]! \n" 749 740 "bgt 1b \n" 750 741 : "+r"(src_argb), // %0 … … 763 754 // change the stride to row 2 pointer 764 755 "add %1, %1, %0 \n" 765 "1: 756 "1: \n" 766 757 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 767 758 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB 768 // pixels.769 759 "subs %3, %3, #8 \n" // 8 processed per loop. 770 760 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. … … 773 763 "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. 774 764 "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB 775 // pixels.776 765 "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB 777 // pixels.778 766 "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. 779 767 "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. 780 768 "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. 781 769 "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. 
782 "vrshrn.u16 d0, q0, #2 \n" // downshift, round and 783 // pack 770 "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes 784 771 "vrshrn.u16 d1, q1, #2 \n" 785 772 "vrshrn.u16 d2, q2, #2 \n" … … 805 792 asm volatile( 806 793 "mov r12, %3, lsl #2 \n" 807 "1: 794 "1: \n" 808 795 "vld1.32 {d0[0]}, [%0], r12 \n" 809 796 "vld1.32 {d0[1]}, [%0], r12 \n" … … 830 817 "mov r12, %4, lsl #2 \n" 831 818 "add %1, %1, %0 \n" 832 "1: \n" 833 "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 834 // 2x1 819 "1: \n" 820 "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 835 821 "vld1.8 {d1}, [%1], r12 \n" 836 822 "vld1.8 {d2}, [%0], r12 \n" … … 861 847 } 862 848 863 // clang-format off864 849 // TODO(Yang Zhang): Investigate less load instructions for 865 850 // the x/dx stepping 866 #define LOAD1_DATA32_LANE(dn, n) 867 "lsr %5, %3, #16 \n" 868 "add %6, %1, %5, lsl #2 \n" 869 "add %3, %3, %4 \n" 851 #define LOAD1_DATA32_LANE(dn, n) \ 852 "lsr %5, %3, #16 \n" \ 853 "add %6, %1, %5, lsl #2 \n" \ 854 "add %3, %3, %4 \n" \ 870 855 "vld1.32 {" #dn "[" #n "]}, [%6] \n" 871 // clang-format on872 856 873 857 void ScaleARGBCols_NEON(uint8* dst_argb, … … 879 863 const uint8* src_tmp = src_argb; 880 864 asm volatile( 881 "1: \n" LOAD1_DATA32_LANE( 882 d0, 0) LOAD1_DATA32_LANE(d0, 1) LOAD1_DATA32_LANE(d1, 0) 883 LOAD1_DATA32_LANE(d1, 1) LOAD1_DATA32_LANE(d2, 0) LOAD1_DATA32_LANE( 884 d2, 1) LOAD1_DATA32_LANE(d3, 0) LOAD1_DATA32_LANE(d3, 1) 885 886 "vst1.32 {q0, q1}, [%0]! \n" // store pixels 887 "subs %2, %2, #8 \n" // 8 processed per 888 // loop 889 "bgt 1b \n" 865 "1: \n" 866 // clang-format off 867 LOAD1_DATA32_LANE(d0, 0) 868 LOAD1_DATA32_LANE(d0, 1) 869 LOAD1_DATA32_LANE(d1, 0) 870 LOAD1_DATA32_LANE(d1, 1) 871 LOAD1_DATA32_LANE(d2, 0) 872 LOAD1_DATA32_LANE(d2, 1) 873 LOAD1_DATA32_LANE(d3, 0) 874 LOAD1_DATA32_LANE(d3, 1) 875 // clang-format on 876 "vst1.32 {q0, q1}, [%0]! \n" // store pixels 877 "subs %2, %2, #8 \n" // 8 processed per loop 878 "bgt 1b \n" 890 879 : "+r"(dst_argb), // %0 891 880 "+r"(src_argb), // %1 … … 901 890 #undef LOAD1_DATA32_LANE 902 891 903 // clang-format off904 892 // TODO(Yang Zhang): Investigate less load instructions for 905 893 // the x/dx stepping 906 #define LOAD2_DATA32_LANE(dn1, dn2, n) \ 907 "lsr %5, %3, #16 \n" \ 908 "add %6, %1, %5, lsl #2 \n" \ 909 "add %3, %3, %4 \n" \ 910 "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" 911 // clang-format on 894 #define LOAD2_DATA32_LANE(dn1, dn2, n) \ 895 "lsr %5, %3, #16 \n" \ 896 "add %6, %1, %5, lsl #2 \n" \ 897 "add %3, %3, %4 \n" \ 898 "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" 912 899 913 900 void ScaleARGBFilterCols_NEON(uint8* dst_argb, -
pjproject/trunk/third_party/yuv/source/scale_neon64.cc
r5633 r5699 27 27 int dst_width) { 28 28 (void)src_stride; 29 asm volatile 30 "1:\n"31 // load even pixels into v0, odd into v132 "ld2 {v0.16b,v1.16b}, [%0], #32 \n"33 "subs %w2, %w2, #16 \n" // 16 processed per loop34 "st1 {v1.16b}, [%1], #16 \n" // store odd pixels35 "b.gt 1b \n"36 : "+r"(src_ptr),// %037 "+r"(dst),// %138 "+r"(dst_width)// %239 :40 : "v0", "v1"// Clobber List41 );29 asm volatile( 30 "1: \n" 31 // load even pixels into v0, odd into v1 32 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" 33 "subs %w2, %w2, #16 \n" // 16 processed per loop 34 "st1 {v1.16b}, [%1], #16 \n" // store odd pixels 35 "b.gt 1b \n" 36 : "+r"(src_ptr), // %0 37 "+r"(dst), // %1 38 "+r"(dst_width) // %2 39 : 40 : "v0", "v1" // Clobber List 41 ); 42 42 } 43 43 … … 48 48 int dst_width) { 49 49 (void)src_stride; 50 asm volatile ( 51 "1: \n" 52 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc 53 "subs %w2, %w2, #16 \n" // 16 processed per loop 54 "uaddlp v0.8h, v0.16b \n" // add adjacent 55 "uaddlp v1.8h, v1.16b \n" 56 "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack 57 "rshrn2 v0.16b, v1.8h, #1 \n" 58 "st1 {v0.16b}, [%1], #16 \n" 59 "b.gt 1b \n" 60 : "+r"(src_ptr), // %0 61 "+r"(dst), // %1 62 "+r"(dst_width) // %2 63 : 64 : "v0", "v1" // Clobber List 65 ); 50 asm volatile( 51 "1: \n" 52 // load even pixels into v0, odd into v1 53 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" 54 "subs %w2, %w2, #16 \n" // 16 processed per loop 55 "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add 56 "st1 {v0.16b}, [%1], #16 \n" 57 "b.gt 1b \n" 58 : "+r"(src_ptr), // %0 59 "+r"(dst), // %1 60 "+r"(dst_width) // %2 61 : 62 : "v0", "v1" // Clobber List 63 ); 66 64 } 67 65 … … 71 69 uint8* dst, 72 70 int dst_width) { 73 asm volatile 74 // change the stride to row 2 pointer75 "add %1, %1, %0 \n"76 "1:\n"77 "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc78 "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc79 "subs %w3, %w3, #16 \n" // 16 processed per loop80 "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent81 "uaddlp v1.8h, v1.16b \n"82 "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row183 "uadalp v1.8h, v3.16b \n"84 "rshrn v0.8b, v0.8h, #2 \n" // downshift,round and pack85 "rshrn2 v0.16b, v1.8h, #2 \n"86 "st1 {v0.16b}, [%2], #16 \n"87 "b.gt 1b \n"88 : "+r"(src_ptr),// %089 "+r"(src_stride),// %190 "+r"(dst),// %291 "+r"(dst_width)// %392 :93 : "v0", "v1", "v2", "v3"// Clobber List94 );71 asm volatile( 72 // change the stride to row 2 pointer 73 "add %1, %1, %0 \n" 74 "1: \n" 75 "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc 76 "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc 77 "subs %w3, %w3, #16 \n" // 16 processed per loop 78 "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent 79 "uaddlp v1.8h, v1.16b \n" 80 "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent 81 "uadalp v1.8h, v3.16b \n" 82 "rshrn v0.8b, v0.8h, #2 \n" // round and pack 83 "rshrn2 v0.16b, v1.8h, #2 \n" 84 "st1 {v0.16b}, [%2], #16 \n" 85 "b.gt 1b \n" 86 : "+r"(src_ptr), // %0 87 "+r"(src_stride), // %1 88 "+r"(dst), // %2 89 "+r"(dst_width) // %3 90 : 91 : "v0", "v1", "v2", "v3" // Clobber List 92 ); 95 93 } 96 94 … … 100 98 int dst_width) { 101 99 (void)src_stride; 102 asm volatile ( 103 "1: \n" 104 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 105 "subs %w2, %w2, #8 \n" // 8 processed per loop 106 "st1 {v2.8b}, [%1], #8 \n" 107 "b.gt 1b \n" 108 : "+r"(src_ptr), // %0 109 "+r"(dst_ptr), // %1 110 "+r"(dst_width) // %2 111 : 112 : "v0", "v1", "v2", "v3", "memory", "cc" 113 ); 100 asm 
volatile( 101 "1: \n" 102 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 103 "subs %w2, %w2, #8 \n" // 8 processed per loop 104 "st1 {v2.8b}, [%1], #8 \n" 105 "b.gt 1b \n" 106 : "+r"(src_ptr), // %0 107 "+r"(dst_ptr), // %1 108 "+r"(dst_width) // %2 109 : 110 : "v0", "v1", "v2", "v3", "memory", "cc"); 114 111 } 115 112 … … 121 118 const uint8* src_ptr2 = src_ptr + src_stride * 2; 122 119 const uint8* src_ptr3 = src_ptr + src_stride * 3; 123 asm volatile ( 124 "1: \n" 125 "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 126 "ld1 {v1.16b}, [%2], #16 \n" 127 "ld1 {v2.16b}, [%3], #16 \n" 128 "ld1 {v3.16b}, [%4], #16 \n" 129 "subs %w5, %w5, #4 \n" 130 "uaddlp v0.8h, v0.16b \n" 131 "uadalp v0.8h, v1.16b \n" 132 "uadalp v0.8h, v2.16b \n" 133 "uadalp v0.8h, v3.16b \n" 134 "addp v0.8h, v0.8h, v0.8h \n" 135 "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding 136 "st1 {v0.s}[0], [%1], #4 \n" 137 "b.gt 1b \n" 138 : "+r"(src_ptr), // %0 139 "+r"(dst_ptr), // %1 140 "+r"(src_ptr1), // %2 141 "+r"(src_ptr2), // %3 142 "+r"(src_ptr3), // %4 143 "+r"(dst_width) // %5 144 : 145 : "v0", "v1", "v2", "v3", "memory", "cc" 146 ); 120 asm volatile( 121 "1: \n" 122 "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 123 "ld1 {v1.16b}, [%2], #16 \n" 124 "ld1 {v2.16b}, [%3], #16 \n" 125 "ld1 {v3.16b}, [%4], #16 \n" 126 "subs %w5, %w5, #4 \n" 127 "uaddlp v0.8h, v0.16b \n" 128 "uadalp v0.8h, v1.16b \n" 129 "uadalp v0.8h, v2.16b \n" 130 "uadalp v0.8h, v3.16b \n" 131 "addp v0.8h, v0.8h, v0.8h \n" 132 "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding 133 "st1 {v0.s}[0], [%1], #4 \n" 134 "b.gt 1b \n" 135 : "+r"(src_ptr), // %0 136 "+r"(dst_ptr), // %1 137 "+r"(src_ptr1), // %2 138 "+r"(src_ptr2), // %3 139 "+r"(src_ptr3), // %4 140 "+r"(dst_width) // %5 141 : 142 : "v0", "v1", "v2", "v3", "memory", "cc"); 147 143 } 148 144 … … 155 151 int dst_width) { 156 152 (void)src_stride; 157 asm volatile ( 158 "1: \n" 159 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 160 "subs %w2, %w2, #24 \n" 161 "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 162 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 163 "b.gt 1b \n" 164 : "+r"(src_ptr), // %0 165 "+r"(dst_ptr), // %1 166 "+r"(dst_width) // %2 167 : 168 : "v0", "v1", "v2", "v3", "memory", "cc" 169 ); 153 asm volatile( 154 "1: \n" 155 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 156 "subs %w2, %w2, #24 \n" 157 "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 158 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 159 "b.gt 1b \n" 160 : "+r"(src_ptr), // %0 161 "+r"(dst_ptr), // %1 162 "+r"(dst_width) // %2 163 : 164 : "v0", "v1", "v2", "v3", "memory", "cc"); 170 165 } 171 166 … … 174 169 uint8* dst_ptr, 175 170 int dst_width) { 176 asm volatile ( 177 "movi v20.8b, #3 \n" 178 "add %3, %3, %0 \n" 179 "1: \n" 180 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 181 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 182 "subs %w2, %w2, #24 \n" 183 184 // filter src line 0 with src line 1 185 // expand chars to shorts to allow for room 186 // when adding lines together 187 "ushll v16.8h, v4.8b, #0 \n" 188 "ushll v17.8h, v5.8b, #0 \n" 189 "ushll v18.8h, v6.8b, #0 \n" 190 "ushll v19.8h, v7.8b, #0 \n" 191 192 // 3 * line_0 + line_1 193 "umlal v16.8h, v0.8b, v20.8b \n" 194 "umlal v17.8h, v1.8b, v20.8b \n" 195 "umlal v18.8h, v2.8b, v20.8b \n" 196 "umlal v19.8h, v3.8b, v20.8b \n" 197 198 // (3 * line_0 + line_1) >> 2 199 "uqrshrn v0.8b, v16.8h, #2 \n" 200 "uqrshrn v1.8b, v17.8h, #2 \n" 201 "uqrshrn v2.8b, v18.8h, #2 \n" 202 "uqrshrn v3.8b, v19.8h, #2 \n" 
203 204 // a0 = (src[0] * 3 + s[1] * 1) >> 2 205 "ushll v16.8h, v1.8b, #0 \n" 206 "umlal v16.8h, v0.8b, v20.8b \n" 207 "uqrshrn v0.8b, v16.8h, #2 \n" 208 209 // a1 = (src[1] * 1 + s[2] * 1) >> 1 210 "urhadd v1.8b, v1.8b, v2.8b \n" 211 212 // a2 = (src[2] * 1 + s[3] * 3) >> 2 213 "ushll v16.8h, v2.8b, #0 \n" 214 "umlal v16.8h, v3.8b, v20.8b \n" 215 "uqrshrn v2.8b, v16.8h, #2 \n" 216 217 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 218 219 "b.gt 1b \n" 220 : "+r"(src_ptr), // %0 221 "+r"(dst_ptr), // %1 222 "+r"(dst_width), // %2 223 "+r"(src_stride) // %3 224 : 225 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", 226 "v20", "memory", "cc" 227 ); 171 asm volatile( 172 "movi v20.8b, #3 \n" 173 "add %3, %3, %0 \n" 174 "1: \n" 175 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 176 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 177 "subs %w2, %w2, #24 \n" 178 179 // filter src line 0 with src line 1 180 // expand chars to shorts to allow for room 181 // when adding lines together 182 "ushll v16.8h, v4.8b, #0 \n" 183 "ushll v17.8h, v5.8b, #0 \n" 184 "ushll v18.8h, v6.8b, #0 \n" 185 "ushll v19.8h, v7.8b, #0 \n" 186 187 // 3 * line_0 + line_1 188 "umlal v16.8h, v0.8b, v20.8b \n" 189 "umlal v17.8h, v1.8b, v20.8b \n" 190 "umlal v18.8h, v2.8b, v20.8b \n" 191 "umlal v19.8h, v3.8b, v20.8b \n" 192 193 // (3 * line_0 + line_1) >> 2 194 "uqrshrn v0.8b, v16.8h, #2 \n" 195 "uqrshrn v1.8b, v17.8h, #2 \n" 196 "uqrshrn v2.8b, v18.8h, #2 \n" 197 "uqrshrn v3.8b, v19.8h, #2 \n" 198 199 // a0 = (src[0] * 3 + s[1] * 1) >> 2 200 "ushll v16.8h, v1.8b, #0 \n" 201 "umlal v16.8h, v0.8b, v20.8b \n" 202 "uqrshrn v0.8b, v16.8h, #2 \n" 203 204 // a1 = (src[1] * 1 + s[2] * 1) >> 1 205 "urhadd v1.8b, v1.8b, v2.8b \n" 206 207 // a2 = (src[2] * 1 + s[3] * 3) >> 2 208 "ushll v16.8h, v2.8b, #0 \n" 209 "umlal v16.8h, v3.8b, v20.8b \n" 210 "uqrshrn v2.8b, v16.8h, #2 \n" 211 212 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 213 214 "b.gt 1b \n" 215 : "+r"(src_ptr), // %0 216 "+r"(dst_ptr), // %1 217 "+r"(dst_width), // %2 218 "+r"(src_stride) // %3 219 : 220 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", 221 "v19", "v20", "memory", "cc"); 228 222 } 229 223 … … 232 226 uint8* dst_ptr, 233 227 int dst_width) { 234 asm volatile ( 235 "movi v20.8b, #3 \n" 236 "add %3, %3, %0 \n" 237 "1: \n" 238 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 239 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 240 "subs %w2, %w2, #24 \n" 241 // average src line 0 with src line 1 242 "urhadd v0.8b, v0.8b, v4.8b \n" 243 "urhadd v1.8b, v1.8b, v5.8b \n" 244 "urhadd v2.8b, v2.8b, v6.8b \n" 245 "urhadd v3.8b, v3.8b, v7.8b \n" 246 247 // a0 = (src[0] * 3 + s[1] * 1) >> 2 248 "ushll v4.8h, v1.8b, #0 \n" 249 "umlal v4.8h, v0.8b, v20.8b \n" 250 "uqrshrn v0.8b, v4.8h, #2 \n" 251 252 // a1 = (src[1] * 1 + s[2] * 1) >> 1 253 "urhadd v1.8b, v1.8b, v2.8b \n" 254 255 // a2 = (src[2] * 1 + s[3] * 3) >> 2 256 "ushll v4.8h, v2.8b, #0 \n" 257 "umlal v4.8h, v3.8b, v20.8b \n" 258 "uqrshrn v2.8b, v4.8h, #2 \n" 259 260 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 261 "b.gt 1b \n" 262 : "+r"(src_ptr), // %0 263 "+r"(dst_ptr), // %1 264 "+r"(dst_width), // %2 265 "+r"(src_stride) // %3 266 : 267 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc" 268 ); 228 asm volatile( 229 "movi v20.8b, #3 \n" 230 "add %3, %3, %0 \n" 231 "1: \n" 232 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 233 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 234 "subs %w2, %w2, 
#24 \n" 235 // average src line 0 with src line 1 236 "urhadd v0.8b, v0.8b, v4.8b \n" 237 "urhadd v1.8b, v1.8b, v5.8b \n" 238 "urhadd v2.8b, v2.8b, v6.8b \n" 239 "urhadd v3.8b, v3.8b, v7.8b \n" 240 241 // a0 = (src[0] * 3 + s[1] * 1) >> 2 242 "ushll v4.8h, v1.8b, #0 \n" 243 "umlal v4.8h, v0.8b, v20.8b \n" 244 "uqrshrn v0.8b, v4.8h, #2 \n" 245 246 // a1 = (src[1] * 1 + s[2] * 1) >> 1 247 "urhadd v1.8b, v1.8b, v2.8b \n" 248 249 // a2 = (src[2] * 1 + s[3] * 3) >> 2 250 "ushll v4.8h, v2.8b, #0 \n" 251 "umlal v4.8h, v3.8b, v20.8b \n" 252 "uqrshrn v2.8b, v4.8h, #2 \n" 253 254 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 255 "b.gt 1b \n" 256 : "+r"(src_ptr), // %0 257 "+r"(dst_ptr), // %1 258 "+r"(dst_width), // %2 259 "+r"(src_stride) // %3 260 : 261 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"); 269 262 } 270 263 … … 283 276 int dst_width) { 284 277 (void)src_stride; 285 asm volatile ( 286 "ld1 {v3.16b}, [%3] \n" 287 "1: \n" 288 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" 289 "subs %w2, %w2, #12 \n" 290 "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" 291 "st1 {v2.8b}, [%1], #8 \n" 292 "st1 {v2.s}[2], [%1], #4 \n" 293 "b.gt 1b \n" 294 : "+r"(src_ptr), // %0 295 "+r"(dst_ptr), // %1 296 "+r"(dst_width) // %2 297 : "r"(&kShuf38) // %3 298 : "v0", "v1", "v2", "v3", "memory", "cc" 299 ); 278 asm volatile( 279 "ld1 {v3.16b}, [%3] \n" 280 "1: \n" 281 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" 282 "subs %w2, %w2, #12 \n" 283 "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" 284 "st1 {v2.8b}, [%1], #8 \n" 285 "st1 {v2.s}[2], [%1], #4 \n" 286 "b.gt 1b \n" 287 : "+r"(src_ptr), // %0 288 "+r"(dst_ptr), // %1 289 "+r"(dst_width) // %2 290 : "r"(&kShuf38) // %3 291 : "v0", "v1", "v2", "v3", "memory", "cc"); 300 292 } 301 293 … … 308 300 ptrdiff_t tmp_src_stride = src_stride; 309 301 310 asm volatile ( 311 "ld1 {v29.8h}, [%5] \n" 312 "ld1 {v30.16b}, [%6] \n" 313 "ld1 {v31.8h}, [%7] \n" 314 "add %2, %2, %0 \n" 315 "1: \n" 316 317 // 00 40 01 41 02 42 03 43 318 // 10 50 11 51 12 52 13 53 319 // 20 60 21 61 22 62 23 63 320 // 30 70 31 71 32 72 33 73 321 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" 322 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" 323 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" 324 "subs %w4, %w4, #12 \n" 325 326 // Shuffle the input data around to get align the data 327 // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 328 // 00 10 01 11 02 12 03 13 329 // 40 50 41 51 42 52 43 53 330 "trn1 v20.8b, v0.8b, v1.8b \n" 331 "trn2 v21.8b, v0.8b, v1.8b \n" 332 "trn1 v22.8b, v4.8b, v5.8b \n" 333 "trn2 v23.8b, v4.8b, v5.8b \n" 334 "trn1 v24.8b, v16.8b, v17.8b \n" 335 "trn2 v25.8b, v16.8b, v17.8b \n" 336 337 // 20 30 21 31 22 32 23 33 338 // 60 70 61 71 62 72 63 73 339 "trn1 v0.8b, v2.8b, v3.8b \n" 340 "trn2 v1.8b, v2.8b, v3.8b \n" 341 "trn1 v4.8b, v6.8b, v7.8b \n" 342 "trn2 v5.8b, v6.8b, v7.8b \n" 343 "trn1 v16.8b, v18.8b, v19.8b \n" 344 "trn2 v17.8b, v18.8b, v19.8b \n" 345 346 // 00+10 01+11 02+12 03+13 347 // 40+50 41+51 42+52 43+53 348 "uaddlp v20.4h, v20.8b \n" 349 "uaddlp v21.4h, v21.8b \n" 350 "uaddlp v22.4h, v22.8b \n" 351 "uaddlp v23.4h, v23.8b \n" 352 "uaddlp v24.4h, v24.8b \n" 353 "uaddlp v25.4h, v25.8b \n" 354 355 // 60+70 61+71 62+72 63+73 356 "uaddlp v1.4h, v1.8b \n" 357 "uaddlp v5.4h, v5.8b \n" 358 "uaddlp v17.4h, v17.8b \n" 359 360 // combine source lines 361 "add v20.4h, v20.4h, v22.4h \n" 362 "add v21.4h, v21.4h, v23.4h \n" 363 "add v20.4h, v20.4h, v24.4h \n" 364 "add v21.4h, v21.4h, v25.4h \n" 365 "add v2.4h, v1.4h, v5.4h \n" 366 "add v2.4h, v2.4h, v17.4h \n" 367 368 // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] 369 // + s[6 + st * 1] + s[7 + st * 1] 370 // + s[6 + st * 2] + s[7 + st * 2]) / 6 371 "sqrdmulh v2.8h, v2.8h, v29.8h \n" 372 "xtn v2.8b, v2.8h \n" 373 374 // Shuffle 2,3 reg around so that 2 can be added to the 375 // 0,1 reg and 3 can be added to the 4,5 reg. This 376 // requires expanding from u8 to u16 as the 0,1 and 4,5 377 // registers are already expanded. Then do transposes 378 // to get aligned. 379 // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 380 "ushll v16.8h, v16.8b, #0 \n" 381 "uaddl v0.8h, v0.8b, v4.8b \n" 382 383 // combine source lines 384 "add v0.8h, v0.8h, v16.8h \n" 385 386 // xx 20 xx 21 xx 22 xx 23 387 // xx 30 xx 31 xx 32 xx 33 388 "trn1 v1.8h, v0.8h, v0.8h \n" 389 "trn2 v4.8h, v0.8h, v0.8h \n" 390 "xtn v0.4h, v1.4s \n" 391 "xtn v4.4h, v4.4s \n" 392 393 // 0+1+2, 3+4+5 394 "add v20.8h, v20.8h, v0.8h \n" 395 "add v21.8h, v21.8h, v4.8h \n" 396 397 // Need to divide, but can't downshift as the the value 398 // isn't a power of 2. So multiply by 65536 / n 399 // and take the upper 16 bits. 400 "sqrdmulh v0.8h, v20.8h, v31.8h \n" 401 "sqrdmulh v1.8h, v21.8h, v31.8h \n" 402 403 // Align for table lookup, vtbl requires registers to 404 // be adjacent 405 "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" 406 407 "st1 {v3.8b}, [%1], #8 \n" 408 "st1 {v3.s}[2], [%1], #4 \n" 409 "b.gt 1b \n" 410 : "+r"(src_ptr), // %0 411 "+r"(dst_ptr), // %1 412 "+r"(tmp_src_stride), // %2 413 "+r"(src_ptr1), // %3 414 "+r"(dst_width) // %4 415 : "r"(&kMult38_Div6), // %5 416 "r"(&kShuf38_2), // %6 417 "r"(&kMult38_Div9) // %7 418 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", 419 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", 420 "v30", "v31", "memory", "cc" 421 ); 302 asm volatile( 303 "ld1 {v29.8h}, [%5] \n" 304 "ld1 {v30.16b}, [%6] \n" 305 "ld1 {v31.8h}, [%7] \n" 306 "add %2, %2, %0 \n" 307 "1: \n" 308 309 // 00 40 01 41 02 42 03 43 310 // 10 50 11 51 12 52 13 53 311 // 20 60 21 61 22 62 23 63 312 // 30 70 31 71 32 72 33 73 313 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" 314 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" 315 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" 316 "subs %w4, %w4, #12 \n" 317 318 // Shuffle the input data around to get align the data 319 // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 320 // 00 10 01 11 02 12 03 13 321 // 40 50 41 51 42 52 43 53 322 "trn1 v20.8b, v0.8b, v1.8b \n" 323 "trn2 v21.8b, v0.8b, v1.8b \n" 324 "trn1 v22.8b, v4.8b, v5.8b \n" 325 "trn2 v23.8b, v4.8b, v5.8b \n" 326 "trn1 v24.8b, v16.8b, v17.8b \n" 327 "trn2 v25.8b, v16.8b, v17.8b \n" 328 329 // 20 30 21 31 22 32 23 33 330 // 60 70 61 71 62 72 63 73 331 "trn1 v0.8b, v2.8b, v3.8b \n" 332 "trn2 v1.8b, v2.8b, v3.8b \n" 333 "trn1 v4.8b, v6.8b, v7.8b \n" 334 "trn2 v5.8b, v6.8b, v7.8b \n" 335 "trn1 v16.8b, v18.8b, v19.8b \n" 336 "trn2 v17.8b, v18.8b, v19.8b \n" 337 338 // 00+10 01+11 02+12 03+13 339 // 40+50 41+51 42+52 43+53 340 "uaddlp v20.4h, v20.8b \n" 341 "uaddlp v21.4h, v21.8b \n" 342 "uaddlp v22.4h, v22.8b \n" 343 "uaddlp v23.4h, v23.8b \n" 344 "uaddlp v24.4h, v24.8b \n" 345 "uaddlp v25.4h, v25.8b \n" 346 347 // 60+70 61+71 62+72 63+73 348 "uaddlp v1.4h, v1.8b \n" 349 "uaddlp v5.4h, v5.8b \n" 350 "uaddlp v17.4h, v17.8b \n" 351 352 // combine source lines 353 "add v20.4h, v20.4h, v22.4h \n" 354 "add v21.4h, v21.4h, v23.4h \n" 355 "add v20.4h, v20.4h, v24.4h \n" 356 "add v21.4h, v21.4h, v25.4h \n" 357 "add v2.4h, v1.4h, v5.4h \n" 358 "add v2.4h, v2.4h, v17.4h \n" 359 360 // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] 361 // + s[6 + st * 1] + s[7 + st * 1] 362 // + s[6 + st * 2] + s[7 + st * 2]) / 6 363 "sqrdmulh v2.8h, v2.8h, v29.8h \n" 364 "xtn v2.8b, v2.8h \n" 365 366 // Shuffle 2,3 reg around so that 2 can be added to the 367 // 0,1 reg and 3 can be added to the 4,5 reg. This 368 // requires expanding from u8 to u16 as the 0,1 and 4,5 369 // registers are already expanded. Then do transposes 370 // to get aligned. 371 // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 372 "ushll v16.8h, v16.8b, #0 \n" 373 "uaddl v0.8h, v0.8b, v4.8b \n" 374 375 // combine source lines 376 "add v0.8h, v0.8h, v16.8h \n" 377 378 // xx 20 xx 21 xx 22 xx 23 379 // xx 30 xx 31 xx 32 xx 33 380 "trn1 v1.8h, v0.8h, v0.8h \n" 381 "trn2 v4.8h, v0.8h, v0.8h \n" 382 "xtn v0.4h, v1.4s \n" 383 "xtn v4.4h, v4.4s \n" 384 385 // 0+1+2, 3+4+5 386 "add v20.8h, v20.8h, v0.8h \n" 387 "add v21.8h, v21.8h, v4.8h \n" 388 389 // Need to divide, but can't downshift as the the value 390 // isn't a power of 2. So multiply by 65536 / n 391 // and take the upper 16 bits. 392 "sqrdmulh v0.8h, v20.8h, v31.8h \n" 393 "sqrdmulh v1.8h, v21.8h, v31.8h \n" 394 395 // Align for table lookup, vtbl requires registers to be adjacent 396 "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" 397 398 "st1 {v3.8b}, [%1], #8 \n" 399 "st1 {v3.s}[2], [%1], #4 \n" 400 "b.gt 1b \n" 401 : "+r"(src_ptr), // %0 402 "+r"(dst_ptr), // %1 403 "+r"(tmp_src_stride), // %2 404 "+r"(src_ptr1), // %3 405 "+r"(dst_width) // %4 406 : "r"(&kMult38_Div6), // %5 407 "r"(&kShuf38_2), // %6 408 "r"(&kMult38_Div9) // %7 409 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", 410 "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31", 411 "memory", "cc"); 422 412 } 423 413 … … 429 419 // TODO(fbarchard): use src_stride directly for clang 3.5+. 
430 420 ptrdiff_t tmp_src_stride = src_stride; 431 asm volatile ( 432 "ld1 {v30.8h}, [%4] \n" 433 "ld1 {v31.16b}, [%5] \n" 434 "add %2, %2, %0 \n" 435 "1: \n" 436 437 // 00 40 01 41 02 42 03 43 438 // 10 50 11 51 12 52 13 53 439 // 20 60 21 61 22 62 23 63 440 // 30 70 31 71 32 72 33 73 441 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" 442 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" 443 "subs %w3, %w3, #12 \n" 444 445 // Shuffle the input data around to get align the data 446 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 447 // 00 10 01 11 02 12 03 13 448 // 40 50 41 51 42 52 43 53 449 "trn1 v16.8b, v0.8b, v1.8b \n" 450 "trn2 v17.8b, v0.8b, v1.8b \n" 451 "trn1 v18.8b, v4.8b, v5.8b \n" 452 "trn2 v19.8b, v4.8b, v5.8b \n" 453 454 // 20 30 21 31 22 32 23 33 455 // 60 70 61 71 62 72 63 73 456 "trn1 v0.8b, v2.8b, v3.8b \n" 457 "trn2 v1.8b, v2.8b, v3.8b \n" 458 "trn1 v4.8b, v6.8b, v7.8b \n" 459 "trn2 v5.8b, v6.8b, v7.8b \n" 460 461 // 00+10 01+11 02+12 03+13 462 // 40+50 41+51 42+52 43+53 463 "uaddlp v16.4h, v16.8b \n" 464 "uaddlp v17.4h, v17.8b \n" 465 "uaddlp v18.4h, v18.8b \n" 466 "uaddlp v19.4h, v19.8b \n" 467 468 // 60+70 61+71 62+72 63+73 469 "uaddlp v1.4h, v1.8b \n" 470 "uaddlp v5.4h, v5.8b \n" 471 472 // combine source lines 473 "add v16.4h, v16.4h, v18.4h \n" 474 "add v17.4h, v17.4h, v19.4h \n" 475 "add v2.4h, v1.4h, v5.4h \n" 476 477 // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 478 "uqrshrn v2.8b, v2.8h, #2 \n" 479 480 // Shuffle 2,3 reg around so that 2 can be added to the 481 // 0,1 reg and 3 can be added to the 4,5 reg. This 482 // requires expanding from u8 to u16 as the 0,1 and 4,5 483 // registers are already expanded. Then do transposes 484 // to get aligned. 485 // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 486 487 // combine source lines 488 "uaddl v0.8h, v0.8b, v4.8b \n" 489 490 // xx 20 xx 21 xx 22 xx 23 491 // xx 30 xx 31 xx 32 xx 33 492 "trn1 v1.8h, v0.8h, v0.8h \n" 493 "trn2 v4.8h, v0.8h, v0.8h \n" 494 "xtn v0.4h, v1.4s \n" 495 "xtn v4.4h, v4.4s \n" 496 497 // 0+1+2, 3+4+5 498 "add v16.8h, v16.8h, v0.8h \n" 499 "add v17.8h, v17.8h, v4.8h \n" 500 501 // Need to divide, but can't downshift as the the value 502 // isn't a power of 2. So multiply by 65536 / n 503 // and take the upper 16 bits. 504 "sqrdmulh v0.8h, v16.8h, v30.8h \n" 505 "sqrdmulh v1.8h, v17.8h, v30.8h \n" 506 507 // Align for table lookup, vtbl requires registers to 508 // be adjacent 509 510 "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" 511 512 "st1 {v3.8b}, [%1], #8 \n" 513 "st1 {v3.s}[2], [%1], #4 \n" 514 "b.gt 1b \n" 515 : "+r"(src_ptr), // %0 516 "+r"(dst_ptr), // %1 517 "+r"(tmp_src_stride), // %2 518 "+r"(dst_width) // %3 519 : "r"(&kMult38_Div6), // %4 520 "r"(&kShuf38_2) // %5 521 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", 522 "v18", "v19", "v30", "v31", "memory", "cc" 523 ); 421 asm volatile( 422 "ld1 {v30.8h}, [%4] \n" 423 "ld1 {v31.16b}, [%5] \n" 424 "add %2, %2, %0 \n" 425 "1: \n" 426 427 // 00 40 01 41 02 42 03 43 428 // 10 50 11 51 12 52 13 53 429 // 20 60 21 61 22 62 23 63 430 // 30 70 31 71 32 72 33 73 431 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" 432 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" 433 "subs %w3, %w3, #12 \n" 434 435 // Shuffle the input data around to get align the data 436 // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 437 // 00 10 01 11 02 12 03 13 438 // 40 50 41 51 42 52 43 53 439 "trn1 v16.8b, v0.8b, v1.8b \n" 440 "trn2 v17.8b, v0.8b, v1.8b \n" 441 "trn1 v18.8b, v4.8b, v5.8b \n" 442 "trn2 v19.8b, v4.8b, v5.8b \n" 443 444 // 20 30 21 31 22 32 23 33 445 // 60 70 61 71 62 72 63 73 446 "trn1 v0.8b, v2.8b, v3.8b \n" 447 "trn2 v1.8b, v2.8b, v3.8b \n" 448 "trn1 v4.8b, v6.8b, v7.8b \n" 449 "trn2 v5.8b, v6.8b, v7.8b \n" 450 451 // 00+10 01+11 02+12 03+13 452 // 40+50 41+51 42+52 43+53 453 "uaddlp v16.4h, v16.8b \n" 454 "uaddlp v17.4h, v17.8b \n" 455 "uaddlp v18.4h, v18.8b \n" 456 "uaddlp v19.4h, v19.8b \n" 457 458 // 60+70 61+71 62+72 63+73 459 "uaddlp v1.4h, v1.8b \n" 460 "uaddlp v5.4h, v5.8b \n" 461 462 // combine source lines 463 "add v16.4h, v16.4h, v18.4h \n" 464 "add v17.4h, v17.4h, v19.4h \n" 465 "add v2.4h, v1.4h, v5.4h \n" 466 467 // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 468 "uqrshrn v2.8b, v2.8h, #2 \n" 469 470 // Shuffle 2,3 reg around so that 2 can be added to the 471 // 0,1 reg and 3 can be added to the 4,5 reg. This 472 // requires expanding from u8 to u16 as the 0,1 and 4,5 473 // registers are already expanded. Then do transposes 474 // to get aligned. 475 // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 476 477 // combine source lines 478 "uaddl v0.8h, v0.8b, v4.8b \n" 479 480 // xx 20 xx 21 xx 22 xx 23 481 // xx 30 xx 31 xx 32 xx 33 482 "trn1 v1.8h, v0.8h, v0.8h \n" 483 "trn2 v4.8h, v0.8h, v0.8h \n" 484 "xtn v0.4h, v1.4s \n" 485 "xtn v4.4h, v4.4s \n" 486 487 // 0+1+2, 3+4+5 488 "add v16.8h, v16.8h, v0.8h \n" 489 "add v17.8h, v17.8h, v4.8h \n" 490 491 // Need to divide, but can't downshift as the the value 492 // isn't a power of 2. So multiply by 65536 / n 493 // and take the upper 16 bits. 494 "sqrdmulh v0.8h, v16.8h, v30.8h \n" 495 "sqrdmulh v1.8h, v17.8h, v30.8h \n" 496 497 // Align for table lookup, vtbl requires registers to 498 // be adjacent 499 500 "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" 501 502 "st1 {v3.8b}, [%1], #8 \n" 503 "st1 {v3.s}[2], [%1], #4 \n" 504 "b.gt 1b \n" 505 : "+r"(src_ptr), // %0 506 "+r"(dst_ptr), // %1 507 "+r"(tmp_src_stride), // %2 508 "+r"(dst_width) // %3 509 : "r"(&kMult38_Div6), // %4 510 "r"(&kShuf38_2) // %5 511 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", 512 "v19", "v30", "v31", "memory", "cc"); 524 513 } 525 514 … … 530 519 int src_height) { 531 520 const uint8* src_tmp; 532 asm volatile ( 533 "1: \n" 534 "mov %0, %1 \n" 535 "mov w12, %w5 \n" 536 "eor v2.16b, v2.16b, v2.16b \n" 537 "eor v3.16b, v3.16b, v3.16b \n" 538 "2: \n" 539 // load 16 pixels into q0 540 "ld1 {v0.16b}, [%0], %3 \n" 541 "uaddw2 v3.8h, v3.8h, v0.16b \n" 542 "uaddw v2.8h, v2.8h, v0.8b \n" 543 "subs w12, w12, #1 \n" 544 "b.gt 2b \n" 545 "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels 546 "add %1, %1, #16 \n" 547 "subs %w4, %w4, #16 \n" // 16 processed per loop 548 "b.gt 1b \n" 549 : "=&r"(src_tmp), // %0 550 "+r"(src_ptr), // %1 551 "+r"(dst_ptr), // %2 552 "+r"(src_stride), // %3 553 "+r"(src_width), // %4 554 "+r"(src_height) // %5 555 : 556 : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List 557 ); 558 } 559 560 // clang-format off 521 asm volatile( 522 "1: \n" 523 "mov %0, %1 \n" 524 "mov w12, %w5 \n" 525 "eor v2.16b, v2.16b, v2.16b \n" 526 "eor v3.16b, v3.16b, v3.16b \n" 527 "2: \n" 528 // load 16 pixels into q0 529 "ld1 {v0.16b}, [%0], %3 \n" 530 "uaddw2 v3.8h, v3.8h, v0.16b \n" 531 "uaddw v2.8h, v2.8h, v0.8b \n" 532 "subs w12, w12, #1 \n" 533 "b.gt 2b \n" 534 "st1 {v2.8h, v3.8h}, [%2], #32 
\n" // store pixels 535 "add %1, %1, #16 \n" 536 "subs %w4, %w4, #16 \n" // 16 processed per loop 537 "b.gt 1b \n" 538 : "=&r"(src_tmp), // %0 539 "+r"(src_ptr), // %1 540 "+r"(dst_ptr), // %2 541 "+r"(src_stride), // %3 542 "+r"(src_width), // %4 543 "+r"(src_height) // %5 544 : 545 : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List 546 ); 547 } 548 561 549 // TODO(Yang Zhang): Investigate less load instructions for 562 550 // the x/dx stepping 563 #define LOAD2_DATA8_LANE(n) 564 "lsr %5, %3, #16 \n" 565 "add %6, %1, %5 \n" 566 "add %3, %3, %4 \n" 551 #define LOAD2_DATA8_LANE(n) \ 552 "lsr %5, %3, #16 \n" \ 553 "add %6, %1, %5 \n" \ 554 "add %3, %3, %4 \n" \ 567 555 "ld2 {v4.b, v5.b}[" #n "], [%6] \n" 568 // clang-format on569 556 570 557 // The NEON version mimics this formula (from row_common.cc): … … 580 567 int* tmp = dx_offset; 581 568 const uint8* src_tmp = src_ptr; 582 int64 x64 = (int64)x; 583 int64 dx64 = (int64)dx; 569 int64 x64 = (int64)x; // NOLINT 570 int64 dx64 = (int64)dx; // NOLINT 584 571 asm volatile ( 585 572 "dup v0.4s, %w3 \n" // x … … 645 632 int source_y_fraction) { 646 633 int y_fraction = 256 - source_y_fraction; 647 asm volatile ( 648 "cmp %w4, #0 \n" 649 "b.eq 100f \n" 650 "add %2, %2, %1 \n" 651 "cmp %w4, #64 \n" 652 "b.eq 75f \n" 653 "cmp %w4, #128 \n" 654 "b.eq 50f \n" 655 "cmp %w4, #192 \n" 656 "b.eq 25f \n" 657 658 "dup v5.8b, %w4 \n" 659 "dup v4.8b, %w5 \n" 660 // General purpose row blend. 661 "1: \n" 662 "ld1 {v0.16b}, [%1], #16 \n" 663 "ld1 {v1.16b}, [%2], #16 \n" 664 "subs %w3, %w3, #16 \n" 665 "umull v6.8h, v0.8b, v4.8b \n" 666 "umull2 v7.8h, v0.16b, v4.16b \n" 667 "umlal v6.8h, v1.8b, v5.8b \n" 668 "umlal2 v7.8h, v1.16b, v5.16b \n" 669 "rshrn v0.8b, v6.8h, #8 \n" 670 "rshrn2 v0.16b, v7.8h, #8 \n" 671 "st1 {v0.16b}, [%0], #16 \n" 672 "b.gt 1b \n" 673 "b 99f \n" 674 675 // Blend 25 / 75. 676 "25: \n" 677 "ld1 {v0.16b}, [%1], #16 \n" 678 "ld1 {v1.16b}, [%2], #16 \n" 679 "subs %w3, %w3, #16 \n" 680 "urhadd v0.16b, v0.16b, v1.16b \n" 681 "urhadd v0.16b, v0.16b, v1.16b \n" 682 "st1 {v0.16b}, [%0], #16 \n" 683 "b.gt 25b \n" 684 "b 99f \n" 685 686 // Blend 50 / 50. 687 "50: \n" 688 "ld1 {v0.16b}, [%1], #16 \n" 689 "ld1 {v1.16b}, [%2], #16 \n" 690 "subs %w3, %w3, #16 \n" 691 "urhadd v0.16b, v0.16b, v1.16b \n" 692 "st1 {v0.16b}, [%0], #16 \n" 693 "b.gt 50b \n" 694 "b 99f \n" 695 696 // Blend 75 / 25. 697 "75: \n" 698 "ld1 {v1.16b}, [%1], #16 \n" 699 "ld1 {v0.16b}, [%2], #16 \n" 700 "subs %w3, %w3, #16 \n" 701 "urhadd v0.16b, v0.16b, v1.16b \n" 702 "urhadd v0.16b, v0.16b, v1.16b \n" 703 "st1 {v0.16b}, [%0], #16 \n" 704 "b.gt 75b \n" 705 "b 99f \n" 706 707 // Blend 100 / 0 - Copy row unchanged. 708 "100: \n" 709 "ld1 {v0.16b}, [%1], #16 \n" 710 "subs %w3, %w3, #16 \n" 711 "st1 {v0.16b}, [%0], #16 \n" 712 "b.gt 100b \n" 713 714 "99: \n" 715 "st1 {v0.b}[15], [%0] \n" 716 : "+r"(dst_ptr), // %0 717 "+r"(src_ptr), // %1 718 "+r"(src_stride), // %2 719 "+r"(dst_width), // %3 720 "+r"(source_y_fraction),// %4 721 "+r"(y_fraction) // %5 722 : 723 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" 724 ); 634 asm volatile( 635 "cmp %w4, #0 \n" 636 "b.eq 100f \n" 637 "add %2, %2, %1 \n" 638 "cmp %w4, #64 \n" 639 "b.eq 75f \n" 640 "cmp %w4, #128 \n" 641 "b.eq 50f \n" 642 "cmp %w4, #192 \n" 643 "b.eq 25f \n" 644 645 "dup v5.8b, %w4 \n" 646 "dup v4.8b, %w5 \n" 647 // General purpose row blend. 
648 "1: \n" 649 "ld1 {v0.16b}, [%1], #16 \n" 650 "ld1 {v1.16b}, [%2], #16 \n" 651 "subs %w3, %w3, #16 \n" 652 "umull v6.8h, v0.8b, v4.8b \n" 653 "umull2 v7.8h, v0.16b, v4.16b \n" 654 "umlal v6.8h, v1.8b, v5.8b \n" 655 "umlal2 v7.8h, v1.16b, v5.16b \n" 656 "rshrn v0.8b, v6.8h, #8 \n" 657 "rshrn2 v0.16b, v7.8h, #8 \n" 658 "st1 {v0.16b}, [%0], #16 \n" 659 "b.gt 1b \n" 660 "b 99f \n" 661 662 // Blend 25 / 75. 663 "25: \n" 664 "ld1 {v0.16b}, [%1], #16 \n" 665 "ld1 {v1.16b}, [%2], #16 \n" 666 "subs %w3, %w3, #16 \n" 667 "urhadd v0.16b, v0.16b, v1.16b \n" 668 "urhadd v0.16b, v0.16b, v1.16b \n" 669 "st1 {v0.16b}, [%0], #16 \n" 670 "b.gt 25b \n" 671 "b 99f \n" 672 673 // Blend 50 / 50. 674 "50: \n" 675 "ld1 {v0.16b}, [%1], #16 \n" 676 "ld1 {v1.16b}, [%2], #16 \n" 677 "subs %w3, %w3, #16 \n" 678 "urhadd v0.16b, v0.16b, v1.16b \n" 679 "st1 {v0.16b}, [%0], #16 \n" 680 "b.gt 50b \n" 681 "b 99f \n" 682 683 // Blend 75 / 25. 684 "75: \n" 685 "ld1 {v1.16b}, [%1], #16 \n" 686 "ld1 {v0.16b}, [%2], #16 \n" 687 "subs %w3, %w3, #16 \n" 688 "urhadd v0.16b, v0.16b, v1.16b \n" 689 "urhadd v0.16b, v0.16b, v1.16b \n" 690 "st1 {v0.16b}, [%0], #16 \n" 691 "b.gt 75b \n" 692 "b 99f \n" 693 694 // Blend 100 / 0 - Copy row unchanged. 695 "100: \n" 696 "ld1 {v0.16b}, [%1], #16 \n" 697 "subs %w3, %w3, #16 \n" 698 "st1 {v0.16b}, [%0], #16 \n" 699 "b.gt 100b \n" 700 701 "99: \n" 702 "st1 {v0.b}[15], [%0] \n" 703 : "+r"(dst_ptr), // %0 704 "+r"(src_ptr), // %1 705 "+r"(src_stride), // %2 706 "+r"(dst_width), // %3 707 "+r"(source_y_fraction), // %4 708 "+r"(y_fraction) // %5 709 : 710 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); 725 711 } 726 712 … … 730 716 int dst_width) { 731 717 (void)src_stride; 732 asm volatile ( 733 "1: \n" 734 // load even pixels into q0, odd into q1 735 "ld2 {v0.4s, v1.4s}, [%0], #32 \n" 736 "ld2 {v2.4s, v3.4s}, [%0], #32 \n" 737 "subs %w2, %w2, #8 \n" // 8 processed per loop 738 "st1 {v1.16b}, [%1], #16 \n" // store odd pixels 739 "st1 {v3.16b}, [%1], #16 \n" 740 "b.gt 1b \n" 741 : "+r" (src_ptr), // %0 742 "+r" (dst), // %1 743 "+r" (dst_width) // %2 744 : 745 : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List 746 ); 718 asm volatile( 719 "1: \n" 720 // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 721 "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" 722 "subs %w2, %w2, #8 \n" // 8 processed per loop 723 "mov v2.16b, v3.16b \n" 724 "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels 725 "b.gt 1b \n" 726 : "+r"(src_ptr), // %0 727 "+r"(dst), // %1 728 "+r"(dst_width) // %2 729 : 730 : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List 731 ); 747 732 } 748 733 … … 752 737 int dst_width) { 753 738 (void)src_stride; 754 asm volatile ( 755 "1: \n" 756 // load 8 ARGB pixels. 757 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" 758 "subs %w2, %w2, #8 \n" // 8 processed per loop. 759 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 760 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 761 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 762 "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. 
763 "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack 764 "rshrn v1.8b, v1.8h, #1 \n" 765 "rshrn v2.8b, v2.8h, #1 \n" 766 "rshrn v3.8b, v3.8h, #1 \n" 767 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" 768 "b.gt 1b \n" 769 : "+r"(src_argb), // %0 770 "+r"(dst_argb), // %1 771 "+r"(dst_width) // %2 772 : 773 : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List 774 ); 739 asm volatile( 740 "1: \n" 741 // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 742 "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" 743 "subs %w2, %w2, #8 \n" // 8 processed per loop 744 745 "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add 746 "urhadd v1.16b, v2.16b, v3.16b \n" 747 "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels 748 "b.gt 1b \n" 749 : "+r"(src_argb), // %0 750 "+r"(dst_argb), // %1 751 "+r"(dst_width) // %2 752 : 753 : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List 754 ); 775 755 } 776 756 … … 779 759 uint8* dst, 780 760 int dst_width) { 781 asm volatile ( 782 // change the stride to row 2 pointer 783 "add %1, %1, %0 \n" 784 "1: \n" 785 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. 786 "subs %w3, %w3, #8 \n" // 8 processed per loop. 787 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 788 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 789 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 790 "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. 791 "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. 792 "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. 793 "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. 794 "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. 795 "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. 796 "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack 797 "rshrn v1.8b, v1.8h, #2 \n" 798 "rshrn v2.8b, v2.8h, #2 \n" 799 "rshrn v3.8b, v3.8h, #2 \n" 800 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" 801 "b.gt 1b \n" 802 : "+r" (src_ptr), // %0 803 "+r" (src_stride), // %1 804 "+r" (dst), // %2 805 "+r" (dst_width) // %3 806 : 807 : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19" 808 ); 761 asm volatile( 762 // change the stride to row 2 pointer 763 "add %1, %1, %0 \n" 764 "1: \n" 765 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB 766 "subs %w3, %w3, #8 \n" // 8 processed per loop. 767 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 768 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 769 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 770 "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. 771 "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 772 "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. 773 "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. 774 "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. 775 "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. 
776 "rshrn v0.8b, v0.8h, #2 \n" // round and pack 777 "rshrn v1.8b, v1.8h, #2 \n" 778 "rshrn v2.8b, v2.8h, #2 \n" 779 "rshrn v3.8b, v3.8h, #2 \n" 780 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" 781 "b.gt 1b \n" 782 : "+r"(src_ptr), // %0 783 "+r"(src_stride), // %1 784 "+r"(dst), // %2 785 "+r"(dst_width) // %3 786 : 787 : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); 809 788 } 810 789 … … 817 796 int dst_width) { 818 797 (void)src_stride; 819 asm volatile ( 820 "1: \n" 821 "ld1 {v0.s}[0], [%0], %3 \n" 822 "ld1 {v0.s}[1], [%0], %3 \n" 823 "ld1 {v0.s}[2], [%0], %3 \n" 824 "ld1 {v0.s}[3], [%0], %3 \n" 825 "subs %w2, %w2, #4 \n" // 4 pixels per loop. 826 "st1 {v0.16b}, [%1], #16 \n" 827 "b.gt 1b \n" 828 : "+r"(src_argb), // %0 829 "+r"(dst_argb), // %1 830 "+r"(dst_width) // %2 831 : "r"((int64)(src_stepx * 4)) // %3 832 : "memory", "cc", "v0" 833 ); 798 asm volatile( 799 "1: \n" 800 "ld1 {v0.s}[0], [%0], %3 \n" 801 "ld1 {v0.s}[1], [%0], %3 \n" 802 "ld1 {v0.s}[2], [%0], %3 \n" 803 "ld1 {v0.s}[3], [%0], %3 \n" 804 "subs %w2, %w2, #4 \n" // 4 pixels per loop. 805 "st1 {v0.16b}, [%1], #16 \n" 806 "b.gt 1b \n" 807 : "+r"(src_argb), // %0 808 "+r"(dst_argb), // %1 809 "+r"(dst_width) // %2 810 : "r"((int64)(src_stepx * 4)) // %3 811 : "memory", "cc", "v0"); 834 812 } 835 813 … … 843 821 uint8* dst_argb, 844 822 int dst_width) { 845 asm volatile ( 846 "add %1, %1, %0 \n" 847 "1: \n" 848 "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 849 "ld1 {v1.8b}, [%1], %4 \n" 850 "ld1 {v2.8b}, [%0], %4 \n" 851 "ld1 {v3.8b}, [%1], %4 \n" 852 "ld1 {v4.8b}, [%0], %4 \n" 853 "ld1 {v5.8b}, [%1], %4 \n" 854 "ld1 {v6.8b}, [%0], %4 \n" 855 "ld1 {v7.8b}, [%1], %4 \n" 856 "uaddl v0.8h, v0.8b, v1.8b \n" 857 "uaddl v2.8h, v2.8b, v3.8b \n" 858 "uaddl v4.8h, v4.8b, v5.8b \n" 859 "uaddl v6.8h, v6.8b, v7.8b \n" 860 "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd 861 "mov v0.d[1], v2.d[0] \n" 862 "mov v2.d[0], v16.d[1] \n" 863 "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh 864 "mov v4.d[1], v6.d[0] \n" 865 "mov v6.d[0], v16.d[1] \n" 866 "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) 867 "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) 868 "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. 869 "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. 870 "subs %w3, %w3, #4 \n" // 4 pixels per loop. 871 "st1 {v0.16b}, [%2], #16 \n" 872 "b.gt 1b \n" 873 : "+r"(src_argb), // %0 874 "+r"(src_stride), // %1 875 "+r"(dst_argb), // %2 876 "+r"(dst_width) // %3 877 : "r"((int64)(src_stepx * 4)) // %4 878 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 879 ); 880 } 881 882 // clang-format off 823 asm volatile( 824 "add %1, %1, %0 \n" 825 "1: \n" 826 "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 827 "ld1 {v1.8b}, [%1], %4 \n" 828 "ld1 {v2.8b}, [%0], %4 \n" 829 "ld1 {v3.8b}, [%1], %4 \n" 830 "ld1 {v4.8b}, [%0], %4 \n" 831 "ld1 {v5.8b}, [%1], %4 \n" 832 "ld1 {v6.8b}, [%0], %4 \n" 833 "ld1 {v7.8b}, [%1], %4 \n" 834 "uaddl v0.8h, v0.8b, v1.8b \n" 835 "uaddl v2.8h, v2.8b, v3.8b \n" 836 "uaddl v4.8h, v4.8b, v5.8b \n" 837 "uaddl v6.8h, v6.8b, v7.8b \n" 838 "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd 839 "mov v0.d[1], v2.d[0] \n" 840 "mov v2.d[0], v16.d[1] \n" 841 "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh 842 "mov v4.d[1], v6.d[0] \n" 843 "mov v6.d[0], v16.d[1] \n" 844 "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) 845 "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) 846 "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. 847 "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. 848 "subs %w3, %w3, #4 \n" // 4 pixels per loop. 
849 "st1 {v0.16b}, [%2], #16 \n" 850 "b.gt 1b \n" 851 : "+r"(src_argb), // %0 852 "+r"(src_stride), // %1 853 "+r"(dst_argb), // %2 854 "+r"(dst_width) // %3 855 : "r"((int64)(src_stepx * 4)) // %4 856 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); 857 } 858 883 859 // TODO(Yang Zhang): Investigate less load instructions for 884 860 // the x/dx stepping 885 #define LOAD1_DATA32_LANE(vn, n) 886 "lsr %5, %3, #16 \n" 887 "add %6, %1, %5, lsl #2 \n" 888 "add %3, %3, %4 \n" 861 #define LOAD1_DATA32_LANE(vn, n) \ 862 "lsr %5, %3, #16 \n" \ 863 "add %6, %1, %5, lsl #2 \n" \ 864 "add %3, %3, %4 \n" \ 889 865 "ld1 {" #vn ".s}[" #n "], [%6] \n" 890 // clang-format on891 866 892 867 void ScaleARGBCols_NEON(uint8* dst_argb, … … 896 871 int dx) { 897 872 const uint8* src_tmp = src_argb; 898 int64 x64 = (int64)x; 899 int64 dx64 = (int64)dx; 873 int64 x64 = (int64)x; // NOLINT 874 int64 dx64 = (int64)dx; // NOLINT 900 875 int64 tmp64; 901 asm volatile 902 "1:\n"903 LOAD1_DATA32_LANE(v0, 0)904 LOAD1_DATA32_LANE(v0, 1)905 LOAD1_DATA32_LANE(v0, 2)906 LOAD1_DATA32_LANE(v0, 3)907 LOAD1_DATA32_LANE(v1, 0)908 LOAD1_DATA32_LANE(v1, 1)909 LOAD1_DATA32_LANE(v1, 2)910 LOAD1_DATA32_LANE(v1, 3)911 912 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels913 "subs %w2, %w2, #8 \n" // 8 processed per loop914 "b.gt 1b \n"915 : "+r"(dst_argb), // %0916 "+r"(src_argb), // %1917 "+r"(dst_width), // %2918 "+r"(x64), // %3919 "+r"(dx64), // %4920 "=&r"(tmp64), // %5921 "+r"(src_tmp) // %6922 :923 : "memory", "cc", "v0", "v1"924 );876 asm volatile( 877 "1: \n" 878 // clang-format off 879 LOAD1_DATA32_LANE(v0, 0) 880 LOAD1_DATA32_LANE(v0, 1) 881 LOAD1_DATA32_LANE(v0, 2) 882 LOAD1_DATA32_LANE(v0, 3) 883 LOAD1_DATA32_LANE(v1, 0) 884 LOAD1_DATA32_LANE(v1, 1) 885 LOAD1_DATA32_LANE(v1, 2) 886 LOAD1_DATA32_LANE(v1, 3) 887 // clang-format on 888 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels 889 "subs %w2, %w2, #8 \n" // 8 processed per loop 890 "b.gt 1b \n" 891 : "+r"(dst_argb), // %0 892 "+r"(src_argb), // %1 893 "+r"(dst_width), // %2 894 "+r"(x64), // %3 895 "+r"(dx64), // %4 896 "=&r"(tmp64), // %5 897 "+r"(src_tmp) // %6 898 : 899 : "memory", "cc", "v0", "v1"); 925 900 } 926 901 927 902 #undef LOAD1_DATA32_LANE 928 903 929 // clang-format off930 904 // TODO(Yang Zhang): Investigate less load instructions for 931 905 // the x/dx stepping 932 #define LOAD2_DATA32_LANE(vn1, vn2, n) 933 "lsr %5, %3, #16 \n" 934 "add %6, %1, %5, lsl #2 \n" 935 "add %3, %3, %4 \n" 906 #define LOAD2_DATA32_LANE(vn1, vn2, n) \ 907 "lsr %5, %3, #16 \n" \ 908 "add %6, %1, %5, lsl #2 \n" \ 909 "add %3, %3, %4 \n" \ 936 910 "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" 937 // clang-format on938 911 939 912 void ScaleARGBFilterCols_NEON(uint8* dst_argb, … … 945 918 int* tmp = dx_offset; 946 919 const uint8* src_tmp = src_argb; 947 int64 x64 = (int64)x; 948 int64 dx64 = (int64)dx; 920 int64 x64 = (int64)x; // NOLINT 921 int64 dx64 = (int64)dx; // NOLINT 949 922 asm volatile ( 950 923 "dup v0.4s, %w3 \n" // x … … 1002 975 #undef LOAD2_DATA32_LANE 1003 976 977 // Read 16x2 average down and write 8x1. 
978 void ScaleRowDown2Box_16_NEON(const uint16* src_ptr, 979 ptrdiff_t src_stride, 980 uint16* dst, 981 int dst_width) { 982 asm volatile( 983 // change the stride to row 2 pointer 984 "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 985 "1: \n" 986 "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc 987 "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc 988 "subs %w3, %w3, #8 \n" // 8 processed per loop 989 "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent 990 "uaddlp v1.4s, v1.8h \n" 991 "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent 992 "uadalp v1.4s, v3.8h \n" 993 "rshrn v0.4h, v0.4s, #2 \n" // round and pack 994 "rshrn2 v0.8h, v1.4s, #2 \n" 995 "st1 {v0.8h}, [%2], #16 \n" 996 "b.gt 1b \n" 997 : "+r"(src_ptr), // %0 998 "+r"(src_stride), // %1 999 "+r"(dst), // %2 1000 "+r"(dst_width) // %3 1001 : 1002 : "v0", "v1", "v2", "v3" // Clobber List 1003 ); 1004 } 1005 1006 // Read 8x2 upsample with filtering and write 16x1. 1007 // Actually reads an extra pixel, so 9x2. 1008 void ScaleRowUp2_16_NEON(const uint16* src_ptr, 1009 ptrdiff_t src_stride, 1010 uint16* dst, 1011 int dst_width) { 1012 asm volatile( 1013 "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 1014 "movi v0.8h, #9 \n" // constants 1015 "movi v1.4s, #3 \n" 1016 1017 "1: \n" 1018 "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 1019 "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 1020 "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row 1021 "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 1022 "subs %w3, %w3, #16 \n" // 16 dst pixels per loop 1023 "umull v16.4s, v3.4h, v0.4h \n" 1024 "umull2 v7.4s, v3.8h, v0.8h \n" 1025 "umull v18.4s, v4.4h, v0.4h \n" 1026 "umull2 v17.4s, v4.8h, v0.8h \n" 1027 "uaddw v16.4s, v16.4s, v6.4h \n" 1028 "uaddl2 v19.4s, v6.8h, v3.8h \n" 1029 "uaddl v3.4s, v6.4h, v3.4h \n" 1030 "uaddw2 v6.4s, v7.4s, v6.8h \n" 1031 "uaddl2 v7.4s, v5.8h, v4.8h \n" 1032 "uaddl v4.4s, v5.4h, v4.4h \n" 1033 "uaddw v18.4s, v18.4s, v5.4h \n" 1034 "mla v16.4s, v4.4s, v1.4s \n" 1035 "mla v18.4s, v3.4s, v1.4s \n" 1036 "mla v6.4s, v7.4s, v1.4s \n" 1037 "uaddw2 v4.4s, v17.4s, v5.8h \n" 1038 "uqrshrn v16.4h, v16.4s, #4 \n" 1039 "mla v4.4s, v19.4s, v1.4s \n" 1040 "uqrshrn2 v16.8h, v6.4s, #4 \n" 1041 "uqrshrn v17.4h, v18.4s, #4 \n" 1042 "uqrshrn2 v17.8h, v4.4s, #4 \n" 1043 "st2 {v16.8h-v17.8h}, [%2], #32 \n" 1044 "b.gt 1b \n" 1045 : "+r"(src_ptr), // %0 1046 "+r"(src_stride), // %1 1047 "+r"(dst), // %2 1048 "+r"(dst_width) // %3 1049 : "r"(2LL), // %4 1050 "r"(14LL) // %5 1051 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", 1052 "v19" // Clobber List 1053 ); 1054 } 1055 1004 1056 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 1005 1057 -
pjproject/trunk/third_party/yuv/source/scale_win.cc
r5633 r5699 18 18 19 19 // This module is for 32 bit Visual C x86 and clangcl 20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) 20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 21 21 22 22 // Offsets for source bytes 0 to 9 … … 817 817 pxor xmm5, xmm5 818 818 819 // sum rows819 // sum rows 820 820 xloop: 821 821 movdqu xmm3, [eax] // read 16 bytes … … 848 848 vpxor ymm5, ymm5, ymm5 849 849 850 // sum rows850 // sum rows 851 851 xloop: 852 852 vmovdqu ymm3, [eax] // read 32 bytes … … 940 940 jl xloop99 941 941 942 // 1 pixel remainder942 // 1 pixel remainder 943 943 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels 944 944 movd xmm0, ebx … … 1195 1195 jl xloop49 1196 1196 1197 // 4 Pixel loop.1197 // 4 Pixel loop. 1198 1198 xloop4: 1199 1199 movd xmm0, [esi + eax * 4] // 1 source x0 pixels … … 1219 1219 je xloop29 1220 1220 1221 // 2 Pixels.1221 // 2 Pixels. 1222 1222 movd xmm0, [esi + eax * 4] // 1 source x0 pixels 1223 1223 movd xmm1, [esi + edx * 4] // 1 source x1 pixels … … 1232 1232 je xloop99 1233 1233 1234 // 1 Pixels.1234 // 1 Pixels. 1235 1235 movd xmm0, [esi + eax * 4] // 1 source x2 pixels 1236 1236 movd dword ptr [edi], xmm0 … … 1310 1310 jl xloop99 1311 1311 1312 // 1 pixel remainder1312 // 1 pixel remainder 1313 1313 psrlw xmm2, 9 // 7 bit fractions. 1314 1314 movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
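
Both the aarch64 LOAD2_DATA8_LANE / LOAD1_DATA32_LANE macros above ("lsr %5, %3, #16" then "add %3, %3, %4") and the Visual C column loops in scale_win.cc step through the source in 16.16 fixed point: the integer part of x selects the source pixel(s) for each destination pixel, the fractional part drives the filter weights, and x then advances by dx. A minimal scalar sketch of that stepping follows, using the common 16.16 linear blend only for illustration (the row_common.cc formula referenced in the comment above is not reproduced in this diff, and every name below is illustrative rather than libyuv's):
--------------------------------------------------------------------------------------
#include <stdint.h>

// Scalar sketch of the 16.16 fixed-point column walk that the lane-load macros
// perform one lane at a time.  The blend is the common 16.16 formulation, not a
// quote of the elided row_common.cc comment; all names are illustrative.
static void ScaleFilterCols_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                   int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;     // "lsr %5, %3, #16" - integer source index
    int xf = x & 0xffff;  // 16-bit fractional weight
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)(a + ((xf * (b - a)) >> 16));  // linear blend
    x += dx;              // "add %3, %3, %4" - step to the next pixel
  }
}
--------------------------------------------------------------------------------------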
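The newly added ScaleRowDown2Box_16_NEON reads 16x2 pixels of 16-bit data and writes 8x1 by box-averaging each 2x2 block: pairwise widening adds (uaddlp/uadalp) followed by a rounding narrowing shift by 2. A minimal scalar sketch of the same arithmetic, assuming src_stride counts uint16 elements as the "lsl #1" byte scaling implies (the function name is illustrative):
--------------------------------------------------------------------------------------
#include <stddef.h>  // ptrdiff_t
#include <stdint.h>  // uint16_t, uint32_t

// Scalar sketch of the 2x2 box average: each output pixel is the rounded mean
// of four 16-bit source pixels (uaddlp/uadalp pairwise adds + "rshrn ..., #2").
static void ScaleRowDown2Box_16_Sketch(const uint16_t* src_ptr,
                                       ptrdiff_t src_stride,
                                       uint16_t* dst,
                                       int dst_width) {
  const uint16_t* row1 = src_ptr + src_stride;  // second source row
  int i;
  for (i = 0; i < dst_width; ++i) {
    uint32_t sum = (uint32_t)src_ptr[2 * i] + src_ptr[2 * i + 1] +
                   row1[2 * i] + row1[2 * i + 1];
    dst[i] = (uint16_t)((sum + 2) >> 2);  // round to nearest; 65535 max fits
  }
}
--------------------------------------------------------------------------------------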