Changeset 5699 for pjproject/trunk/third_party/yuv/source/row_neon64.cc
- Timestamp:
- Nov 21, 2017 9:25:11 AM (6 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
pjproject/trunk/third_party/yuv/source/row_neon64.cc
r5633 r5699 274 274 asm volatile( 275 275 YUVTORGB_SETUP 276 "1: 276 "1: \n" READYUV422 YUVTORGB( 277 277 v22, v21, 278 278 v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 … … 311 311 YUVTORGB_SETUP 312 312 "movi v23.8b, #255 \n" 313 "1: 313 "1: \n" READYUV422 YUVTORGB( 314 314 v22, v21, 315 315 v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 … … 396 396 asm volatile( 397 397 "movi v23.8b, #255 \n" 398 "1: 398 "1: \n" 399 399 "ld1 {v20.8b}, [%0], #8 \n" 400 400 "orr v21.8b, v20.8b, v20.8b \n" … … 471 471 asm volatile( 472 472 YUVTORGB_SETUP 473 "1: 473 "1: \n" READNV12 YUVTORGB( 474 474 v22, v21, 475 475 v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 … … 545 545 int width) { 546 546 asm volatile( 547 "1: 547 "1: \n" 548 548 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV 549 549 "subs %w3, %w3, #16 \n" // 16 processed per loop … … 566 566 int width) { 567 567 asm volatile( 568 "1: 568 "1: \n" 569 569 "ld1 {v0.16b}, [%0], #16 \n" // load U 570 570 "ld1 {v1.16b}, [%1], #16 \n" // load V … … 581 581 } 582 582 583 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 583 // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. 
584 void SplitRGBRow_NEON(const uint8* src_rgb, 585 uint8* dst_r, 586 uint8* dst_g, 587 uint8* dst_b, 588 int width) { 589 asm volatile( 590 "1: \n" 591 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB 592 "subs %w4, %w4, #16 \n" // 16 processed per loop 593 "st1 {v0.16b}, [%1], #16 \n" // store R 594 "st1 {v1.16b}, [%2], #16 \n" // store G 595 "st1 {v2.16b}, [%3], #16 \n" // store B 596 "b.gt 1b \n" 597 : "+r"(src_rgb), // %0 598 "+r"(dst_r), // %1 599 "+r"(dst_g), // %2 600 "+r"(dst_b), // %3 601 "+r"(width) // %4 602 : // Input registers 603 : "cc", "memory", "v0", "v1", "v2" // Clobber List 604 ); 605 } 606 607 // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time 608 void MergeRGBRow_NEON(const uint8* src_r, 609 const uint8* src_g, 610 const uint8* src_b, 611 uint8* dst_rgb, 612 int width) { 613 asm volatile( 614 "1: \n" 615 "ld1 {v0.16b}, [%0], #16 \n" // load R 616 "ld1 {v1.16b}, [%1], #16 \n" // load G 617 "ld1 {v2.16b}, [%2], #16 \n" // load B 618 "subs %w4, %w4, #16 \n" // 16 processed per loop 619 "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB 620 "b.gt 1b \n" 621 : "+r"(src_r), // %0 622 "+r"(src_g), // %1 623 "+r"(src_b), // %2 624 "+r"(dst_rgb), // %3 625 "+r"(width) // %4 626 : // Input registers 627 : "cc", "memory", "v0", "v1", "v2" // Clobber List 628 ); 629 } 630 631 // Copy multiple of 32. 
584 632 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { 585 633 asm volatile( 586 "1: 587 "ld 1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32634 "1: \n" 635 "ldp q0, q1, [%0], #32 \n" 588 636 "subs %w2, %w2, #32 \n" // 32 processed per loop 589 "st 1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32590 "b.gt 1b \n" 591 : "+r"(src), 592 "+r"(dst), 593 "+r"(count) 594 : 595 : "cc", "memory", "v0", "v1" , "v2", "v3"// Clobber List637 "stp q0, q1, [%1], #32 \n" 638 "b.gt 1b \n" 639 : "+r"(src), // %0 640 "+r"(dst), // %1 641 "+r"(count) // %2 // Output registers 642 : // Input registers 643 : "cc", "memory", "v0", "v1" // Clobber List 596 644 ); 597 645 } … … 601 649 asm volatile( 602 650 "dup v0.16b, %w2 \n" // duplicate 16 bytes 603 "1: 651 "1: \n" 604 652 "subs %w1, %w1, #16 \n" // 16 bytes per loop 605 653 "st1 {v0.16b}, [%0], #16 \n" // store … … 614 662 asm volatile( 615 663 "dup v0.4s, %w2 \n" // duplicate 4 ints 616 "1: 664 "1: \n" 617 665 "subs %w1, %w1, #4 \n" // 4 ints per loop 618 666 "st1 {v0.16b}, [%0], #16 \n" // store … … 629 677 "add %0, %0, %w2, sxtw \n" 630 678 "sub %0, %0, #16 \n" 631 "1: 679 "1: \n" 632 680 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 633 681 "subs %w2, %w2, #16 \n" // 16 pixels per loop. … … 651 699 "add %0, %0, %w3, sxtw #1 \n" 652 700 "sub %0, %0, #16 \n" 653 "1: 701 "1: \n" 654 702 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 655 703 "subs %w3, %w3, #8 \n" // 8 pixels per loop. … … 672 720 "add %0, %0, %w2, sxtw #2 \n" 673 721 "sub %0, %0, #16 \n" 674 "1: 722 "1: \n" 675 723 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 676 724 "subs %w2, %w2, #4 \n" // 4 pixels per loop. … … 689 737 asm volatile( 690 738 "movi v4.8b, #255 \n" // Alpha 691 "1: 739 "1: \n" 692 740 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. 693 741 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
694 742 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB 695 // pixels696 743 "b.gt 1b \n" 697 744 : "+r"(src_rgb24), // %0 … … 706 753 asm volatile( 707 754 "movi v5.8b, #255 \n" // Alpha 708 "1: 755 "1: \n" 709 756 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 710 757 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 723 770 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { 724 771 asm volatile( 725 "1: 772 "1: \n" 726 773 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 727 774 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 754 801 asm volatile( 755 802 "movi v3.8b, #255 \n" // Alpha 756 "1: 803 "1: \n" 757 804 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 758 805 "subs %w2, %w2, #8 \n" // 8 processed per loop. 759 806 RGB565TOARGB 760 807 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB 761 // pixels762 808 "b.gt 1b \n" 763 809 : "+r"(src_rgb565), // %0 … … 811 857 asm volatile( 812 858 "movi v3.8b, #255 \n" // Alpha 813 "1: 859 "1: \n" 814 860 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 815 861 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 842 888 int width) { 843 889 asm volatile( 844 "1: 890 "1: \n" 845 891 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 846 892 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 859 905 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { 860 906 asm volatile( 861 "1: 907 "1: \n" 862 908 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB 863 // pixels864 909 "subs %w2, %w2, #8 \n" // 8 processed per loop. 865 910 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of … … 876 921 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { 877 922 asm volatile( 878 "1: 923 "1: \n" 879 924 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a 880 925 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
… … 893 938 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { 894 939 asm volatile( 895 "1: 940 "1: \n" 896 941 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. 897 942 "subs %w2, %w2, #16 \n" // 16 processed per loop. … … 908 953 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { 909 954 asm volatile( 910 "1: 955 "1: \n" 911 956 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. 912 957 "subs %w2, %w2, #16 \n" // 16 processed per loop. … … 926 971 int width) { 927 972 asm volatile( 928 "1: 973 "1: \n" 929 974 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 930 // pixels931 975 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 932 976 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. … … 947 991 int width) { 948 992 asm volatile( 949 "1: 993 "1: \n" 950 994 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY 951 // pixels952 995 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 953 996 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. … … 970 1013 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; 971 1014 asm volatile( 972 "1: 1015 "1: \n" 973 1016 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 974 1017 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. … … 997 1040 const uint8* src_uyvyb = src_uyvy + stride_uyvy; 998 1041 asm volatile( 999 "1: 1042 "1: \n" 1000 1043 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1001 1044 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. … … 1024 1067 asm volatile( 1025 1068 "ld1 {v2.16b}, [%3] \n" // shuffler 1026 "1: 1069 "1: \n" 1027 1070 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 
1028 1071 "subs %w2, %w2, #4 \n" // 4 processed per loop … … 1044 1087 int width) { 1045 1088 asm volatile( 1046 "1: 1089 "1: \n" 1047 1090 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys 1048 1091 "orr v2.8b, v1.8b, v1.8b \n" … … 1067 1110 int width) { 1068 1111 asm volatile( 1069 "1: 1112 "1: \n" 1070 1113 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys 1071 1114 "orr v3.8b, v2.8b, v2.8b \n" … … 1086 1129 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { 1087 1130 asm volatile( 1088 "1: 1131 "1: \n" 1089 1132 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1090 1133 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1105 1148 asm volatile( 1106 1149 "dup v1.4s, %w2 \n" // dither4 1107 "1: 1150 "1: \n" 1108 1151 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels 1109 1152 "subs %w3, %w3, #8 \n" // 8 processed per loop. … … 1124 1167 int width) { 1125 1168 asm volatile( 1126 "1: 1169 "1: \n" 1127 1170 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1128 1171 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1144 1187 "movi v4.16b, #0x0f \n" // bits to clear with 1145 1188 // vbic. 1146 "1: 1189 "1: \n" 1147 1190 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1148 1191 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1164 1207 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1165 1208 "movi v7.8b, #16 \n" // Add 16 constant 1166 "1: 1209 "1: \n" 1167 1210 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 1168 // pixels.1169 1211 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
1170 1212 "umull v3.8h, v0.8b, v4.8b \n" // B … … 1184 1226 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { 1185 1227 asm volatile( 1186 "1: 1228 "1: \n" 1187 1229 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 1188 1230 // pixels … … 1203 1245 "movi v5.8b, #75 \n" // G * 0.58700 coefficient 1204 1246 "movi v6.8b, #38 \n" // R * 0.29900 coefficient 1205 "1: 1247 "1: \n" 1206 1248 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 1207 // pixels.1208 1249 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1209 1250 "umull v3.8h, v0.8b, v4.8b \n" // B … … 1233 1274 "movi v28.8b, #94 \n" // VG -0.7344 coefficient 1234 1275 "movi v29.16b,#0x80 \n" // 128.5 1235 "1: 1276 "1: \n" 1236 1277 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 1237 1278 // pixels. … … 1271 1312 1272 1313 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 1314 // clang-format off 1273 1315 #define RGBTOUV(QB, QG, QR) \ 1274 "mul v3.8h, " #QB \ 1275 ",v20.8h \n" /* B */ \ 1276 "mul v4.8h, " #QR \ 1277 ",v20.8h \n" /* R */ \ 1278 "mls v3.8h, " #QG \ 1279 ",v21.8h \n" /* G */ \ 1280 "mls v4.8h, " #QG \ 1281 ",v24.8h \n" /* G */ \ 1282 "mls v3.8h, " #QR \ 1283 ",v22.8h \n" /* R */ \ 1284 "mls v4.8h, " #QB \ 1285 ",v23.8h \n" /* B */ \ 1316 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ 1317 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ 1318 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ 1319 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ 1320 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ 1321 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ 1286 1322 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ 1287 1323 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ 1288 1324 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ 1289 1325 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ 1326 // clang-format on 1290 1327 1291 1328 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. 
… … 1579 1616 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 1580 1617 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 1581 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 1582 // 16-bit) 1583 "1: \n" 1618 "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit 1619 "1: \n" 1584 1620 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 1585 1621 RGB565TOARGB … … 1646 1682 asm volatile( 1647 1683 RGBTOUV_SETUP_REG 1648 "1: 1684 "1: \n" 1649 1685 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 1650 1686 RGB555TOARGB … … 1711 1747 asm volatile( 1712 1748 RGBTOUV_SETUP_REG 1713 "1: 1749 "1: \n" 1714 1750 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 1715 1751 ARGB4444TOARGB … … 1775 1811 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 1776 1812 "movi v27.8b, #16 \n" // Add 16 constant 1777 "1: 1813 "1: \n" 1778 1814 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 1779 1815 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1800 1836 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1801 1837 "movi v7.8b, #16 \n" // Add 16 constant 1802 "1: 1838 "1: \n" 1803 1839 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 1804 1840 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1824 1860 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 1825 1861 "movi v27.8b, #16 \n" // Add 16 constant 1826 "1: 1862 "1: \n" 1827 1863 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 1828 1864 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1848 1884 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 1849 1885 "movi v7.8b, #16 \n" // Add 16 constant 1850 "1: 1886 "1: \n" 1851 1887 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 1852 1888 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1871 1907 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 1872 1908 "movi v7.8b, #16 \n" // Add 16 constant 1873 "1: 1909 "1: \n" 1874 1910 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 
1875 1911 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1894 1930 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1895 1931 "movi v7.8b, #16 \n" // Add 16 constant 1896 "1: 1932 "1: \n" 1897 1933 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 1898 1934 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1917 1953 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1918 1954 "movi v7.8b, #16 \n" // Add 16 constant 1919 "1: 1955 "1: \n" 1920 1956 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 1921 1957 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1940 1976 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 1941 1977 "movi v7.8b, #16 \n" // Add 16 constant 1942 "1: 1978 "1: \n" 1943 1979 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 1944 1980 "subs %w2, %w2, #8 \n" // 8 processed per loop. … … 1975 2011 "dup v4.16b, %w5 \n" 1976 2012 // General purpose row blend. 1977 "1: 2013 "1: \n" 1978 2014 "ld1 {v0.16b}, [%1], #16 \n" 1979 2015 "ld1 {v1.16b}, [%2], #16 \n" … … 1990 2026 1991 2027 // Blend 50 / 50. 1992 "50: 2028 "50: \n" 1993 2029 "ld1 {v0.16b}, [%1], #16 \n" 1994 2030 "ld1 {v1.16b}, [%2], #16 \n" … … 2000 2036 2001 2037 // Blend 100 / 0 - Copy row unchanged. 2002 "100: 2038 "100: \n" 2003 2039 "ld1 {v0.16b}, [%1], #16 \n" 2004 2040 "subs %w3, %w3, #16 \n" … … 2006 2042 "b.gt 100b \n" 2007 2043 2008 "99: 2044 "99: \n" 2009 2045 : "+r"(dst_ptr), // %0 2010 2046 "+r"(src_ptr), // %1 … … 2026 2062 "b.lt 89f \n" 2027 2063 // Blend 8 pixels. 2028 "8: 2064 "8: \n" 2029 2065 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 2030 2066 // pixels … … 2049 2085 "b.ge 8b \n" 2050 2086 2051 "89: 2087 "89: \n" 2052 2088 "adds %w3, %w3, #8-1 \n" 2053 2089 "b.lt 99f \n" 2054 2090 2055 2091 // Blend 1 pixels. 2056 "1: 2092 "1: \n" 2057 2093 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. 2058 2094 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. 
… … 2074 2110 "b.ge 1b \n" 2075 2111 2076 "99: 2112 "99: \n" 2077 2113 2078 2114 : "+r"(src_argb0), // %0 … … 2089 2125 asm volatile( 2090 2126 // Attenuate 8 pixels. 2091 "1: 2127 "1: \n" 2092 2128 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2093 // pixels2094 2129 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2095 2130 "umull v4.8h, v0.8b, v3.8b \n" // b * a … … 2123 2158 2124 2159 // 8 pixel loop. 2125 "1: \n" 2126 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of 2127 // ARGB. 2160 "1: \n" 2161 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. 2128 2162 "subs %w1, %w1, #8 \n" // 8 processed per loop. 2129 2163 "uxtl v0.8h, v0.8b \n" // b (0 .. 255) … … 2143 2177 "uqxtn v2.8b, v2.8h \n" 2144 2178 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB 2145 // pixels2146 2179 "b.gt 1b \n" 2147 2180 : "+r"(dst_argb), // %0 … … 2166 2199 2167 2200 // 8 pixel loop. 2168 "1: 2201 "1: \n" 2169 2202 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB 2170 // pixels.2171 2203 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2172 2204 "uxtl v4.8h, v4.8b \n" // b (0 .. 255) … … 2183 2215 "uqxtn v7.8b, v7.8h \n" 2184 2216 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB 2185 // pixels2186 2217 "b.gt 1b \n" 2187 2218 : "+r"(src_argb), // %0 … … 2200 2231 "movi v25.8b, #75 \n" // G * 0.58700 coefficient 2201 2232 "movi v26.8b, #38 \n" // R * 0.29900 coefficient 2202 "1: 2233 "1: \n" 2203 2234 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2204 // pixels.2205 2235 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2206 2236 "umull v4.8h, v0.8b, v24.8b \n" // B … … 2235 2265 "movi v29.8b, #98 \n" // BG coefficient 2236 2266 "movi v30.8b, #50 \n" // BR coefficient 2237 "1: 2267 "1: \n" 2238 2268 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. 2239 2269 "subs %w1, %w1, #8 \n" // 8 processed per loop. … … 2271 2301 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. 
2272 2302 2273 "1: \n" 2274 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 2275 // pixels. 2303 "1: \n" 2304 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB 2276 2305 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2277 2306 "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit … … 2311 2340 "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R 2312 2341 "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A 2313 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 2314 // pixels. 2342 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB 2315 2343 "b.gt 1b \n" 2316 2344 : "+r"(src_argb), // %0 … … 2330 2358 asm volatile( 2331 2359 // 8 pixel loop. 2332 "1: 2360 "1: \n" 2333 2361 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2334 // pixels.2335 2362 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more 2336 // pixels.2337 2363 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2338 2364 "umull v0.8h, v0.8b, v4.8b \n" // multiply B … … 2345 2371 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A 2346 2372 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2347 // pixels 2348 "b.gt 1b \n" 2349 2373 "b.gt 1b \n" 2350 2374 : "+r"(src_argb0), // %0 2351 2375 "+r"(src_argb1), // %1 … … 2363 2387 asm volatile( 2364 2388 // 8 pixel loop. 2365 "1: 2389 "1: \n" 2366 2390 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2367 // pixels.2368 2391 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more 2369 // pixels.2370 2392 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2371 2393 "uqadd v0.8b, v0.8b, v4.8b \n" … … 2374 2396 "uqadd v3.8b, v3.8b, v7.8b \n" 2375 2397 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2376 // pixels 2377 "b.gt 1b \n" 2378 2398 "b.gt 1b \n" 2379 2399 : "+r"(src_argb0), // %0 2380 2400 "+r"(src_argb1), // %1 … … 2392 2412 asm volatile( 2393 2413 // 8 pixel loop. 
2394 "1: 2414 "1: \n" 2395 2415 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2396 // pixels.2397 2416 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more 2398 // pixels.2399 2417 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2400 2418 "uqsub v0.8b, v0.8b, v4.8b \n" … … 2403 2421 "uqsub v3.8b, v3.8b, v7.8b \n" 2404 2422 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2405 // pixels 2406 "b.gt 1b \n" 2407 2423 "b.gt 1b \n" 2408 2424 : "+r"(src_argb0), // %0 2409 2425 "+r"(src_argb1), // %1 … … 2426 2442 "movi v3.8b, #255 \n" // alpha 2427 2443 // 8 pixel loop. 2428 "1: 2444 "1: \n" 2429 2445 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. 2430 2446 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. … … 2434 2450 "orr v2.8b, v0.8b, v0.8b \n" 2435 2451 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2436 // pixels2437 2452 "b.gt 1b \n" 2438 2453 : "+r"(src_sobelx), // %0 … … 2451 2466 asm volatile( 2452 2467 // 16 pixel loop. 2453 "1: 2468 "1: \n" 2454 2469 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. 2455 2470 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. … … 2478 2493 "movi v3.8b, #255 \n" // alpha 2479 2494 // 8 pixel loop. 2480 "1: 2495 "1: \n" 2481 2496 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. 2482 2497 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. 
… … 2484 2499 "uqadd v1.8b, v0.8b, v2.8b \n" // add 2485 2500 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2486 // pixels2487 2501 "b.gt 1b \n" 2488 2502 : "+r"(src_sobelx), // %0 … … 2504 2518 int width) { 2505 2519 asm volatile( 2506 "1: 2520 "1: \n" 2507 2521 "ld1 {v0.8b}, [%0],%5 \n" // top 2508 2522 "ld1 {v1.8b}, [%0],%6 \n" … … 2542 2556 int width) { 2543 2557 asm volatile( 2544 "1: 2558 "1: \n" 2545 2559 "ld1 {v0.8b}, [%0],%4 \n" // left 2546 2560 "ld1 {v1.8b}, [%1],%4 \n" … … 2573 2587 void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { 2574 2588 asm volatile( 2575 "1: 2589 "1: \n" 2576 2590 "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts 2577 2591 "subs %w2, %w2, #8 \n" // 8 pixels per loop … … 2593 2607 void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { 2594 2608 asm volatile( 2595 "1: 2609 "1: \n" 2596 2610 "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts 2597 2611 "subs %w2, %w2, #8 \n" // 8 pixels per loop … … 2613 2627 } 2614 2628

// Writes src[i] * scale to dst[i], 8 floats per pass, and returns the largest
// value seen among the *unscaled* inputs. Both running maxima start at 0, so
// the result is never below 0. width is only tested after a pass completes.
float ScaleMaxSamples_NEON(const float* src,
                           float* dst,
                           float scale,
                           int width) {
  float peak;
  asm volatile(
      "movi    v5.4s, #0                                       \n"  // running max, lanes 0-3
      "movi    v6.4s, #0                                       \n"  // running max, lanes 4-7

      "1:                                                      \n"
      "ld1     {v1.4s, v2.4s}, [%[src]], #32                   \n"  // load 8 samples
      "subs    %w[width], %w[width], #8                        \n"  // 8 processed per loop
      "fmul    v3.4s, v1.4s, %[scale].s[0]                     \n"  // scaled copy
      "fmul    v4.4s, v2.4s, %[scale].s[0]                     \n"
      "fmax    v5.4s, v5.4s, v1.4s                             \n"  // max of raw input
      "fmax    v6.4s, v6.4s, v2.4s                             \n"
      "st1     {v3.4s, v4.4s}, [%[dst]], #32                   \n"  // store 8 samples
      "b.gt    1b                                              \n"
      "fmax    v5.4s, v5.4s, v6.4s                             \n"  // merge both accumulators
      "fmaxv   %s[peak], v5.4s                                 \n"  // reduce 4 lanes to 1
      : [src] "+r"(src),
        [dst] "+r"(dst),
        [width] "+r"(width),
        [peak] "=w"(peak)
      : [scale] "w"(scale)
      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
  return peak;
}

// Writes src[i] * scale to dst[i], 8 floats per pass, and returns the sum of
// squares of the *unscaled* inputs. width is only tested after a pass
// completes.
float ScaleSumSamples_NEON(const float* src,
                           float* dst,
                           float scale,
                           int width) {
  float sum_sq;
  asm volatile(
      "movi    v5.4s, #0                                       \n"  // running sum, lanes 0-3
      "movi    v6.4s, #0                                       \n"  // running sum, lanes 4-7

      "1:                                                      \n"
      "ld1     {v1.4s, v2.4s}, [%[src]], #32                   \n"  // load 8 samples
      "subs    %w[width], %w[width], #8                        \n"  // 8 processed per loop
      "fmul    v3.4s, v1.4s, %[scale].s[0]                     \n"  // scaled copy
      "fmul    v4.4s, v2.4s, %[scale].s[0]                     \n"
      "fmla    v5.4s, v1.4s, v1.4s                             \n"  // += raw * raw
      "fmla    v6.4s, v2.4s, v2.4s                             \n"
      "st1     {v3.4s, v4.4s}, [%[dst]], #32                   \n"  // store 8 samples
      "b.gt    1b                                              \n"
      // Three pairwise adds fold the 8 partial sums into lane 0.
      "faddp   v5.4s, v5.4s, v6.4s                             \n"
      "faddp   v5.4s, v5.4s, v5.4s                             \n"
      "faddp   %[sum].4s, v5.4s, v5.4s                         \n"
      : [src] "+r"(src),
        [dst] "+r"(dst),
        [width] "+r"(width),
        [sum] "=w"(sum_sq)
      : [scale] "w"(scale)
      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
  return sum_sq;
}

// Writes src[i] * scale to dst[i], 8 floats per pass. width is only tested
// after a pass completes.
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
  asm volatile(
      "1:                                                      \n"
      "ld1     {v1.4s, v2.4s}, [%[src]], #32                   \n"  // load 8 samples
      "subs    %w[width], %w[width], #8                        \n"  // 8 processed per loop
      "fmul    v1.4s, v1.4s, %[scale].s[0]                     \n"
      "fmul    v2.4s, v2.4s, %[scale].s[0]                     \n"
      "st1     {v1.4s, v2.4s}, [%[dst]], #32                   \n"  // store 8 samples
      "b.gt    1b                                              \n"
      : [src] "+r"(src),
        [dst] "+r"(dst),
        [width] "+r"(width)
      : [scale] "w"(scale)
      : "cc", "memory", "v1", "v2");
}

// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
2705 void GaussCol_NEON(const uint16* src0, 2706 const uint16* src1, 2707 const uint16* src2, 2708 const uint16* src3, 2709 const uint16* src4, 2710 uint32* dst, 2711 int width) { 2712 asm volatile( 2713 "movi v6.8h, #4 \n" // constant 4 2714 "movi v7.8h, #6 \n" // constant 6 2715 2716 "1: \n" 2717 "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows 2718 "ld1 {v2.8h}, [%4], #16 \n" 2719 "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 2720 "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 2721 "ld1 {v2.8h}, [%1], #16 \n" 2722 "umlal v0.4s, v2.4h, v6.4h \n" // * 4 2723 "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 2724 "ld1 {v2.8h}, [%2], #16 \n" 2725 "umlal v0.4s, v2.4h, v7.4h \n" // * 6 2726 "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 2727 "ld1 {v2.8h}, [%3], #16 \n" 2728 "umlal v0.4s, v2.4h, v6.4h \n" // * 4 2729 "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 2730 "subs %w6, %w6, #8 \n" // 8 processed per loop 2731 "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples 2732 "b.gt 1b \n" 2733 : "+r"(src0), // %0 2734 "+r"(src1), // %1 2735 "+r"(src2), // %2 2736 "+r"(src3), // %3 2737 "+r"(src4), // %4 2738 "+r"(dst), // %5 2739 "+r"(width) // %6 2740 : 2741 : "cc", "memory", "v0", "v1", "v2", "v6", "v7"); 2742 } 2743 2744 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
2745 void GaussRow_NEON(const uint32* src, uint16* dst, int width) { 2746 const uint32* src1 = src + 1; 2747 const uint32* src2 = src + 2; 2748 const uint32* src3 = src + 3; 2749 asm volatile( 2750 "movi v6.4s, #4 \n" // constant 4 2751 "movi v7.4s, #6 \n" // constant 6 2752 2753 "1: \n" 2754 "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples 2755 "add v0.4s, v0.4s, v1.4s \n" // * 1 2756 "add v1.4s, v1.4s, v2.4s \n" // * 1 2757 "ld1 {v2.4s,v3.4s}, [%2], #32 \n" 2758 "mla v0.4s, v2.4s, v7.4s \n" // * 6 2759 "mla v1.4s, v3.4s, v7.4s \n" // * 6 2760 "ld1 {v2.4s,v3.4s}, [%1], #32 \n" 2761 "ld1 {v4.4s,v5.4s}, [%3], #32 \n" 2762 "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 2763 "add v3.4s, v3.4s, v5.4s \n" 2764 "mla v0.4s, v2.4s, v6.4s \n" // * 4 2765 "mla v1.4s, v3.4s, v6.4s \n" // * 4 2766 "subs %w5, %w5, #8 \n" // 8 processed per loop 2767 "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack 2768 "uqrshrn2 v0.8h, v1.4s, #8 \n" 2769 "st1 {v0.8h}, [%4], #16 \n" // store 8 samples 2770 "b.gt 1b \n" 2771 : "+r"(src), // %0 2772 "+r"(src1), // %1 2773 "+r"(src2), // %2 2774 "+r"(src3), // %3 2775 "+r"(dst), // %4 2776 "+r"(width) // %5 2777 : "r"(32LL) // %6 2778 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); 2779 } 2780 2615 2781 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 2616 2782
Note: See TracChangeset
for help on using the changeset viewer.