Changeset 5699 for pjproject/trunk/third_party/yuv/source/scale_neon.cc
- Timestamp: Nov 21, 2017 9:25:11 AM (6 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
pjproject/trunk/third_party/yuv/source/scale_neon.cc
r5633 r5699 30 30 (void)src_stride; 31 31 asm volatile( 32 "1: 32 "1: \n" 33 33 // load even pixels into q0, odd into q1 34 34 "vld2.8 {q0, q1}, [%0]! \n" … … 51 51 (void)src_stride; 52 52 asm volatile( 53 "1: \n" 54 "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post 55 // inc 53 "1: \n" 54 "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels 56 55 "subs %2, %2, #16 \n" // 16 processed per loop 57 "vpaddl.u8 q0, q0 \n" // add adjacent 58 "vpaddl.u8 q1, q1 \n" 59 "vrshrn.u16 d0, q0, #1 \n" // downshift, round and 60 // pack 61 "vrshrn.u16 d1, q1, #1 \n" 56 "vrhadd.u8 q0, q0, q1 \n" // rounding half add 62 57 "vst1.8 {q0}, [%1]! \n" 63 58 "bgt 1b \n" … … 78 73 // change the stride to row 2 pointer 79 74 "add %1, %0 \n" 80 "1: 75 "1: \n" 81 76 "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc 82 77 "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc … … 107 102 (void)src_stride; 108 103 asm volatile( 109 "1: 104 "1: \n" 110 105 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 111 106 "subs %2, %2, #8 \n" // 8 processed per loop … … 127 122 const uint8* src_ptr3 = src_ptr + src_stride * 3; 128 123 asm volatile( 129 "1: 124 "1: \n" 130 125 "vld1.8 {q0}, [%0]! \n" // load up 16x4 131 126 "vld1.8 {q1}, [%3]! \n" … … 161 156 (void)src_stride; 162 157 asm volatile( 163 "1: 164 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0165 "subs %2, %2, #24 \n"166 "vmov d2, d3 \n" // order d0, d1, d2167 "vst3.8 {d0, d1, d2}, [%1]! \n"168 "bgt 1b \n"158 "1: \n" 159 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 160 "subs %2, %2, #24 \n" 161 "vmov d2, d3 \n" // order d0, d1, d2 162 "vst3.8 {d0, d1, d2}, [%1]! \n" 163 "bgt 1b \n" 169 164 : "+r"(src_ptr), // %0 170 165 "+r"(dst_ptr), // %1 … … 181 176 "vmov.u8 d24, #3 \n" 182 177 "add %3, %0 \n" 183 "1: 178 "1: \n" 184 179 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 185 180 "vld4.8 {d4, d5, d6, d7}, [%3]! 
\n" // src line 1 … … 238 233 "vmov.u8 d24, #3 \n" 239 234 "add %3, %0 \n" 240 "1: 235 "1: \n" 241 236 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 242 237 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 … … 286 281 asm volatile( 287 282 "vld1.8 {q3}, [%3] \n" 288 "1: 283 "1: \n" 289 284 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" 290 285 "subs %2, %2, #12 \n" … … 313 308 "vld1.8 {q15}, [%7] \n" 314 309 "add %3, %0 \n" 315 "1: 310 "1: \n" 316 311 317 312 // d0 = 00 40 01 41 02 42 03 43 … … 422 417 "vld1.8 {q14}, [%5] \n" 423 418 "add %3, %0 \n" 424 "1: 419 "1: \n" 425 420 426 421 // d0 = 00 40 01 41 02 42 03 43 … … 514 509 const uint8* src_tmp; 515 510 asm volatile( 516 "1: 511 "1: \n" 517 512 "mov %0, %1 \n" 518 513 "mov r12, %5 \n" 519 514 "veor q2, q2, q2 \n" 520 515 "veor q3, q3, q3 \n" 521 "2: 516 "2: \n" 522 517 // load 16 pixels into q0 523 518 "vld1.8 {q0}, [%0], %3 \n" … … 541 536 } 542 537 543 // clang-format off544 538 // TODO(Yang Zhang): Investigate less load instructions for 545 539 // the x/dx stepping 546 #define LOAD2_DATA8_LANE(n) \ 547 "lsr %5, %3, #16 \n" \ 548 "add %6, %1, %5 \n" \ 549 "add %3, %3, %4 \n" \ 550 "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" 551 // clang-format on 540 #define LOAD2_DATA8_LANE(n) \ 541 "lsr %5, %3, #16 \n" \ 542 "add %6, %1, %5 \n" \ 543 "add %3, %3, %4 \n" \ 544 "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n" 552 545 553 546 // The NEON version mimics this formula (from row_common.cc): … … 640 633 "vdup.8 d4, %4 \n" 641 634 // General purpose row blend. 642 "1: 635 "1: \n" 643 636 "vld1.8 {q0}, [%1]! \n" 644 637 "vld1.8 {q1}, [%2]! \n" … … 655 648 656 649 // Blend 25 / 75. 657 "25: 650 "25: \n" 658 651 "vld1.8 {q0}, [%1]! \n" 659 652 "vld1.8 {q1}, [%2]! \n" … … 666 659 667 660 // Blend 50 / 50. 668 "50: 661 "50: \n" 669 662 "vld1.8 {q0}, [%1]! \n" 670 663 "vld1.8 {q1}, [%2]! \n" … … 676 669 677 670 // Blend 75 / 25. 678 "75: 671 "75: \n" 679 672 "vld1.8 {q1}, [%1]! \n" 680 673 "vld1.8 {q0}, [%2]! 
\n" … … 687 680 688 681 // Blend 100 / 0 - Copy row unchanged. 689 "100: 682 "100: \n" 690 683 "vld1.8 {q0}, [%1]! \n" 691 684 "subs %3, %3, #16 \n" … … 693 686 "bgt 100b \n" 694 687 695 "99: 688 "99: \n" 696 689 "vst1.8 {d1[7]}, [%0] \n" 697 690 : "+r"(dst_ptr), // %0 … … 710 703 (void)src_stride; 711 704 asm volatile( 712 "1: \n" 713 // load even pixels into q0, odd into q1 714 "vld2.32 {q0, q1}, [%0]! \n" 715 "vld2.32 {q2, q3}, [%0]! \n" 705 "1: \n" 706 "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 707 "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB 716 708 "subs %2, %2, #8 \n" // 8 processed per loop 717 "v st1.8 {q1}, [%1]! \n" // store odd pixels718 "vst 1.8 {q3}, [%1]! \n"709 "vmov q2, q1 \n" // load next 8 ARGB 710 "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels 719 711 "bgt 1b \n" 720 712 : "+r"(src_ptr), // %0 … … 725 717 ); 726 718 } 719 720 // 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! 721 // 4a: 3e04 subs r6, #4 722 // 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! 723 // 50: ef64 21f4 vorr q9, q10, q10 724 // 54: f942 038d vst2.32 {d16-d19}, [r2]! 725 // 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46> 727 726 728 727 void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, … … 732 731 (void)src_stride; 733 732 asm volatile( 734 "1: \n" 735 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 736 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB 737 // pixels. 733 "1: \n" 734 "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 735 "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB 738 736 "subs %2, %2, #8 \n" // 8 processed per loop 739 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 740 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 741 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. 742 "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. 
743 "vrshrn.u16 d0, q0, #1 \n" // downshift, round and 744 // pack 745 "vrshrn.u16 d1, q1, #1 \n" 746 "vrshrn.u16 d2, q2, #1 \n" 747 "vrshrn.u16 d3, q3, #1 \n" 748 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" 737 "vrhadd.u8 q0, q0, q1 \n" // rounding half add 738 "vrhadd.u8 q1, q2, q3 \n" // rounding half add 739 "vst2.32 {q0, q1}, [%1]! \n" 749 740 "bgt 1b \n" 750 741 : "+r"(src_argb), // %0 … … 763 754 // change the stride to row 2 pointer 764 755 "add %1, %1, %0 \n" 765 "1: 756 "1: \n" 766 757 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 767 758 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB 768 // pixels.769 759 "subs %3, %3, #8 \n" // 8 processed per loop. 770 760 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. … … 773 763 "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. 774 764 "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB 775 // pixels.776 765 "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB 777 // pixels.778 766 "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. 779 767 "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. 780 768 "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. 781 769 "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. 
782 "vrshrn.u16 d0, q0, #2 \n" // downshift, round and 783 // pack 770 "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes 784 771 "vrshrn.u16 d1, q1, #2 \n" 785 772 "vrshrn.u16 d2, q2, #2 \n" … … 805 792 asm volatile( 806 793 "mov r12, %3, lsl #2 \n" 807 "1: 794 "1: \n" 808 795 "vld1.32 {d0[0]}, [%0], r12 \n" 809 796 "vld1.32 {d0[1]}, [%0], r12 \n" … … 830 817 "mov r12, %4, lsl #2 \n" 831 818 "add %1, %1, %0 \n" 832 "1: \n" 833 "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 834 // 2x1 819 "1: \n" 820 "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 835 821 "vld1.8 {d1}, [%1], r12 \n" 836 822 "vld1.8 {d2}, [%0], r12 \n" … … 861 847 } 862 848 863 // clang-format off864 849 // TODO(Yang Zhang): Investigate less load instructions for 865 850 // the x/dx stepping 866 #define LOAD1_DATA32_LANE(dn, n) 867 "lsr %5, %3, #16 \n" 868 "add %6, %1, %5, lsl #2 \n" 869 "add %3, %3, %4 \n" 851 #define LOAD1_DATA32_LANE(dn, n) \ 852 "lsr %5, %3, #16 \n" \ 853 "add %6, %1, %5, lsl #2 \n" \ 854 "add %3, %3, %4 \n" \ 870 855 "vld1.32 {" #dn "[" #n "]}, [%6] \n" 871 // clang-format on872 856 873 857 void ScaleARGBCols_NEON(uint8* dst_argb, … … 879 863 const uint8* src_tmp = src_argb; 880 864 asm volatile( 881 "1: \n" LOAD1_DATA32_LANE( 882 d0, 0) LOAD1_DATA32_LANE(d0, 1) LOAD1_DATA32_LANE(d1, 0) 883 LOAD1_DATA32_LANE(d1, 1) LOAD1_DATA32_LANE(d2, 0) LOAD1_DATA32_LANE( 884 d2, 1) LOAD1_DATA32_LANE(d3, 0) LOAD1_DATA32_LANE(d3, 1) 885 886 "vst1.32 {q0, q1}, [%0]! \n" // store pixels 887 "subs %2, %2, #8 \n" // 8 processed per 888 // loop 889 "bgt 1b \n" 865 "1: \n" 866 // clang-format off 867 LOAD1_DATA32_LANE(d0, 0) 868 LOAD1_DATA32_LANE(d0, 1) 869 LOAD1_DATA32_LANE(d1, 0) 870 LOAD1_DATA32_LANE(d1, 1) 871 LOAD1_DATA32_LANE(d2, 0) 872 LOAD1_DATA32_LANE(d2, 1) 873 LOAD1_DATA32_LANE(d3, 0) 874 LOAD1_DATA32_LANE(d3, 1) 875 // clang-format on 876 "vst1.32 {q0, q1}, [%0]! 
\n" // store pixels 877 "subs %2, %2, #8 \n" // 8 processed per loop 878 "bgt 1b \n" 890 879 : "+r"(dst_argb), // %0 891 880 "+r"(src_argb), // %1 … … 901 890 #undef LOAD1_DATA32_LANE 902 891 903 // clang-format off904 892 // TODO(Yang Zhang): Investigate less load instructions for 905 893 // the x/dx stepping 906 #define LOAD2_DATA32_LANE(dn1, dn2, n) \ 907 "lsr %5, %3, #16 \n" \ 908 "add %6, %1, %5, lsl #2 \n" \ 909 "add %3, %3, %4 \n" \ 910 "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" 911 // clang-format on 894 #define LOAD2_DATA32_LANE(dn1, dn2, n) \ 895 "lsr %5, %3, #16 \n" \ 896 "add %6, %1, %5, lsl #2 \n" \ 897 "add %3, %3, %4 \n" \ 898 "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" 912 899 913 900 void ScaleARGBFilterCols_NEON(uint8* dst_argb,
Note: See TracChangeset for help on using the changeset viewer.