Changeset 5699 for pjproject/trunk/third_party/yuv/source/scale_msa.cc
- Timestamp:
- Nov 21, 2017 9:25:11 AM (6 years ago)
- File:
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
pjproject/trunk/third_party/yuv/source/scale_msa.cc
r5633 r5699 21 21 extern "C" { 22 22 #endif 23 24 #define LOAD_INDEXED_DATA(srcp, indx0, out0) \ 25 { \ 26 out0[0] = srcp[indx0[0]]; \ 27 out0[1] = srcp[indx0[1]]; \ 28 out0[2] = srcp[indx0[2]]; \ 29 out0[3] = srcp[indx0[3]]; \ 30 } 23 31 24 32 void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, … … 546 554 } 547 555 556 void ScaleFilterCols_MSA(uint8* dst_ptr, 557 const uint8* src_ptr, 558 int dst_width, 559 int x, 560 int dx) { 561 int j; 562 v4i32 vec_x = __msa_fill_w(x); 563 v4i32 vec_dx = __msa_fill_w(dx); 564 v4i32 vec_const = {0, 1, 2, 3}; 565 v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 566 v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 567 v8u16 reg0, reg1; 568 v16u8 dst0; 569 v4i32 const_0xFFFF = __msa_fill_w(0xFFFF); 570 v4i32 const_0x40 = __msa_fill_w(0x40); 571 572 vec0 = vec_dx * vec_const; 573 vec1 = vec_dx * 4; 574 vec_x += vec0; 575 576 for (j = 0; j < dst_width - 1; j += 16) { 577 vec2 = vec_x >> 16; 578 vec6 = vec_x & const_0xFFFF; 579 vec_x += vec1; 580 vec3 = vec_x >> 16; 581 vec7 = vec_x & const_0xFFFF; 582 vec_x += vec1; 583 vec4 = vec_x >> 16; 584 vec8 = vec_x & const_0xFFFF; 585 vec_x += vec1; 586 vec5 = vec_x >> 16; 587 vec9 = vec_x & const_0xFFFF; 588 vec_x += vec1; 589 vec6 >>= 9; 590 vec7 >>= 9; 591 vec8 >>= 9; 592 vec9 >>= 9; 593 LOAD_INDEXED_DATA(src_ptr, vec2, tmp0); 594 LOAD_INDEXED_DATA(src_ptr, vec3, tmp1); 595 LOAD_INDEXED_DATA(src_ptr, vec4, tmp2); 596 LOAD_INDEXED_DATA(src_ptr, vec5, tmp3); 597 vec2 += 1; 598 vec3 += 1; 599 vec4 += 1; 600 vec5 += 1; 601 LOAD_INDEXED_DATA(src_ptr, vec2, tmp4); 602 LOAD_INDEXED_DATA(src_ptr, vec3, tmp5); 603 LOAD_INDEXED_DATA(src_ptr, vec4, tmp6); 604 LOAD_INDEXED_DATA(src_ptr, vec5, tmp7); 605 tmp4 -= tmp0; 606 tmp5 -= tmp1; 607 tmp6 -= tmp2; 608 tmp7 -= tmp3; 609 tmp4 *= vec6; 610 tmp5 *= vec7; 611 tmp6 *= vec8; 612 tmp7 *= vec9; 613 tmp4 += const_0x40; 614 tmp5 += const_0x40; 615 tmp6 += const_0x40; 616 tmp7 += const_0x40; 617 tmp4 >>= 7; 618 tmp5 >>= 7; 619 tmp6 
>>= 7; 620 tmp7 >>= 7; 621 tmp0 += tmp4; 622 tmp1 += tmp5; 623 tmp2 += tmp6; 624 tmp3 += tmp7; 625 reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); 626 reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); 627 dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); 628 __msa_st_b(dst0, dst_ptr, 0); 629 dst_ptr += 16; 630 } 631 } 632 633 void ScaleARGBCols_MSA(uint8* dst_argb, 634 const uint8* src_argb, 635 int dst_width, 636 int x, 637 int dx) { 638 const uint32* src = (const uint32*)(src_argb); 639 uint32* dst = (uint32*)(dst_argb); 640 int j; 641 v4i32 x_vec = __msa_fill_w(x); 642 v4i32 dx_vec = __msa_fill_w(dx); 643 v4i32 const_vec = {0, 1, 2, 3}; 644 v4i32 vec0, vec1, vec2; 645 v4i32 dst0; 646 647 vec0 = dx_vec * const_vec; 648 vec1 = dx_vec * 4; 649 x_vec += vec0; 650 651 for (j = 0; j < dst_width; j += 4) { 652 vec2 = x_vec >> 16; 653 x_vec += vec1; 654 LOAD_INDEXED_DATA(src, vec2, dst0); 655 __msa_st_w(dst0, dst, 0); 656 dst += 4; 657 } 658 } 659 660 void ScaleARGBFilterCols_MSA(uint8* dst_argb, 661 const uint8* src_argb, 662 int dst_width, 663 int x, 664 int dx) { 665 const uint32* src = (const uint32*)(src_argb); 666 int j; 667 v4u32 src0, src1, src2, src3; 668 v4u32 vec0, vec1, vec2, vec3; 669 v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; 670 v16u8 mult0, mult1, mult2, mult3; 671 v8u16 tmp0, tmp1, tmp2, tmp3; 672 v16u8 dst0, dst1; 673 v4u32 vec_x = (v4u32)__msa_fill_w(x); 674 v4u32 vec_dx = (v4u32)__msa_fill_w(dx); 675 v4u32 vec_const = {0, 1, 2, 3}; 676 v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f); 677 678 vec0 = vec_dx * vec_const; 679 vec1 = vec_dx * 4; 680 vec_x += vec0; 681 682 for (j = 0; j < dst_width - 1; j += 8) { 683 vec2 = vec_x >> 16; 684 reg0 = (v16u8)(vec_x >> 9); 685 vec_x += vec1; 686 vec3 = vec_x >> 16; 687 reg1 = (v16u8)(vec_x >> 9); 688 vec_x += vec1; 689 reg0 = reg0 & const_0x7f; 690 reg1 = reg1 & const_0x7f; 691 reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0); 692 reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0); 693 reg2 = reg0 ^ 
const_0x7f; 694 reg3 = reg1 ^ const_0x7f; 695 mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2); 696 mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2); 697 mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3); 698 mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3); 699 LOAD_INDEXED_DATA(src, vec2, src0); 700 LOAD_INDEXED_DATA(src, vec3, src1); 701 vec2 += 1; 702 vec3 += 1; 703 LOAD_INDEXED_DATA(src, vec2, src2); 704 LOAD_INDEXED_DATA(src, vec3, src3); 705 reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); 706 reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); 707 reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); 708 reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); 709 tmp0 = __msa_dotp_u_h(reg4, mult0); 710 tmp1 = __msa_dotp_u_h(reg5, mult1); 711 tmp2 = __msa_dotp_u_h(reg6, mult2); 712 tmp3 = __msa_dotp_u_h(reg7, mult3); 713 tmp0 >>= 7; 714 tmp1 >>= 7; 715 tmp2 >>= 7; 716 tmp3 >>= 7; 717 dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 718 dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); 719 __msa_st_b(dst0, dst_argb, 0); 720 __msa_st_b(dst1, dst_argb, 16); 721 dst_argb += 32; 722 } 723 } 724 725 void ScaleRowDown34_MSA(const uint8* src_ptr, 726 ptrdiff_t src_stride, 727 uint8* dst, 728 int dst_width) { 729 int x; 730 (void)src_stride; 731 v16u8 src0, src1, src2, src3; 732 v16u8 vec0, vec1, vec2; 733 v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20}; 734 v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25}; 735 v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20, 736 21, 23, 24, 25, 27, 28, 29, 31}; 737 738 assert((dst_width % 3 == 0) && (dst_width > 0)); 739 740 for (x = 0; x < dst_width; x += 48) { 741 src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); 742 src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); 743 src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); 744 src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); 745 vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); 746 vec1 = 
(v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1); 747 vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2); 748 __msa_st_b((v16i8)vec0, dst, 0); 749 __msa_st_b((v16i8)vec1, dst, 16); 750 __msa_st_b((v16i8)vec2, dst, 32); 751 src_ptr += 64; 752 dst += 48; 753 } 754 } 755 756 void ScaleRowDown34_0_Box_MSA(const uint8* src_ptr, 757 ptrdiff_t src_stride, 758 uint8* d, 759 int dst_width) { 760 const uint8* s = src_ptr; 761 const uint8* t = src_ptr + src_stride; 762 int x; 763 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; 764 v16u8 vec0, vec1, vec2, vec3, vec4, vec5; 765 v16u8 vec6, vec7, vec8, vec9, vec10, vec11; 766 v8i16 reg0, reg1, reg2, reg3, reg4, reg5; 767 v8i16 reg6, reg7, reg8, reg9, reg10, reg11; 768 v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; 769 v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; 770 v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; 771 v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; 772 v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, 773 16, 17, 17, 18, 18, 19, 20, 21}; 774 v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; 775 v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; 776 v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; 777 v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; 778 779 assert((dst_width % 3 == 0) && (dst_width > 0)); 780 781 for (x = 0; x < dst_width; x += 48) { 782 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); 783 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); 784 src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); 785 src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); 786 src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); 787 src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); 788 src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); 789 src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); 790 vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); 791 vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); 792 vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); 793 vec3 = 
(v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); 794 vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); 795 vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); 796 vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); 797 vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); 798 vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); 799 vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); 800 vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); 801 vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); 802 reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); 803 reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); 804 reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); 805 reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); 806 reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); 807 reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); 808 reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); 809 reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); 810 reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); 811 reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); 812 reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); 813 reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); 814 reg0 = __msa_srar_h(reg0, shft0); 815 reg1 = __msa_srar_h(reg1, shft1); 816 reg2 = __msa_srar_h(reg2, shft2); 817 reg3 = __msa_srar_h(reg3, shft0); 818 reg4 = __msa_srar_h(reg4, shft1); 819 reg5 = __msa_srar_h(reg5, shft2); 820 reg6 = __msa_srar_h(reg6, shft0); 821 reg7 = __msa_srar_h(reg7, shft1); 822 reg8 = __msa_srar_h(reg8, shft2); 823 reg9 = __msa_srar_h(reg9, shft0); 824 reg10 = __msa_srar_h(reg10, shft1); 825 reg11 = __msa_srar_h(reg11, shft2); 826 reg0 = reg0 * 3 + reg6; 827 reg1 = reg1 * 3 + reg7; 828 reg2 = reg2 * 3 + reg8; 829 reg3 = reg3 * 3 + reg9; 830 reg4 = reg4 * 3 + reg10; 831 reg5 = reg5 * 3 + reg11; 832 reg0 = __msa_srari_h(reg0, 2); 833 reg1 = __msa_srari_h(reg1, 2); 834 reg2 = __msa_srari_h(reg2, 2); 835 reg3 = __msa_srari_h(reg3, 2); 836 reg4 = __msa_srari_h(reg4, 2); 837 reg5 = __msa_srari_h(reg5, 2); 838 
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); 839 dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); 840 dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); 841 __msa_st_b((v16i8)dst0, d, 0); 842 __msa_st_b((v16i8)dst1, d, 16); 843 __msa_st_b((v16i8)dst2, d, 32); 844 s += 64; 845 t += 64; 846 d += 48; 847 } 848 } 849 850 void ScaleRowDown34_1_Box_MSA(const uint8* src_ptr, 851 ptrdiff_t src_stride, 852 uint8* d, 853 int dst_width) { 854 const uint8* s = src_ptr; 855 const uint8* t = src_ptr + src_stride; 856 int x; 857 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; 858 v16u8 vec0, vec1, vec2, vec3, vec4, vec5; 859 v16u8 vec6, vec7, vec8, vec9, vec10, vec11; 860 v8i16 reg0, reg1, reg2, reg3, reg4, reg5; 861 v8i16 reg6, reg7, reg8, reg9, reg10, reg11; 862 v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; 863 v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; 864 v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; 865 v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; 866 v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, 867 16, 17, 17, 18, 18, 19, 20, 21}; 868 v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; 869 v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; 870 v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; 871 v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; 872 873 assert((dst_width % 3 == 0) && (dst_width > 0)); 874 875 for (x = 0; x < dst_width; x += 48) { 876 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); 877 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); 878 src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); 879 src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); 880 src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); 881 src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); 882 src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); 883 src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); 884 vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); 885 vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); 886 vec2 = 
(v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); 887 vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); 888 vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); 889 vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); 890 vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); 891 vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); 892 vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); 893 vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); 894 vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); 895 vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); 896 reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); 897 reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); 898 reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); 899 reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); 900 reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); 901 reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); 902 reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); 903 reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); 904 reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); 905 reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); 906 reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); 907 reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); 908 reg0 = __msa_srar_h(reg0, shft0); 909 reg1 = __msa_srar_h(reg1, shft1); 910 reg2 = __msa_srar_h(reg2, shft2); 911 reg3 = __msa_srar_h(reg3, shft0); 912 reg4 = __msa_srar_h(reg4, shft1); 913 reg5 = __msa_srar_h(reg5, shft2); 914 reg6 = __msa_srar_h(reg6, shft0); 915 reg7 = __msa_srar_h(reg7, shft1); 916 reg8 = __msa_srar_h(reg8, shft2); 917 reg9 = __msa_srar_h(reg9, shft0); 918 reg10 = __msa_srar_h(reg10, shft1); 919 reg11 = __msa_srar_h(reg11, shft2); 920 reg0 += reg6; 921 reg1 += reg7; 922 reg2 += reg8; 923 reg3 += reg9; 924 reg4 += reg10; 925 reg5 += reg11; 926 reg0 = __msa_srari_h(reg0, 1); 927 reg1 = __msa_srari_h(reg1, 1); 928 reg2 = __msa_srari_h(reg2, 1); 929 reg3 = __msa_srari_h(reg3, 1); 930 reg4 = __msa_srari_h(reg4, 1); 931 reg5 = __msa_srari_h(reg5, 
1); 932 dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); 933 dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); 934 dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); 935 __msa_st_b((v16i8)dst0, d, 0); 936 __msa_st_b((v16i8)dst1, d, 16); 937 __msa_st_b((v16i8)dst2, d, 32); 938 s += 64; 939 t += 64; 940 d += 48; 941 } 942 } 943 548 944 #ifdef __cplusplus 549 945 } // extern "C"
Note: See TracChangeset
for help on using the changeset viewer.