Changeset 5633 for pjproject/trunk/third_party/yuv/source/scale_neon64.cc
- Timestamp:
- Jul 28, 2017 2:51:44 AM (7 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
pjproject/trunk/third_party/yuv/source/scale_neon64.cc
r5358 r5633 9 9 */ 10 10 11 #include "libyuv/row.h" 11 12 #include "libyuv/scale.h" 12 #include "libyuv/row.h"13 13 #include "libyuv/scale_row.h" 14 14 … … 22 22 23 23 // Read 32x1 throw away even pixels, and write 16x1. 24 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 25 uint8* dst, int dst_width) { 24 void ScaleRowDown2_NEON(const uint8* src_ptr, 25 ptrdiff_t src_stride, 26 uint8* dst, 27 int dst_width) { 28 (void)src_stride; 26 29 asm volatile ( 27 30 "1: \n" 28 31 // load even pixels into v0, odd into v1 29 MEMACCESS(0)30 32 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" 31 33 "subs %w2, %w2, #16 \n" // 16 processed per loop 32 MEMACCESS(1)33 34 "st1 {v1.16b}, [%1], #16 \n" // store odd pixels 34 35 "b.gt 1b \n" … … 42 43 43 44 // Read 32x1 average down and write 16x1. 44 void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 45 uint8* dst, int dst_width) { 46 asm volatile ( 47 "1: \n" 48 MEMACCESS(0) 45 void ScaleRowDown2Linear_NEON(const uint8* src_ptr, 46 ptrdiff_t src_stride, 47 uint8* dst, 48 int dst_width) { 49 (void)src_stride; 50 asm volatile ( 51 "1: \n" 49 52 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc 50 53 "subs %w2, %w2, #16 \n" // 16 processed per loop … … 53 56 "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack 54 57 "rshrn2 v0.16b, v1.8h, #1 \n" 55 MEMACCESS(1)56 58 "st1 {v0.16b}, [%1], #16 \n" 57 59 "b.gt 1b \n" … … 65 67 66 68 // Read 32x2 average down and write 16x1. 67 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 68 uint8* dst, int dst_width) { 69 void ScaleRowDown2Box_NEON(const uint8* src_ptr, 70 ptrdiff_t src_stride, 71 uint8* dst, 72 int dst_width) { 69 73 asm volatile ( 70 74 // change the stride to row 2 pointer 71 75 "add %1, %1, %0 \n" 72 76 "1: \n" 73 MEMACCESS(0) 74 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc 75 MEMACCESS(1) 77 "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc 76 78 "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc 77 79 "subs %w3, %w3, #16 \n" // 16 processed per loop … … 82 84 "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack 83 85 "rshrn2 v0.16b, v1.8h, #2 \n" 84 MEMACCESS(2)85 86 "st1 {v0.16b}, [%2], #16 \n" 86 87 "b.gt 1b \n" … … 94 95 } 95 96 96 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 97 uint8* dst_ptr, int dst_width) { 98 asm volatile ( 99 "1: \n" 100 MEMACCESS(0) 97 void ScaleRowDown4_NEON(const uint8* src_ptr, 98 ptrdiff_t src_stride, 99 uint8* dst_ptr, 100 int dst_width) { 101 (void)src_stride; 102 asm volatile ( 103 "1: \n" 101 104 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 102 105 "subs %w2, %w2, #8 \n" // 8 processed per loop 103 MEMACCESS(1)104 106 "st1 {v2.8b}, [%1], #8 \n" 105 107 "b.gt 1b \n" … … 112 114 } 113 115 114 void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 115 uint8* dst_ptr, int dst_width) { 116 void ScaleRowDown4Box_NEON(const uint8* src_ptr, 117 ptrdiff_t src_stride, 118 uint8* dst_ptr, 119 int dst_width) { 116 120 const uint8* src_ptr1 = src_ptr + src_stride; 117 121 const uint8* src_ptr2 = src_ptr + src_stride * 2; 118 122 const uint8* src_ptr3 = src_ptr + src_stride * 3; 119 asm volatile ( 120 "1: \n" 121 MEMACCESS(0) 123 asm volatile ( 124 "1: \n" 122 125 "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 123 MEMACCESS(3)124 126 "ld1 {v1.16b}, [%2], #16 \n" 125 MEMACCESS(4)126 127 "ld1 {v2.16b}, [%3], #16 \n" 127 MEMACCESS(5)128 128 "ld1 {v3.16b}, [%4], #16 \n" 129 129 "subs %w5, %w5, #4 \n" … … 134 134 "addp v0.8h, v0.8h, v0.8h \n" 135 135 "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding 136 MEMACCESS(1)137 136 "st1 {v0.s}[0], [%1], #4 \n" 138 137 "b.gt 1b \n" … … 153 152 void ScaleRowDown34_NEON(const uint8* src_ptr, 154 153 ptrdiff_t src_stride, 155 uint8* dst_ptr, int dst_width) { 154 uint8* dst_ptr, 155 int dst_width) { 156 (void)src_stride; 156 157 asm volatile ( 157 158 "1: \n" 158 MEMACCESS(0) 159 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 159 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 160 160 "subs %w2, %w2, #24 \n" 161 161 "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 162 MEMACCESS(1) 163 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 162 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 164 163 "b.gt 1b \n" 165 164 : "+r"(src_ptr), // %0 … … 173 172 void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, 174 173 ptrdiff_t src_stride, 175 uint8* dst_ptr, int dst_width) { 174 uint8* dst_ptr, 175 int dst_width) { 176 176 asm volatile ( 177 177 "movi v20.8b, #3 \n" 178 178 "add %3, %3, %0 \n" 179 179 "1: \n" 180 MEMACCESS(0) 181 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 182 MEMACCESS(3) 183 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 180 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 181 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 184 182 "subs %w2, %w2, #24 \n" 185 183 … … 217 215 "uqrshrn v2.8b, v16.8h, #2 \n" 218 216 219 MEMACCESS(1) 220 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 217 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 221 218 222 219 "b.gt 1b \n" … … 233 230 void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, 234 231 ptrdiff_t src_stride, 235 uint8* dst_ptr, int dst_width) { 232 uint8* dst_ptr, 233 int dst_width) { 236 234 asm volatile ( 237 235 "movi v20.8b, #3 \n" 238 236 "add %3, %3, %0 \n" 239 237 "1: \n" 240 MEMACCESS(0) 241 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 242 MEMACCESS(3) 243 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 238 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 239 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 244 240 "subs %w2, %w2, #24 \n" 245 241 // average src line 0 with src line 1 … … 262 258 "uqrshrn v2.8b, v4.8h, #2 \n" 263 259 264 MEMACCESS(1) 265 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 260 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 266 261 "b.gt 1b \n" 267 262 : "+r"(src_ptr), // %0 … … 274 269 } 275 270 276 static uvec8 kShuf38 = 277 { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; 278 static uvec8 kShuf38_2 = 279 { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 }; 280 static vec16 kMult38_Div6 = 281 { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 282 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; 283 static vec16 kMult38_Div9 = 284 { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 285 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; 271 static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; 272 static uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20, 273 34, 6, 22, 35, 0, 0, 0, 0}; 274 static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 275 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12}; 276 static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 277 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18}; 286 278 287 279 // 32 -> 12 288 280 void ScaleRowDown38_NEON(const uint8* src_ptr, 289 281 ptrdiff_t src_stride, 290 uint8* dst_ptr, int dst_width) { 291 asm volatile ( 292 MEMACCESS(3) 282 uint8* dst_ptr, 283 int dst_width) { 284 (void)src_stride; 285 asm volatile ( 293 286 "ld1 {v3.16b}, [%3] \n" 294 287 "1: \n" 295 MEMACCESS(0) 296 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" 288 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" 297 289 "subs %w2, %w2, #12 \n" 298 "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" 299 MEMACCESS(1) 290 "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" 300 291 "st1 {v2.8b}, [%1], #8 \n" 301 MEMACCESS(1)302 292 "st1 {v2.s}[2], [%1], #4 \n" 303 293 "b.gt 1b \n" … … 313 303 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, 314 304 ptrdiff_t src_stride, 315 uint8* dst_ptr, int dst_width) { 305 uint8* dst_ptr, 306 int dst_width) { 316 307 const uint8* src_ptr1 = src_ptr + src_stride * 2; 317 308 ptrdiff_t tmp_src_stride = src_stride; 318 309 319 310 asm volatile ( 320 MEMACCESS(5)321 311 "ld1 {v29.8h}, [%5] \n" 322 MEMACCESS(6)323 312 "ld1 {v30.16b}, [%6] \n" 324 MEMACCESS(7)325 313 "ld1 {v31.8h}, [%7] \n" 326 314 "add %2, %2, %0 \n" … … 331 319 // 20 60 21 61 22 62 23 63 332 320 // 30 70 31 71 32 72 33 73 333 MEMACCESS(0) 334 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" 335 MEMACCESS(3) 336 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" 337 MEMACCESS(4) 338 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" 321 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" 322 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" 323 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" 339 324 "subs %w4, %w4, #12 \n" 340 325 … … 420 405 "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" 421 406 422 MEMACCESS(1)423 407 "st1 {v3.8b}, [%1], #8 \n" 424 MEMACCESS(1)425 408 "st1 {v3.s}[2], [%1], #4 \n" 426 409 "b.gt 1b \n" … … 442 425 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, 443 426 ptrdiff_t src_stride, 444 uint8* dst_ptr, int dst_width) { 427 uint8* dst_ptr, 428 int dst_width) { 445 429 // TODO(fbarchard): use src_stride directly for clang 3.5+. 446 430 ptrdiff_t tmp_src_stride = src_stride; 447 431 asm volatile ( 448 MEMACCESS(4)449 432 "ld1 {v30.8h}, [%4] \n" 450 MEMACCESS(5)451 433 "ld1 {v31.16b}, [%5] \n" 452 434 "add %2, %2, %0 \n" … … 457 439 // 20 60 21 61 22 62 23 63 458 440 // 30 70 31 71 32 72 33 73 459 MEMACCESS(0) 460 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" 461 MEMACCESS(3) 462 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" 441 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" 442 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" 463 443 "subs %w3, %w3, #12 \n" 464 444 … … 530 510 "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" 531 511 532 MEMACCESS(1)533 512 "st1 {v3.8b}, [%1], #8 \n" 534 MEMACCESS(1)535 513 "st1 {v3.s}[2], [%1], #4 \n" 536 514 "b.gt 1b \n" … … 546 524 } 547 525 548 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 549 uint16* dst_ptr, int src_width, int src_height) { 526 void ScaleAddRows_NEON(const uint8* src_ptr, 527 ptrdiff_t src_stride, 528 uint16* dst_ptr, 529 int src_width, 530 int src_height) { 550 531 const uint8* src_tmp; 551 532 asm volatile ( … … 557 538 "2: \n" 558 539 // load 16 pixels into q0 559 MEMACCESS(0)560 540 "ld1 {v0.16b}, [%0], %3 \n" 561 541 "uaddw2 v3.8h, v3.8h, v0.16b \n" … … 563 543 "subs w12, w12, #1 \n" 564 544 "b.gt 2b \n" 565 MEMACCESS(2)566 545 "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels 567 546 "add %1, %1, #16 \n" … … 579 558 } 580 559 560 // clang-format off 581 561 // TODO(Yang Zhang): Investigate less load instructions for 582 562 // the x/dx stepping 583 #define LOAD2_DATA8_LANE(n) \ 584 "lsr %5, %3, #16 \n" \ 585 "add %6, %1, %5 \n" \ 586 "add %3, %3, %4 \n" \ 587 MEMACCESS(6) \ 588 "ld2 {v4.b, v5.b}["#n"], [%6] \n" 589 590 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, 591 int dst_width, int x, int dx) { 563 #define LOAD2_DATA8_LANE(n) \ 564 "lsr %5, %3, #16 \n" \ 565 "add %6, %1, %5 \n" \ 566 "add %3, %3, %4 \n" \ 567 "ld2 {v4.b, v5.b}[" #n "], [%6] \n" 568 // clang-format on 569 570 // The NEON version mimics this formula (from row_common.cc): 571 // #define BLENDER(a, b, f) (uint8)((int)(a) + 572 // ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) 573 574 void ScaleFilterCols_NEON(uint8* dst_ptr, 575 const uint8* src_ptr, 576 int dst_width, 577 int x, 578 int dx) { 592 579 int dx_offset[4] = {0, 1, 2, 3}; 593 580 int* tmp = dx_offset; 594 581 const uint8* src_tmp = src_ptr; 595 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. 596 int64 x64 = (int64) x; 597 int64 dx64 = (int64) dx; 582 int64 x64 = (int64)x; 583 int64 dx64 = (int64)dx; 598 584 asm volatile ( 599 585 "dup v0.4s, %w3 \n" // x … … 627 613 "mul v16.4s, v16.4s, v7.4s \n" 628 614 "mul v17.4s, v17.4s, v6.4s \n" 629 " shrnv6.4h, v16.4s, #16 \n"630 " shrn2v6.8h, v17.4s, #16 \n"615 "rshrn v6.4h, v16.4s, #16 \n" 616 "rshrn2 v6.8h, v17.4s, #16 \n" 631 617 "add v4.8h, v4.8h, v6.8h \n" 632 618 "xtn v4.8b, v4.8h \n" 633 619 634 MEMACCESS(0)635 620 "st1 {v4.8b}, [%0], #8 \n" // store pixels 636 621 "add v1.4s, v1.4s, v0.4s \n" … … 640 625 : "+r"(dst_ptr), // %0 641 626 "+r"(src_ptr), // %1 642 "+r"(dst_width 64),// %2627 "+r"(dst_width), // %2 643 628 "+r"(x64), // %3 644 629 "+r"(dx64), // %4 … … 655 640 // 16x2 -> 16x1 656 641 void ScaleFilterRows_NEON(uint8* dst_ptr, 657 const uint8* src_ptr, ptrdiff_t src_stride, 658 int dst_width, int source_y_fraction) { 659 int y_fraction = 256 - source_y_fraction; 642 const uint8* src_ptr, 643 ptrdiff_t src_stride, 644 int dst_width, 645 int source_y_fraction) { 646 int y_fraction = 256 - source_y_fraction; 660 647 asm volatile ( 661 648 "cmp %w4, #0 \n" … … 673 660 // General purpose row blend. 674 661 "1: \n" 675 MEMACCESS(1)676 662 "ld1 {v0.16b}, [%1], #16 \n" 677 MEMACCESS(2)678 663 "ld1 {v1.16b}, [%2], #16 \n" 679 664 "subs %w3, %w3, #16 \n" … … 684 669 "rshrn v0.8b, v6.8h, #8 \n" 685 670 "rshrn2 v0.16b, v7.8h, #8 \n" 686 MEMACCESS(0)687 671 "st1 {v0.16b}, [%0], #16 \n" 688 672 "b.gt 1b \n" … … 691 675 // Blend 25 / 75. 692 676 "25: \n" 693 MEMACCESS(1)694 677 "ld1 {v0.16b}, [%1], #16 \n" 695 MEMACCESS(2)696 678 "ld1 {v1.16b}, [%2], #16 \n" 697 679 "subs %w3, %w3, #16 \n" 698 680 "urhadd v0.16b, v0.16b, v1.16b \n" 699 681 "urhadd v0.16b, v0.16b, v1.16b \n" 700 MEMACCESS(0)701 682 "st1 {v0.16b}, [%0], #16 \n" 702 683 "b.gt 25b \n" … … 705 686 // Blend 50 / 50. 706 687 "50: \n" 707 MEMACCESS(1)708 688 "ld1 {v0.16b}, [%1], #16 \n" 709 MEMACCESS(2)710 689 "ld1 {v1.16b}, [%2], #16 \n" 711 690 "subs %w3, %w3, #16 \n" 712 691 "urhadd v0.16b, v0.16b, v1.16b \n" 713 MEMACCESS(0)714 692 "st1 {v0.16b}, [%0], #16 \n" 715 693 "b.gt 50b \n" … … 718 696 // Blend 75 / 25. 719 697 "75: \n" 720 MEMACCESS(1)721 698 "ld1 {v1.16b}, [%1], #16 \n" 722 MEMACCESS(2)723 699 "ld1 {v0.16b}, [%2], #16 \n" 724 700 "subs %w3, %w3, #16 \n" 725 701 "urhadd v0.16b, v0.16b, v1.16b \n" 726 702 "urhadd v0.16b, v0.16b, v1.16b \n" 727 MEMACCESS(0)728 703 "st1 {v0.16b}, [%0], #16 \n" 729 704 "b.gt 75b \n" … … 732 707 // Blend 100 / 0 - Copy row unchanged. 733 708 "100: \n" 734 MEMACCESS(1)735 709 "ld1 {v0.16b}, [%1], #16 \n" 736 710 "subs %w3, %w3, #16 \n" 737 MEMACCESS(0)738 711 "st1 {v0.16b}, [%0], #16 \n" 739 712 "b.gt 100b \n" 740 713 741 714 "99: \n" 742 MEMACCESS(0)743 715 "st1 {v0.b}[15], [%0] \n" 744 716 : "+r"(dst_ptr), // %0 … … 753 725 } 754 726 755 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 756 uint8* dst, int dst_width) { 727 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, 728 ptrdiff_t src_stride, 729 uint8* dst, 730 int dst_width) { 731 (void)src_stride; 757 732 asm volatile ( 758 733 "1: \n" 759 734 // load even pixels into q0, odd into q1 760 MEMACCESS (0)761 735 "ld2 {v0.4s, v1.4s}, [%0], #32 \n" 762 MEMACCESS (0)763 736 "ld2 {v2.4s, v3.4s}, [%0], #32 \n" 764 737 "subs %w2, %w2, #8 \n" // 8 processed per loop 765 MEMACCESS (1)766 738 "st1 {v1.16b}, [%1], #16 \n" // store odd pixels 767 MEMACCESS (1)768 739 "st1 {v3.16b}, [%1], #16 \n" 769 740 "b.gt 1b \n" … … 776 747 } 777 748 778 void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, 779 uint8* dst_argb, int dst_width) { 780 asm volatile ( 781 "1: \n" 782 MEMACCESS (0) 749 void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, 750 ptrdiff_t src_stride, 751 uint8* dst_argb, 752 int dst_width) { 753 (void)src_stride; 754 asm volatile ( 755 "1: \n" 783 756 // load 8 ARGB pixels. 784 757 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" … … 792 765 "rshrn v2.8b, v2.8h, #1 \n" 793 766 "rshrn v3.8b, v3.8h, #1 \n" 794 MEMACCESS (1)795 767 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" 796 768 "b.gt 1b \n" … … 803 775 } 804 776 805 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 806 uint8* dst, int dst_width) { 777 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, 778 ptrdiff_t src_stride, 779 uint8* dst, 780 int dst_width) { 807 781 asm volatile ( 808 782 // change the stride to row 2 pointer 809 783 "add %1, %1, %0 \n" 810 784 "1: \n" 811 MEMACCESS (0)812 785 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. 813 786 "subs %w3, %w3, #8 \n" // 8 processed per loop. … … 816 789 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 817 790 "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. 818 MEMACCESS (1)819 791 "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. 820 792 "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. … … 826 798 "rshrn v2.8b, v2.8h, #2 \n" 827 799 "rshrn v3.8b, v3.8h, #2 \n" 828 MEMACCESS (2)829 800 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" 830 801 "b.gt 1b \n" … … 840 811 // Reads 4 pixels at a time. 841 812 // Alignment requirement: src_argb 4 byte aligned. 842 void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, 843 int src_stepx, uint8* dst_argb, int dst_width) { 844 asm volatile ( 845 "1: \n" 846 MEMACCESS(0) 813 void ScaleARGBRowDownEven_NEON(const uint8* src_argb, 814 ptrdiff_t src_stride, 815 int src_stepx, 816 uint8* dst_argb, 817 int dst_width) { 818 (void)src_stride; 819 asm volatile ( 820 "1: \n" 847 821 "ld1 {v0.s}[0], [%0], %3 \n" 848 MEMACCESS(0)849 822 "ld1 {v0.s}[1], [%0], %3 \n" 850 MEMACCESS(0)851 823 "ld1 {v0.s}[2], [%0], %3 \n" 852 MEMACCESS(0)853 824 "ld1 {v0.s}[3], [%0], %3 \n" 854 825 "subs %w2, %w2, #4 \n" // 4 pixels per loop. 855 MEMACCESS(1)856 826 "st1 {v0.16b}, [%1], #16 \n" 857 827 "b.gt 1b \n" … … 868 838 // TODO(Yang Zhang): Might be worth another optimization pass in future. 869 839 // It could be upgraded to 8 pixels at a time to start with. 870 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, 840 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, 841 ptrdiff_t src_stride, 871 842 int src_stepx, 872 uint8* dst_argb, int dst_width) { 843 uint8* dst_argb, 844 int dst_width) { 873 845 asm volatile ( 874 846 "add %1, %1, %0 \n" 875 847 "1: \n" 876 MEMACCESS(0)877 848 "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 878 MEMACCESS(1)879 849 "ld1 {v1.8b}, [%1], %4 \n" 880 MEMACCESS(0)881 850 "ld1 {v2.8b}, [%0], %4 \n" 882 MEMACCESS(1)883 851 "ld1 {v3.8b}, [%1], %4 \n" 884 MEMACCESS(0)885 852 "ld1 {v4.8b}, [%0], %4 \n" 886 MEMACCESS(1)887 853 "ld1 {v5.8b}, [%1], %4 \n" 888 MEMACCESS(0)889 854 "ld1 {v6.8b}, [%0], %4 \n" 890 MEMACCESS(1)891 855 "ld1 {v7.8b}, [%1], %4 \n" 892 856 "uaddl v0.8h, v0.8b, v1.8b \n" … … 905 869 "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. 906 870 "subs %w3, %w3, #4 \n" // 4 pixels per loop. 907 MEMACCESS(2)908 871 "st1 {v0.16b}, [%2], #16 \n" 909 872 "b.gt 1b \n" … … 917 880 } 918 881 882 // clang-format off 919 883 // TODO(Yang Zhang): Investigate less load instructions for 920 884 // the x/dx stepping 921 #define LOAD1_DATA32_LANE(vn, n) \ 922 "lsr %5, %3, #16 \n" \ 923 "add %6, %1, %5, lsl #2 \n" \ 924 "add %3, %3, %4 \n" \ 925 MEMACCESS(6) \ 926 "ld1 {"#vn".s}["#n"], [%6] \n" 927 928 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, 929 int dst_width, int x, int dx) { 885 #define LOAD1_DATA32_LANE(vn, n) \ 886 "lsr %5, %3, #16 \n" \ 887 "add %6, %1, %5, lsl #2 \n" \ 888 "add %3, %3, %4 \n" \ 889 "ld1 {" #vn ".s}[" #n "], [%6] \n" 890 // clang-format on 891 892 void ScaleARGBCols_NEON(uint8* dst_argb, 893 const uint8* src_argb, 894 int dst_width, 895 int x, 896 int dx) { 930 897 const uint8* src_tmp = src_argb; 931 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. 932 int64 x64 = (int64) x; 933 int64 dx64 = (int64) dx; 898 int64 x64 = (int64)x; 899 int64 dx64 = (int64)dx; 934 900 int64 tmp64; 935 901 asm volatile ( … … 944 910 LOAD1_DATA32_LANE(v1, 3) 945 911 946 MEMACCESS(0)947 912 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels 948 913 "subs %w2, %w2, #8 \n" // 8 processed per loop 949 "b.gt 1b\n"914 "b.gt 1b \n" 950 915 : "+r"(dst_argb), // %0 951 916 "+r"(src_argb), // %1 952 "+r"(dst_width 64),// %2917 "+r"(dst_width), // %2 953 918 "+r"(x64), // %3 954 919 "+r"(dx64), // %4 … … 962 927 #undef LOAD1_DATA32_LANE 963 928 929 // clang-format off 964 930 // TODO(Yang Zhang): Investigate less load instructions for 965 931 // the x/dx stepping 966 #define LOAD2_DATA32_LANE(vn1, vn2, n) \ 967 "lsr %5, %3, #16 \n" \ 968 "add %6, %1, %5, lsl #2 \n" \ 969 "add %3, %3, %4 \n" \ 970 MEMACCESS(6) \ 971 "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n" 972 973 void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, 974 int dst_width, int x, int dx) { 932 #define LOAD2_DATA32_LANE(vn1, vn2, n) \ 933 "lsr %5, %3, #16 \n" \ 934 "add %6, %1, %5, lsl #2 \n" \ 935 "add %3, %3, %4 \n" \ 936 "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" 937 // clang-format on 938 939 void ScaleARGBFilterCols_NEON(uint8* dst_argb, 940 const uint8* src_argb, 941 int dst_width, 942 int x, 943 int dx) { 975 944 int dx_offset[4] = {0, 1, 2, 3}; 976 945 int* tmp = dx_offset; 977 946 const uint8* src_tmp = src_argb; 978 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. 979 int64 x64 = (int64) x; 980 int64 dx64 = (int64) dx; 947 int64 x64 = (int64)x; 948 int64 dx64 = (int64)dx; 981 949 asm volatile ( 982 950 "dup v0.4s, %w3 \n" // x … … 1015 983 "shrn2 v0.16b, v17.8h, #7 \n" 1016 984 1017 MEMACCESS(0)1018 985 "st1 {v0.4s}, [%0], #16 \n" // store pixels 1019 986 "add v5.4s, v5.4s, v6.4s \n" … … 1022 989 : "+r"(dst_argb), // %0 1023 990 "+r"(src_argb), // %1 1024 "+r"(dst_width 64),// %2991 "+r"(dst_width), // %2 1025 992 "+r"(x64), // %3 1026 993 "+r"(dx64), // %4
Note: See TracChangeset
for help on using the changeset viewer.