Changeset 5633 for pjproject/trunk/third_party/yuv/source/rotate_neon64.cc
- Timestamp:
- Jul 28, 2017 2:51:44 AM (7 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
pjproject/trunk/third_party/yuv/source/rotate_neon64.cc
r5358 r5633 9 9 */ 10 10 11 #include "libyuv/rotate_row.h" 11 12 #include "libyuv/row.h" 12 #include "libyuv/rotate_row.h"13 13 14 14 #include "libyuv/basic_types.h" … … 22 22 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 23 23 24 static uvec8 kVTbl4x4Transpose = 25 { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; 26 27 void TransposeWx8_NEON(const uint8* src, int src_stride, 28 uint8* dst, int dst_stride, int width) { 24 static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, 25 2, 6, 10, 14, 3, 7, 11, 15}; 26 27 void TransposeWx8_NEON(const uint8* src, 28 int src_stride, 29 uint8* dst, 30 int dst_stride, 31 int width) { 29 32 const uint8* src_temp; 30 int64 width64 = (int64) width; // Work around clang 3.4 warning.31 33 asm volatile ( 32 34 // loops are on blocks of 8. loop will stop when 33 35 // counter gets to or below 0. starting the counter 34 36 // at w-8 allow for this 35 "sub % 3, %3, #8\n"37 "sub %w3, %w3, #8 \n" 36 38 37 39 // handle 8x8 blocks. this should be the majority of the plane … … 39 41 "mov %0, %1 \n" 40 42 41 MEMACCESS(0)42 43 "ld1 {v0.8b}, [%0], %5 \n" 43 MEMACCESS(0)44 44 "ld1 {v1.8b}, [%0], %5 \n" 45 MEMACCESS(0)46 45 "ld1 {v2.8b}, [%0], %5 \n" 47 MEMACCESS(0)48 46 "ld1 {v3.8b}, [%0], %5 \n" 49 MEMACCESS(0)50 47 "ld1 {v4.8b}, [%0], %5 \n" 51 MEMACCESS(0)52 48 "ld1 {v5.8b}, [%0], %5 \n" 53 MEMACCESS(0)54 49 "ld1 {v6.8b}, [%0], %5 \n" 55 MEMACCESS(0)56 50 "ld1 {v7.8b}, [%0] \n" 57 51 … … 85 79 "mov %0, %2 \n" 86 80 87 MEMACCESS(0)88 81 "st1 {v17.8b}, [%0], %6 \n" 89 MEMACCESS(0)90 82 "st1 {v16.8b}, [%0], %6 \n" 91 MEMACCESS(0)92 83 "st1 {v19.8b}, [%0], %6 \n" 93 MEMACCESS(0)94 84 "st1 {v18.8b}, [%0], %6 \n" 95 MEMACCESS(0)96 85 "st1 {v21.8b}, [%0], %6 \n" 97 MEMACCESS(0)98 86 "st1 {v20.8b}, [%0], %6 \n" 99 MEMACCESS(0)100 87 "st1 {v23.8b}, [%0], %6 \n" 101 MEMACCESS(0)102 88 "st1 {v22.8b}, [%0] \n" 103 89 104 90 "add %1, %1, #8 \n" // src += 8 105 91 "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride 106 "subs % 3, %3, #8\n" // w -= 892 "subs %w3, %w3, #8 \n" // w -= 8 107 93 "b.ge 1b \n" 108 94 109 95 // add 8 back to counter. if the result is 0 there are 110 96 // no residuals. 111 "adds % 3, %3, #8\n"97 "adds %w3, %w3, #8 \n" 112 98 "b.eq 4f \n" 113 99 114 100 // some residual, so between 1 and 7 lines left to transpose 115 "cmp % 3, #2 \n"101 "cmp %w3, #2 \n" 116 102 "b.lt 3f \n" 117 103 118 "cmp % 3, #4 \n"104 "cmp %w3, #4 \n" 119 105 "b.lt 2f \n" 120 106 121 107 // 4x8 block 122 108 "mov %0, %1 \n" 123 MEMACCESS(0)124 109 "ld1 {v0.s}[0], [%0], %5 \n" 125 MEMACCESS(0)126 110 "ld1 {v0.s}[1], [%0], %5 \n" 127 MEMACCESS(0)128 111 "ld1 {v0.s}[2], [%0], %5 \n" 129 MEMACCESS(0)130 112 "ld1 {v0.s}[3], [%0], %5 \n" 131 MEMACCESS(0)132 113 "ld1 {v1.s}[0], [%0], %5 \n" 133 MEMACCESS(0)134 114 "ld1 {v1.s}[1], [%0], %5 \n" 135 MEMACCESS(0)136 115 "ld1 {v1.s}[2], [%0], %5 \n" 137 MEMACCESS(0)138 116 "ld1 {v1.s}[3], [%0] \n" 139 117 140 118 "mov %0, %2 \n" 141 119 142 MEMACCESS(4)143 120 "ld1 {v2.16b}, [%4] \n" 144 121 … … 148 125 // TODO(frkoenig): Rework shuffle above to 149 126 // write out with 4 instead of 8 writes. 150 MEMACCESS(0)151 127 "st1 {v3.s}[0], [%0], %6 \n" 152 MEMACCESS(0)153 128 "st1 {v3.s}[1], [%0], %6 \n" 154 MEMACCESS(0)155 129 "st1 {v3.s}[2], [%0], %6 \n" 156 MEMACCESS(0)157 130 "st1 {v3.s}[3], [%0] \n" 158 131 159 132 "add %0, %2, #4 \n" 160 MEMACCESS(0)161 133 "st1 {v0.s}[0], [%0], %6 \n" 162 MEMACCESS(0)163 134 "st1 {v0.s}[1], [%0], %6 \n" 164 MEMACCESS(0)165 135 "st1 {v0.s}[2], [%0], %6 \n" 166 MEMACCESS(0)167 136 "st1 {v0.s}[3], [%0] \n" 168 137 169 138 "add %1, %1, #4 \n" // src += 4 170 139 "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride 171 "subs % 3, %3, #4\n" // w -= 4140 "subs %w3, %w3, #4 \n" // w -= 4 172 141 "b.eq 4f \n" 173 142 174 143 // some residual, check to see if it includes a 2x8 block, 175 144 // or less 176 "cmp % 3, #2\n"145 "cmp %w3, #2 \n" 177 146 "b.lt 3f \n" 178 147 … … 180 149 "2: \n" 181 150 "mov %0, %1 \n" 182 MEMACCESS(0)183 151 "ld1 {v0.h}[0], [%0], %5 \n" 184 MEMACCESS(0)185 152 "ld1 {v1.h}[0], [%0], %5 \n" 186 MEMACCESS(0)187 153 "ld1 {v0.h}[1], [%0], %5 \n" 188 MEMACCESS(0)189 154 "ld1 {v1.h}[1], [%0], %5 \n" 190 MEMACCESS(0)191 155 "ld1 {v0.h}[2], [%0], %5 \n" 192 MEMACCESS(0)193 156 "ld1 {v1.h}[2], [%0], %5 \n" 194 MEMACCESS(0)195 157 "ld1 {v0.h}[3], [%0], %5 \n" 196 MEMACCESS(0)197 158 "ld1 {v1.h}[3], [%0] \n" 198 159 … … 202 163 "mov %0, %2 \n" 203 164 204 MEMACCESS(0)205 165 "st1 {v3.8b}, [%0], %6 \n" 206 MEMACCESS(0)207 166 "st1 {v2.8b}, [%0] \n" 208 167 209 168 "add %1, %1, #2 \n" // src += 2 210 169 "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride 211 "subs % 3, %3, #2\n" // w -= 2170 "subs %w3, %w3, #2 \n" // w -= 2 212 171 "b.eq 4f \n" 213 172 214 173 // 1x8 block 215 174 "3: \n" 216 MEMACCESS(1)217 175 "ld1 {v0.b}[0], [%1], %5 \n" 218 MEMACCESS(1)219 176 "ld1 {v0.b}[1], [%1], %5 \n" 220 MEMACCESS(1)221 177 "ld1 {v0.b}[2], [%1], %5 \n" 222 MEMACCESS(1)223 178 "ld1 {v0.b}[3], [%1], %5 \n" 224 MEMACCESS(1)225 179 "ld1 {v0.b}[4], [%1], %5 \n" 226 MEMACCESS(1)227 180 "ld1 {v0.b}[5], [%1], %5 \n" 228 MEMACCESS(1)229 181 "ld1 {v0.b}[6], [%1], %5 \n" 230 MEMACCESS(1)231 182 "ld1 {v0.b}[7], [%1] \n" 232 183 233 MEMACCESS(2)234 184 "st1 {v0.8b}, [%2] \n" 235 185 … … 239 189 "+r"(src), // %1 240 190 "+r"(dst), // %2 241 "+r"(width 64)// %3191 "+r"(width) // %3 242 192 : "r"(&kVTbl4x4Transpose), // %4 243 193 "r"(static_cast<ptrdiff_t>(src_stride)), // %5 … … 248 198 } 249 199 250 static uint8 kVTbl4x4TransposeDi[32] = 251 { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, 252 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; 253 254 void TransposeUVWx8_NEON(const uint8* src, int src_stride, 255 uint8* dst_a, int dst_stride_a, 256 uint8* dst_b, int dst_stride_b, 200 static uint8 kVTbl4x4TransposeDi[32] = { 201 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, 202 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; 203 204 void TransposeUVWx8_NEON(const uint8* src, 205 int src_stride, 206 uint8* dst_a, 207 int dst_stride_a, 208 uint8* dst_b, 209 int dst_stride_b, 257 210 int width) { 258 211 const uint8* src_temp; 259 int64 width64 = (int64) width; // Work around clang 3.4 warning.260 212 asm volatile ( 261 213 // loops are on blocks of 8. loop will stop when 262 214 // counter gets to or below 0. starting the counter 263 215 // at w-8 allow for this 264 "sub % 4, %4, #8\n"216 "sub %w4, %w4, #8 \n" 265 217 266 218 // handle 8x8 blocks. this should be the majority of the plane … … 268 220 "mov %0, %1 \n" 269 221 270 MEMACCESS(0)271 222 "ld1 {v0.16b}, [%0], %5 \n" 272 MEMACCESS(0)273 223 "ld1 {v1.16b}, [%0], %5 \n" 274 MEMACCESS(0)275 224 "ld1 {v2.16b}, [%0], %5 \n" 276 MEMACCESS(0)277 225 "ld1 {v3.16b}, [%0], %5 \n" 278 MEMACCESS(0)279 226 "ld1 {v4.16b}, [%0], %5 \n" 280 MEMACCESS(0)281 227 "ld1 {v5.16b}, [%0], %5 \n" 282 MEMACCESS(0)283 228 "ld1 {v6.16b}, [%0], %5 \n" 284 MEMACCESS(0)285 229 "ld1 {v7.16b}, [%0] \n" 286 230 … … 314 258 "mov %0, %2 \n" 315 259 316 MEMACCESS(0)317 260 "st1 {v16.d}[0], [%0], %6 \n" 318 MEMACCESS(0)319 261 "st1 {v18.d}[0], [%0], %6 \n" 320 MEMACCESS(0)321 262 "st1 {v17.d}[0], [%0], %6 \n" 322 MEMACCESS(0)323 263 "st1 {v19.d}[0], [%0], %6 \n" 324 MEMACCESS(0)325 264 "st1 {v16.d}[1], [%0], %6 \n" 326 MEMACCESS(0)327 265 "st1 {v18.d}[1], [%0], %6 \n" 328 MEMACCESS(0)329 266 "st1 {v17.d}[1], [%0], %6 \n" 330 MEMACCESS(0)331 267 "st1 {v19.d}[1], [%0] \n" 332 268 333 269 "mov %0, %3 \n" 334 270 335 MEMACCESS(0)336 271 "st1 {v20.d}[0], [%0], %7 \n" 337 MEMACCESS(0)338 272 "st1 {v22.d}[0], [%0], %7 \n" 339 MEMACCESS(0)340 273 "st1 {v21.d}[0], [%0], %7 \n" 341 MEMACCESS(0)342 274 "st1 {v23.d}[0], [%0], %7 \n" 343 MEMACCESS(0)344 275 "st1 {v20.d}[1], [%0], %7 \n" 345 MEMACCESS(0)346 276 "st1 {v22.d}[1], [%0], %7 \n" 347 MEMACCESS(0)348 277 "st1 {v21.d}[1], [%0], %7 \n" 349 MEMACCESS(0)350 278 "st1 {v23.d}[1], [%0] \n" 351 279 … … 353 281 "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a 354 282 "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b 355 "subs % 4, %4, #8\n" // w -= 8283 "subs %w4, %w4, #8 \n" // w -= 8 356 284 "b.ge 1b \n" 357 285 358 286 // add 8 back to counter. if the result is 0 there are 359 287 // no residuals. 360 "adds % 4, %4, #8\n"288 "adds %w4, %w4, #8 \n" 361 289 "b.eq 4f \n" 362 290 363 291 // some residual, so between 1 and 7 lines left to transpose 364 "cmp % 4, #2\n"292 "cmp %w4, #2 \n" 365 293 "b.lt 3f \n" 366 294 367 "cmp % 4, #4\n"295 "cmp %w4, #4 \n" 368 296 "b.lt 2f \n" 369 297 … … 371 299 // 4x8 block 372 300 "mov %0, %1 \n" 373 MEMACCESS(0)374 301 "ld1 {v0.8b}, [%0], %5 \n" 375 MEMACCESS(0)376 302 "ld1 {v1.8b}, [%0], %5 \n" 377 MEMACCESS(0)378 303 "ld1 {v2.8b}, [%0], %5 \n" 379 MEMACCESS(0)380 304 "ld1 {v3.8b}, [%0], %5 \n" 381 MEMACCESS(0)382 305 "ld1 {v4.8b}, [%0], %5 \n" 383 MEMACCESS(0)384 306 "ld1 {v5.8b}, [%0], %5 \n" 385 MEMACCESS(0)386 307 "ld1 {v6.8b}, [%0], %5 \n" 387 MEMACCESS(0)388 308 "ld1 {v7.8b}, [%0] \n" 389 309 390 MEMACCESS(8)391 310 "ld1 {v30.16b}, [%8], #16 \n" 392 311 "ld1 {v31.16b}, [%8] \n" … … 399 318 "mov %0, %2 \n" 400 319 401 MEMACCESS(0)402 320 "st1 {v16.s}[0], [%0], %6 \n" 403 MEMACCESS(0)404 321 "st1 {v16.s}[1], [%0], %6 \n" 405 MEMACCESS(0)406 322 "st1 {v16.s}[2], [%0], %6 \n" 407 MEMACCESS(0)408 323 "st1 {v16.s}[3], [%0], %6 \n" 409 324 410 325 "add %0, %2, #4 \n" 411 MEMACCESS(0)412 326 "st1 {v18.s}[0], [%0], %6 \n" 413 MEMACCESS(0)414 327 "st1 {v18.s}[1], [%0], %6 \n" 415 MEMACCESS(0)416 328 "st1 {v18.s}[2], [%0], %6 \n" 417 MEMACCESS(0)418 329 "st1 {v18.s}[3], [%0] \n" 419 330 420 331 "mov %0, %3 \n" 421 332 422 MEMACCESS(0)423 333 "st1 {v17.s}[0], [%0], %7 \n" 424 MEMACCESS(0)425 334 "st1 {v17.s}[1], [%0], %7 \n" 426 MEMACCESS(0)427 335 "st1 {v17.s}[2], [%0], %7 \n" 428 MEMACCESS(0)429 336 "st1 {v17.s}[3], [%0], %7 \n" 430 337 431 338 "add %0, %3, #4 \n" 432 MEMACCESS(0)433 339 "st1 {v19.s}[0], [%0], %7 \n" 434 MEMACCESS(0)435 340 "st1 {v19.s}[1], [%0], %7 \n" 436 MEMACCESS(0)437 341 "st1 {v19.s}[2], [%0], %7 \n" 438 MEMACCESS(0)439 342 "st1 {v19.s}[3], [%0] \n" 440 343 … … 442 345 "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a 443 346 "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b 444 "subs % 4, %4, #4\n" // w -= 4347 "subs %w4, %w4, #4 \n" // w -= 4 445 348 "b.eq 4f \n" 446 349 447 350 // some residual, check to see if it includes a 2x8 block, 448 351 // or less 449 "cmp % 4, #2\n"352 "cmp %w4, #2 \n" 450 353 "b.lt 3f \n" 451 354 … … 453 356 "2: \n" 454 357 "mov %0, %1 \n" 455 MEMACCESS(0)456 358 "ld2 {v0.h, v1.h}[0], [%0], %5 \n" 457 MEMACCESS(0)458 359 "ld2 {v2.h, v3.h}[0], [%0], %5 \n" 459 MEMACCESS(0)460 360 "ld2 {v0.h, v1.h}[1], [%0], %5 \n" 461 MEMACCESS(0)462 361 "ld2 {v2.h, v3.h}[1], [%0], %5 \n" 463 MEMACCESS(0)464 362 "ld2 {v0.h, v1.h}[2], [%0], %5 \n" 465 MEMACCESS(0)466 363 "ld2 {v2.h, v3.h}[2], [%0], %5 \n" 467 MEMACCESS(0)468 364 "ld2 {v0.h, v1.h}[3], [%0], %5 \n" 469 MEMACCESS(0)470 365 "ld2 {v2.h, v3.h}[3], [%0] \n" 471 366 … … 477 372 "mov %0, %2 \n" 478 373 479 MEMACCESS(0)480 374 "st1 {v4.d}[0], [%0], %6 \n" 481 MEMACCESS(0)482 375 "st1 {v6.d}[0], [%0] \n" 483 376 484 377 "mov %0, %3 \n" 485 378 486 MEMACCESS(0)487 379 "st1 {v5.d}[0], [%0], %7 \n" 488 MEMACCESS(0)489 380 "st1 {v7.d}[0], [%0] \n" 490 381 … … 492 383 "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a 493 384 "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b 494 "subs % 4, %4, #2\n" // w -= 2385 "subs %w4, %w4, #2 \n" // w -= 2 495 386 "b.eq 4f \n" 496 387 497 388 // 1x8 block 498 389 "3: \n" 499 MEMACCESS(1)500 390 "ld2 {v0.b, v1.b}[0], [%1], %5 \n" 501 MEMACCESS(1)502 391 "ld2 {v0.b, v1.b}[1], [%1], %5 \n" 503 MEMACCESS(1)504 392 "ld2 {v0.b, v1.b}[2], [%1], %5 \n" 505 MEMACCESS(1)506 393 "ld2 {v0.b, v1.b}[3], [%1], %5 \n" 507 MEMACCESS(1)508 394 "ld2 {v0.b, v1.b}[4], [%1], %5 \n" 509 MEMACCESS(1)510 395 "ld2 {v0.b, v1.b}[5], [%1], %5 \n" 511 MEMACCESS(1)512 396 "ld2 {v0.b, v1.b}[6], [%1], %5 \n" 513 MEMACCESS(1)514 397 "ld2 {v0.b, v1.b}[7], [%1] \n" 515 398 516 MEMACCESS(2)517 399 "st1 {v0.d}[0], [%2] \n" 518 MEMACCESS(3)519 400 "st1 {v1.d}[0], [%3] \n" 520 401 … … 525 406 "+r"(dst_a), // %2 526 407 "+r"(dst_b), // %3 527 "+r"(width 64)// %4408 "+r"(width) // %4 528 409 : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 529 410 "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
Note: See TracChangeset
for help on using the changeset viewer.