Changeset 5699 for pjproject/trunk/third_party/yuv/source/rotate_neon64.cc
- Timestamp: Nov 21, 2017 9:25:11 AM (6 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
pjproject/trunk/third_party/yuv/source/rotate_neon64.cc
r5633 r5699 31 31 int width) { 32 32 const uint8* src_temp; 33 asm volatile 34 // loops are on blocks of 8. loop will stop when35 // counter gets to or below 0. starting the counter36 // at w-8 allow for this37 "sub %w3, %w3, #8 \n"38 39 // handle 8x8 blocks. this should be the majority of the plane40 "1: \n"33 asm volatile( 34 // loops are on blocks of 8. loop will stop when 35 // counter gets to or below 0. starting the counter 36 // at w-8 allow for this 37 "sub %w3, %w3, #8 \n" 38 39 // handle 8x8 blocks. this should be the majority of the plane 40 "1: \n" 41 41 "mov %0, %1 \n" 42 42 … … 93 93 "b.ge 1b \n" 94 94 95 // add 8 back to counter. if the result is 0 there are 96 // no residuals. 97 "adds %w3, %w3, #8 \n" 98 "b.eq 4f \n" 99 100 // some residual, so between 1 and 7 lines left to transpose 101 "cmp %w3, #2 \n" 102 "b.lt 3f \n" 103 104 "cmp %w3, #4 \n" 105 "b.lt 2f \n" 106 107 // 4x8 block 108 "mov %0, %1 \n" 109 "ld1 {v0.s}[0], [%0], %5 \n" 110 "ld1 {v0.s}[1], [%0], %5 \n" 111 "ld1 {v0.s}[2], [%0], %5 \n" 112 "ld1 {v0.s}[3], [%0], %5 \n" 113 "ld1 {v1.s}[0], [%0], %5 \n" 114 "ld1 {v1.s}[1], [%0], %5 \n" 115 "ld1 {v1.s}[2], [%0], %5 \n" 116 "ld1 {v1.s}[3], [%0] \n" 117 118 "mov %0, %2 \n" 119 120 "ld1 {v2.16b}, [%4] \n" 121 122 "tbl v3.16b, {v0.16b}, v2.16b \n" 123 "tbl v0.16b, {v1.16b}, v2.16b \n" 124 125 // TODO(frkoenig): Rework shuffle above to 126 // write out with 4 instead of 8 writes. 
127 "st1 {v3.s}[0], [%0], %6 \n" 128 "st1 {v3.s}[1], [%0], %6 \n" 129 "st1 {v3.s}[2], [%0], %6 \n" 130 "st1 {v3.s}[3], [%0] \n" 131 132 "add %0, %2, #4 \n" 133 "st1 {v0.s}[0], [%0], %6 \n" 134 "st1 {v0.s}[1], [%0], %6 \n" 135 "st1 {v0.s}[2], [%0], %6 \n" 136 "st1 {v0.s}[3], [%0] \n" 137 138 "add %1, %1, #4 \n" // src += 4 139 "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride 140 "subs %w3, %w3, #4 \n" // w -= 4 141 "b.eq 4f \n" 142 143 // some residual, check to see if it includes a 2x8 block, 144 // or less 145 "cmp %w3, #2 \n" 146 "b.lt 3f \n" 147 148 // 2x8 block 149 "2: \n" 150 "mov %0, %1 \n" 151 "ld1 {v0.h}[0], [%0], %5 \n" 152 "ld1 {v1.h}[0], [%0], %5 \n" 153 "ld1 {v0.h}[1], [%0], %5 \n" 154 "ld1 {v1.h}[1], [%0], %5 \n" 155 "ld1 {v0.h}[2], [%0], %5 \n" 156 "ld1 {v1.h}[2], [%0], %5 \n" 157 "ld1 {v0.h}[3], [%0], %5 \n" 158 "ld1 {v1.h}[3], [%0] \n" 159 160 "trn2 v2.8b, v0.8b, v1.8b \n" 161 "trn1 v3.8b, v0.8b, v1.8b \n" 162 163 "mov %0, %2 \n" 164 165 "st1 {v3.8b}, [%0], %6 \n" 166 "st1 {v2.8b}, [%0] \n" 167 168 "add %1, %1, #2 \n" // src += 2 169 "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride 170 "subs %w3, %w3, #2 \n" // w -= 2 171 "b.eq 4f \n" 172 173 // 1x8 block 174 "3: \n" 175 "ld1 {v0.b}[0], [%1], %5 \n" 176 "ld1 {v0.b}[1], [%1], %5 \n" 177 "ld1 {v0.b}[2], [%1], %5 \n" 178 "ld1 {v0.b}[3], [%1], %5 \n" 179 "ld1 {v0.b}[4], [%1], %5 \n" 180 "ld1 {v0.b}[5], [%1], %5 \n" 181 "ld1 {v0.b}[6], [%1], %5 \n" 182 "ld1 {v0.b}[7], [%1] \n" 183 184 "st1 {v0.8b}, [%2] \n" 185 186 "4: \n" 187 188 : "=&r"(src_temp), // %0 189 "+r"(src), // %1 190 "+r"(dst), // %2 191 "+r"(width) // %3 192 : "r"(&kVTbl4x4Transpose), // %4 193 "r"(static_cast<ptrdiff_t>(src_stride)), // %5 194 "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 195 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", 196 "v17", "v18", "v19", "v20", "v21", "v22", "v23" 197 ); 95 // add 8 back to counter. if the result is 0 there are 96 // no residuals. 
97 "adds %w3, %w3, #8 \n" 98 "b.eq 4f \n" 99 100 // some residual, so between 1 and 7 lines left to transpose 101 "cmp %w3, #2 \n" 102 "b.lt 3f \n" 103 104 "cmp %w3, #4 \n" 105 "b.lt 2f \n" 106 107 // 4x8 block 108 "mov %0, %1 \n" 109 "ld1 {v0.s}[0], [%0], %5 \n" 110 "ld1 {v0.s}[1], [%0], %5 \n" 111 "ld1 {v0.s}[2], [%0], %5 \n" 112 "ld1 {v0.s}[3], [%0], %5 \n" 113 "ld1 {v1.s}[0], [%0], %5 \n" 114 "ld1 {v1.s}[1], [%0], %5 \n" 115 "ld1 {v1.s}[2], [%0], %5 \n" 116 "ld1 {v1.s}[3], [%0] \n" 117 118 "mov %0, %2 \n" 119 120 "ld1 {v2.16b}, [%4] \n" 121 122 "tbl v3.16b, {v0.16b}, v2.16b \n" 123 "tbl v0.16b, {v1.16b}, v2.16b \n" 124 125 // TODO(frkoenig): Rework shuffle above to 126 // write out with 4 instead of 8 writes. 127 "st1 {v3.s}[0], [%0], %6 \n" 128 "st1 {v3.s}[1], [%0], %6 \n" 129 "st1 {v3.s}[2], [%0], %6 \n" 130 "st1 {v3.s}[3], [%0] \n" 131 132 "add %0, %2, #4 \n" 133 "st1 {v0.s}[0], [%0], %6 \n" 134 "st1 {v0.s}[1], [%0], %6 \n" 135 "st1 {v0.s}[2], [%0], %6 \n" 136 "st1 {v0.s}[3], [%0] \n" 137 138 "add %1, %1, #4 \n" // src += 4 139 "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride 140 "subs %w3, %w3, #4 \n" // w -= 4 141 "b.eq 4f \n" 142 143 // some residual, check to see if it includes a 2x8 block, 144 // or less 145 "cmp %w3, #2 \n" 146 "b.lt 3f \n" 147 148 // 2x8 block 149 "2: \n" 150 "mov %0, %1 \n" 151 "ld1 {v0.h}[0], [%0], %5 \n" 152 "ld1 {v1.h}[0], [%0], %5 \n" 153 "ld1 {v0.h}[1], [%0], %5 \n" 154 "ld1 {v1.h}[1], [%0], %5 \n" 155 "ld1 {v0.h}[2], [%0], %5 \n" 156 "ld1 {v1.h}[2], [%0], %5 \n" 157 "ld1 {v0.h}[3], [%0], %5 \n" 158 "ld1 {v1.h}[3], [%0] \n" 159 160 "trn2 v2.8b, v0.8b, v1.8b \n" 161 "trn1 v3.8b, v0.8b, v1.8b \n" 162 163 "mov %0, %2 \n" 164 165 "st1 {v3.8b}, [%0], %6 \n" 166 "st1 {v2.8b}, [%0] \n" 167 168 "add %1, %1, #2 \n" // src += 2 169 "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride 170 "subs %w3, %w3, #2 \n" // w -= 2 171 "b.eq 4f \n" 172 173 // 1x8 block 174 "3: \n" 175 "ld1 {v0.b}[0], [%1], %5 \n" 176 "ld1 {v0.b}[1], [%1], %5 \n" 
177 "ld1 {v0.b}[2], [%1], %5 \n" 178 "ld1 {v0.b}[3], [%1], %5 \n" 179 "ld1 {v0.b}[4], [%1], %5 \n" 180 "ld1 {v0.b}[5], [%1], %5 \n" 181 "ld1 {v0.b}[6], [%1], %5 \n" 182 "ld1 {v0.b}[7], [%1] \n" 183 184 "st1 {v0.8b}, [%2] \n" 185 186 "4: \n" 187 188 : "=&r"(src_temp), // %0 189 "+r"(src), // %1 190 "+r"(dst), // %2 191 "+r"(width) // %3 192 : "r"(&kVTbl4x4Transpose), // %4 193 "r"(static_cast<ptrdiff_t>(src_stride)), // %5 194 "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 195 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", 196 "v17", "v18", "v19", "v20", "v21", "v22", "v23"); 198 197 } 199 198 … … 210 209 int width) { 211 210 const uint8* src_temp; 212 asm volatile ( 213 // loops are on blocks of 8. loop will stop when 214 // counter gets to or below 0. starting the counter 215 // at w-8 allow for this 216 "sub %w4, %w4, #8 \n" 217 218 // handle 8x8 blocks. this should be the majority of the plane 219 "1: \n" 220 "mov %0, %1 \n" 221 222 "ld1 {v0.16b}, [%0], %5 \n" 223 "ld1 {v1.16b}, [%0], %5 \n" 224 "ld1 {v2.16b}, [%0], %5 \n" 225 "ld1 {v3.16b}, [%0], %5 \n" 226 "ld1 {v4.16b}, [%0], %5 \n" 227 "ld1 {v5.16b}, [%0], %5 \n" 228 "ld1 {v6.16b}, [%0], %5 \n" 229 "ld1 {v7.16b}, [%0] \n" 230 231 "trn1 v16.16b, v0.16b, v1.16b \n" 232 "trn2 v17.16b, v0.16b, v1.16b \n" 233 "trn1 v18.16b, v2.16b, v3.16b \n" 234 "trn2 v19.16b, v2.16b, v3.16b \n" 235 "trn1 v20.16b, v4.16b, v5.16b \n" 236 "trn2 v21.16b, v4.16b, v5.16b \n" 237 "trn1 v22.16b, v6.16b, v7.16b \n" 238 "trn2 v23.16b, v6.16b, v7.16b \n" 239 240 "trn1 v0.8h, v16.8h, v18.8h \n" 241 "trn2 v1.8h, v16.8h, v18.8h \n" 242 "trn1 v2.8h, v20.8h, v22.8h \n" 243 "trn2 v3.8h, v20.8h, v22.8h \n" 244 "trn1 v4.8h, v17.8h, v19.8h \n" 245 "trn2 v5.8h, v17.8h, v19.8h \n" 246 "trn1 v6.8h, v21.8h, v23.8h \n" 247 "trn2 v7.8h, v21.8h, v23.8h \n" 248 249 "trn1 v16.4s, v0.4s, v2.4s \n" 250 "trn2 v17.4s, v0.4s, v2.4s \n" 251 "trn1 v18.4s, v1.4s, v3.4s \n" 252 "trn2 v19.4s, v1.4s, v3.4s \n" 253 "trn1 v20.4s, v4.4s, v6.4s 
\n" 254 "trn2 v21.4s, v4.4s, v6.4s \n" 255 "trn1 v22.4s, v5.4s, v7.4s \n" 256 "trn2 v23.4s, v5.4s, v7.4s \n" 257 258 "mov %0, %2 \n" 259 260 "st1 {v16.d}[0], [%0], %6 \n" 261 "st1 {v18.d}[0], [%0], %6 \n" 262 "st1 {v17.d}[0], [%0], %6 \n" 263 "st1 {v19.d}[0], [%0], %6 \n" 264 "st1 {v16.d}[1], [%0], %6 \n" 265 "st1 {v18.d}[1], [%0], %6 \n" 266 "st1 {v17.d}[1], [%0], %6 \n" 267 "st1 {v19.d}[1], [%0] \n" 268 269 "mov %0, %3 \n" 270 271 "st1 {v20.d}[0], [%0], %7 \n" 272 "st1 {v22.d}[0], [%0], %7 \n" 273 "st1 {v21.d}[0], [%0], %7 \n" 274 "st1 {v23.d}[0], [%0], %7 \n" 275 "st1 {v20.d}[1], [%0], %7 \n" 276 "st1 {v22.d}[1], [%0], %7 \n" 277 "st1 {v21.d}[1], [%0], %7 \n" 278 "st1 {v23.d}[1], [%0] \n" 279 280 "add %1, %1, #16 \n" // src += 8*2 281 "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a 282 "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b 283 "subs %w4, %w4, #8 \n" // w -= 8 284 "b.ge 1b \n" 285 286 // add 8 back to counter. if the result is 0 there are 287 // no residuals. 
288 "adds %w4, %w4, #8 \n" 289 "b.eq 4f \n" 290 291 // some residual, so between 1 and 7 lines left to transpose 292 "cmp %w4, #2 \n" 293 "b.lt 3f \n" 294 295 "cmp %w4, #4 \n" 296 "b.lt 2f \n" 297 298 // TODO(frkoenig): Clean this up 299 // 4x8 block 300 "mov %0, %1 \n" 301 "ld1 {v0.8b}, [%0], %5 \n" 302 "ld1 {v1.8b}, [%0], %5 \n" 303 "ld1 {v2.8b}, [%0], %5 \n" 304 "ld1 {v3.8b}, [%0], %5 \n" 305 "ld1 {v4.8b}, [%0], %5 \n" 306 "ld1 {v5.8b}, [%0], %5 \n" 307 "ld1 {v6.8b}, [%0], %5 \n" 308 "ld1 {v7.8b}, [%0] \n" 309 310 "ld1 {v30.16b}, [%8], #16 \n" 311 "ld1 {v31.16b}, [%8] \n" 312 313 "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" 314 "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" 315 "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" 316 "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" 317 318 "mov %0, %2 \n" 319 320 "st1 {v16.s}[0], [%0], %6 \n" 321 "st1 {v16.s}[1], [%0], %6 \n" 322 "st1 {v16.s}[2], [%0], %6 \n" 323 "st1 {v16.s}[3], [%0], %6 \n" 324 325 "add %0, %2, #4 \n" 326 "st1 {v18.s}[0], [%0], %6 \n" 327 "st1 {v18.s}[1], [%0], %6 \n" 328 "st1 {v18.s}[2], [%0], %6 \n" 329 "st1 {v18.s}[3], [%0] \n" 330 331 "mov %0, %3 \n" 332 333 "st1 {v17.s}[0], [%0], %7 \n" 334 "st1 {v17.s}[1], [%0], %7 \n" 335 "st1 {v17.s}[2], [%0], %7 \n" 336 "st1 {v17.s}[3], [%0], %7 \n" 337 338 "add %0, %3, #4 \n" 339 "st1 {v19.s}[0], [%0], %7 \n" 340 "st1 {v19.s}[1], [%0], %7 \n" 341 "st1 {v19.s}[2], [%0], %7 \n" 342 "st1 {v19.s}[3], [%0] \n" 343 344 "add %1, %1, #8 \n" // src += 4 * 2 345 "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a 346 "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b 347 "subs %w4, %w4, #4 \n" // w -= 4 348 "b.eq 4f \n" 349 350 // some residual, check to see if it includes a 2x8 block, 351 // or less 352 "cmp %w4, #2 \n" 353 "b.lt 3f \n" 354 355 // 2x8 block 356 "2: \n" 357 "mov %0, %1 \n" 358 "ld2 {v0.h, v1.h}[0], [%0], %5 \n" 359 "ld2 {v2.h, v3.h}[0], [%0], %5 \n" 360 "ld2 {v0.h, v1.h}[1], [%0], %5 \n" 
361 "ld2 {v2.h, v3.h}[1], [%0], %5 \n" 362 "ld2 {v0.h, v1.h}[2], [%0], %5 \n" 363 "ld2 {v2.h, v3.h}[2], [%0], %5 \n" 364 "ld2 {v0.h, v1.h}[3], [%0], %5 \n" 365 "ld2 {v2.h, v3.h}[3], [%0] \n" 366 367 "trn1 v4.8b, v0.8b, v2.8b \n" 368 "trn2 v5.8b, v0.8b, v2.8b \n" 369 "trn1 v6.8b, v1.8b, v3.8b \n" 370 "trn2 v7.8b, v1.8b, v3.8b \n" 371 372 "mov %0, %2 \n" 373 374 "st1 {v4.d}[0], [%0], %6 \n" 375 "st1 {v6.d}[0], [%0] \n" 376 377 "mov %0, %3 \n" 378 379 "st1 {v5.d}[0], [%0], %7 \n" 380 "st1 {v7.d}[0], [%0] \n" 381 382 "add %1, %1, #4 \n" // src += 2 * 2 383 "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a 384 "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b 385 "subs %w4, %w4, #2 \n" // w -= 2 386 "b.eq 4f \n" 387 388 // 1x8 block 389 "3: \n" 390 "ld2 {v0.b, v1.b}[0], [%1], %5 \n" 391 "ld2 {v0.b, v1.b}[1], [%1], %5 \n" 392 "ld2 {v0.b, v1.b}[2], [%1], %5 \n" 393 "ld2 {v0.b, v1.b}[3], [%1], %5 \n" 394 "ld2 {v0.b, v1.b}[4], [%1], %5 \n" 395 "ld2 {v0.b, v1.b}[5], [%1], %5 \n" 396 "ld2 {v0.b, v1.b}[6], [%1], %5 \n" 397 "ld2 {v0.b, v1.b}[7], [%1] \n" 398 399 "st1 {v0.d}[0], [%2] \n" 400 "st1 {v1.d}[0], [%3] \n" 401 402 "4: \n" 403 404 : "=&r"(src_temp), // %0 405 "+r"(src), // %1 406 "+r"(dst_a), // %2 407 "+r"(dst_b), // %3 408 "+r"(width) // %4 409 : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 410 "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 411 "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 412 "r"(&kVTbl4x4TransposeDi) // %8 413 : "memory", "cc", 414 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 415 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", 416 "v30", "v31" 417 ); 211 asm volatile( 212 // loops are on blocks of 8. loop will stop when 213 // counter gets to or below 0. starting the counter 214 // at w-8 allow for this 215 "sub %w4, %w4, #8 \n" 216 217 // handle 8x8 blocks. 
this should be the majority of the plane 218 "1: \n" 219 "mov %0, %1 \n" 220 221 "ld1 {v0.16b}, [%0], %5 \n" 222 "ld1 {v1.16b}, [%0], %5 \n" 223 "ld1 {v2.16b}, [%0], %5 \n" 224 "ld1 {v3.16b}, [%0], %5 \n" 225 "ld1 {v4.16b}, [%0], %5 \n" 226 "ld1 {v5.16b}, [%0], %5 \n" 227 "ld1 {v6.16b}, [%0], %5 \n" 228 "ld1 {v7.16b}, [%0] \n" 229 230 "trn1 v16.16b, v0.16b, v1.16b \n" 231 "trn2 v17.16b, v0.16b, v1.16b \n" 232 "trn1 v18.16b, v2.16b, v3.16b \n" 233 "trn2 v19.16b, v2.16b, v3.16b \n" 234 "trn1 v20.16b, v4.16b, v5.16b \n" 235 "trn2 v21.16b, v4.16b, v5.16b \n" 236 "trn1 v22.16b, v6.16b, v7.16b \n" 237 "trn2 v23.16b, v6.16b, v7.16b \n" 238 239 "trn1 v0.8h, v16.8h, v18.8h \n" 240 "trn2 v1.8h, v16.8h, v18.8h \n" 241 "trn1 v2.8h, v20.8h, v22.8h \n" 242 "trn2 v3.8h, v20.8h, v22.8h \n" 243 "trn1 v4.8h, v17.8h, v19.8h \n" 244 "trn2 v5.8h, v17.8h, v19.8h \n" 245 "trn1 v6.8h, v21.8h, v23.8h \n" 246 "trn2 v7.8h, v21.8h, v23.8h \n" 247 248 "trn1 v16.4s, v0.4s, v2.4s \n" 249 "trn2 v17.4s, v0.4s, v2.4s \n" 250 "trn1 v18.4s, v1.4s, v3.4s \n" 251 "trn2 v19.4s, v1.4s, v3.4s \n" 252 "trn1 v20.4s, v4.4s, v6.4s \n" 253 "trn2 v21.4s, v4.4s, v6.4s \n" 254 "trn1 v22.4s, v5.4s, v7.4s \n" 255 "trn2 v23.4s, v5.4s, v7.4s \n" 256 257 "mov %0, %2 \n" 258 259 "st1 {v16.d}[0], [%0], %6 \n" 260 "st1 {v18.d}[0], [%0], %6 \n" 261 "st1 {v17.d}[0], [%0], %6 \n" 262 "st1 {v19.d}[0], [%0], %6 \n" 263 "st1 {v16.d}[1], [%0], %6 \n" 264 "st1 {v18.d}[1], [%0], %6 \n" 265 "st1 {v17.d}[1], [%0], %6 \n" 266 "st1 {v19.d}[1], [%0] \n" 267 268 "mov %0, %3 \n" 269 270 "st1 {v20.d}[0], [%0], %7 \n" 271 "st1 {v22.d}[0], [%0], %7 \n" 272 "st1 {v21.d}[0], [%0], %7 \n" 273 "st1 {v23.d}[0], [%0], %7 \n" 274 "st1 {v20.d}[1], [%0], %7 \n" 275 "st1 {v22.d}[1], [%0], %7 \n" 276 "st1 {v21.d}[1], [%0], %7 \n" 277 "st1 {v23.d}[1], [%0] \n" 278 279 "add %1, %1, #16 \n" // src += 8*2 280 "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * 281 // dst_stride_a 282 "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * 283 // dst_stride_b 284 "subs %w4, 
%w4, #8 \n" // w -= 8 285 "b.ge 1b \n" 286 287 // add 8 back to counter. if the result is 0 there are 288 // no residuals. 289 "adds %w4, %w4, #8 \n" 290 "b.eq 4f \n" 291 292 // some residual, so between 1 and 7 lines left to transpose 293 "cmp %w4, #2 \n" 294 "b.lt 3f \n" 295 296 "cmp %w4, #4 \n" 297 "b.lt 2f \n" 298 299 // TODO(frkoenig): Clean this up 300 // 4x8 block 301 "mov %0, %1 \n" 302 "ld1 {v0.8b}, [%0], %5 \n" 303 "ld1 {v1.8b}, [%0], %5 \n" 304 "ld1 {v2.8b}, [%0], %5 \n" 305 "ld1 {v3.8b}, [%0], %5 \n" 306 "ld1 {v4.8b}, [%0], %5 \n" 307 "ld1 {v5.8b}, [%0], %5 \n" 308 "ld1 {v6.8b}, [%0], %5 \n" 309 "ld1 {v7.8b}, [%0] \n" 310 311 "ld1 {v30.16b}, [%8], #16 \n" 312 "ld1 {v31.16b}, [%8] \n" 313 314 "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" 315 "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" 316 "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" 317 "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" 318 319 "mov %0, %2 \n" 320 321 "st1 {v16.s}[0], [%0], %6 \n" 322 "st1 {v16.s}[1], [%0], %6 \n" 323 "st1 {v16.s}[2], [%0], %6 \n" 324 "st1 {v16.s}[3], [%0], %6 \n" 325 326 "add %0, %2, #4 \n" 327 "st1 {v18.s}[0], [%0], %6 \n" 328 "st1 {v18.s}[1], [%0], %6 \n" 329 "st1 {v18.s}[2], [%0], %6 \n" 330 "st1 {v18.s}[3], [%0] \n" 331 332 "mov %0, %3 \n" 333 334 "st1 {v17.s}[0], [%0], %7 \n" 335 "st1 {v17.s}[1], [%0], %7 \n" 336 "st1 {v17.s}[2], [%0], %7 \n" 337 "st1 {v17.s}[3], [%0], %7 \n" 338 339 "add %0, %3, #4 \n" 340 "st1 {v19.s}[0], [%0], %7 \n" 341 "st1 {v19.s}[1], [%0], %7 \n" 342 "st1 {v19.s}[2], [%0], %7 \n" 343 "st1 {v19.s}[3], [%0] \n" 344 345 "add %1, %1, #8 \n" // src += 4 * 2 346 "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * 347 // dst_stride_a 348 "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * 349 // dst_stride_b 350 "subs %w4, %w4, #4 \n" // w -= 4 351 "b.eq 4f \n" 352 353 // some residual, check to see if it includes a 2x8 block, 354 // or less 355 "cmp %w4, #2 \n" 356 "b.lt 3f \n" 357 358 // 2x8 block 359 "2: \n" 
360 "mov %0, %1 \n" 361 "ld2 {v0.h, v1.h}[0], [%0], %5 \n" 362 "ld2 {v2.h, v3.h}[0], [%0], %5 \n" 363 "ld2 {v0.h, v1.h}[1], [%0], %5 \n" 364 "ld2 {v2.h, v3.h}[1], [%0], %5 \n" 365 "ld2 {v0.h, v1.h}[2], [%0], %5 \n" 366 "ld2 {v2.h, v3.h}[2], [%0], %5 \n" 367 "ld2 {v0.h, v1.h}[3], [%0], %5 \n" 368 "ld2 {v2.h, v3.h}[3], [%0] \n" 369 370 "trn1 v4.8b, v0.8b, v2.8b \n" 371 "trn2 v5.8b, v0.8b, v2.8b \n" 372 "trn1 v6.8b, v1.8b, v3.8b \n" 373 "trn2 v7.8b, v1.8b, v3.8b \n" 374 375 "mov %0, %2 \n" 376 377 "st1 {v4.d}[0], [%0], %6 \n" 378 "st1 {v6.d}[0], [%0] \n" 379 380 "mov %0, %3 \n" 381 382 "st1 {v5.d}[0], [%0], %7 \n" 383 "st1 {v7.d}[0], [%0] \n" 384 385 "add %1, %1, #4 \n" // src += 2 * 2 386 "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * 387 // dst_stride_a 388 "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * 389 // dst_stride_b 390 "subs %w4, %w4, #2 \n" // w -= 2 391 "b.eq 4f \n" 392 393 // 1x8 block 394 "3: \n" 395 "ld2 {v0.b, v1.b}[0], [%1], %5 \n" 396 "ld2 {v0.b, v1.b}[1], [%1], %5 \n" 397 "ld2 {v0.b, v1.b}[2], [%1], %5 \n" 398 "ld2 {v0.b, v1.b}[3], [%1], %5 \n" 399 "ld2 {v0.b, v1.b}[4], [%1], %5 \n" 400 "ld2 {v0.b, v1.b}[5], [%1], %5 \n" 401 "ld2 {v0.b, v1.b}[6], [%1], %5 \n" 402 "ld2 {v0.b, v1.b}[7], [%1] \n" 403 404 "st1 {v0.d}[0], [%2] \n" 405 "st1 {v1.d}[0], [%3] \n" 406 407 "4: \n" 408 409 : "=&r"(src_temp), // %0 410 "+r"(src), // %1 411 "+r"(dst_a), // %2 412 "+r"(dst_b), // %3 413 "+r"(width) // %4 414 : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 415 "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 416 "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 417 "r"(&kVTbl4x4TransposeDi) // %8 418 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", 419 "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); 418 420 } 419 421 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
Note: See TracChangeset for help on using the changeset viewer.