Changeset 5633 for pjproject/trunk/third_party/yuv/source/row_neon64.cc
- Timestamp:
- Jul 28, 2017 2:51:44 AM (7 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
pjproject/trunk/third_party/yuv/source/row_neon64.cc
r5358 r5633 20 20 21 21 // Read 8 Y, 4 U and 4 V from 422 22 #define READYUV422 \ 23 MEMACCESS(0) \ 24 "ld1 {v0.8b}, [%0], #8 \n" \ 25 MEMACCESS(1) \ 26 "ld1 {v1.s}[0], [%1], #4 \n" \ 27 MEMACCESS(2) \ 28 "ld1 {v1.s}[1], [%2], #4 \n" 29 30 // Read 8 Y, 2 U and 2 V from 422 31 #define READYUV411 \ 32 MEMACCESS(0) \ 33 "ld1 {v0.8b}, [%0], #8 \n" \ 34 MEMACCESS(1) \ 35 "ld1 {v2.h}[0], [%1], #2 \n" \ 36 MEMACCESS(2) \ 37 "ld1 {v2.h}[1], [%2], #2 \n" \ 38 "zip1 v1.8b, v2.8b, v2.8b \n" 22 #define READYUV422 \ 23 "ld1 {v0.8b}, [%0], #8 \n" \ 24 "ld1 {v1.s}[0], [%1], #4 \n" \ 25 "ld1 {v1.s}[1], [%2], #4 \n" 39 26 40 27 // Read 8 Y, 8 U and 8 V from 444 41 #define READYUV444 \ 42 MEMACCESS(0) \ 43 "ld1 {v0.8b}, [%0], #8 \n" \ 44 MEMACCESS(1) \ 45 "ld1 {v1.d}[0], [%1], #8 \n" \ 46 MEMACCESS(2) \ 47 "ld1 {v1.d}[1], [%2], #8 \n" \ 48 "uaddlp v1.8h, v1.16b \n" \ 49 "rshrn v1.8b, v1.8h, #1 \n" 28 #define READYUV444 \ 29 "ld1 {v0.8b}, [%0], #8 \n" \ 30 "ld1 {v1.d}[0], [%1], #8 \n" \ 31 "ld1 {v1.d}[1], [%2], #8 \n" \ 32 "uaddlp v1.8h, v1.16b \n" \ 33 "rshrn v1.8b, v1.8h, #1 \n" 50 34 51 35 // Read 8 Y, and set 4 U and 4 V to 128 52 #define READYUV400 \ 53 MEMACCESS(0) \ 54 "ld1 {v0.8b}, [%0], #8 \n" \ 55 "movi v1.8b , #128 \n" 36 #define READYUV400 \ 37 "ld1 {v0.8b}, [%0], #8 \n" \ 38 "movi v1.8b , #128 \n" 56 39 57 40 // Read 8 Y and 4 UV from NV12 58 #define READNV12 \ 59 MEMACCESS(0) \ 60 "ld1 {v0.8b}, [%0], #8 \n" \ 61 MEMACCESS(1) \ 62 "ld1 {v2.8b}, [%1], #8 \n" \ 63 "uzp1 v1.8b, v2.8b, v2.8b \n" \ 64 "uzp2 v3.8b, v2.8b, v2.8b \n" \ 65 "ins v1.s[1], v3.s[0] \n" 41 #define READNV12 \ 42 "ld1 {v0.8b}, [%0], #8 \n" \ 43 "ld1 {v2.8b}, [%1], #8 \n" \ 44 "uzp1 v1.8b, v2.8b, v2.8b \n" \ 45 "uzp2 v3.8b, v2.8b, v2.8b \n" \ 46 "ins v1.s[1], v3.s[0] \n" 66 47 67 48 // Read 8 Y and 4 VU from NV21 68 #define READNV21 \ 69 MEMACCESS(0) \ 70 "ld1 {v0.8b}, [%0], #8 \n" \ 71 MEMACCESS(1) \ 72 "ld1 {v2.8b}, [%1], #8 \n" \ 73 "uzp1 v3.8b, v2.8b, v2.8b \n" \ 74 "uzp2 v1.8b, v2.8b, v2.8b \n" \ 75 "ins v1.s[1], v3.s[0] \n" 49 #define READNV21 \ 50 "ld1 {v0.8b}, [%0], #8 \n" \ 51 "ld1 {v2.8b}, [%1], #8 \n" \ 52 "uzp1 v3.8b, v2.8b, v2.8b \n" \ 53 "uzp2 v1.8b, v2.8b, v2.8b \n" \ 54 "ins v1.s[1], v3.s[0] \n" 76 55 77 56 // Read 8 YUY2 78 #define READYUY2 \ 79 MEMACCESS(0) \ 80 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ 81 "uzp2 v3.8b, v1.8b, v1.8b \n" \ 82 "uzp1 v1.8b, v1.8b, v1.8b \n" \ 83 "ins v1.s[1], v3.s[0] \n" 57 #define READYUY2 \ 58 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ 59 "uzp2 v3.8b, v1.8b, v1.8b \n" \ 60 "uzp1 v1.8b, v1.8b, v1.8b \n" \ 61 "ins v1.s[1], v3.s[0] \n" 84 62 85 63 // Read 8 UYVY 86 #define READUYVY \ 87 MEMACCESS(0) \ 88 "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ 89 "orr v0.8b, v3.8b, v3.8b \n" \ 90 "uzp1 v1.8b, v2.8b, v2.8b \n" \ 91 "uzp2 v3.8b, v2.8b, v2.8b \n" \ 92 "ins v1.s[1], v3.s[0] \n" 93 94 #define YUVTORGB_SETUP \ 95 "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ 96 "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ 97 "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ 98 "ld1r {v31.4s}, [%[kYToRgb]] \n" \ 99 "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ 100 "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" 101 102 #define YUVTORGB(vR, vG, vB) \ 103 "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ 104 "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ 105 "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ 106 "ushll v0.4s, v0.4h, #0 \n" \ 107 "mul v3.4s, v3.4s, v31.4s \n" \ 108 "mul v0.4s, v0.4s, v31.4s \n" \ 109 "sqshrun v0.4h, v0.4s, #16 \n" \ 110 "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ 111 "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ 112 "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ 113 "uxtl v2.8h, v2.8b \n" \ 114 "uxtl v1.8h, v1.8b \n" /* Extract U */ \ 115 "mul v3.8h, v1.8h, v27.8h \n" \ 116 "mul v5.8h, v1.8h, v29.8h \n" \ 117 "mul v6.8h, v2.8h, v30.8h \n" \ 118 "mul v7.8h, v2.8h, v28.8h \n" \ 119 "sqadd v6.8h, v6.8h, v5.8h \n" \ 120 "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ 121 "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ 122 "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ 123 "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ 124 "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ 125 "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ 126 "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ 127 "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ 128 "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ 64 #define READUYVY \ 65 "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ 66 "orr v0.8b, v3.8b, v3.8b \n" \ 67 "uzp1 v1.8b, v2.8b, v2.8b \n" \ 68 "uzp2 v3.8b, v2.8b, v2.8b \n" \ 69 "ins v1.s[1], v3.s[0] \n" 70 71 #define YUVTORGB_SETUP \ 72 "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ 73 "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ 74 "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ 75 "ld1r {v31.4s}, [%[kYToRgb]] \n" \ 76 "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ 77 "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" 78 79 #define YUVTORGB(vR, vG, vB) \ 80 "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ 81 "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ 82 "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ 83 "ushll v0.4s, v0.4h, #0 \n" \ 84 "mul v3.4s, v3.4s, v31.4s \n" \ 85 "mul v0.4s, v0.4s, v31.4s \n" \ 86 "sqshrun v0.4h, v0.4s, #16 \n" \ 87 "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ 88 "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ 89 "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ 90 "uxtl v2.8h, v2.8b \n" \ 91 "uxtl v1.8h, v1.8b \n" /* Extract U */ \ 92 "mul v3.8h, v1.8h, v27.8h \n" \ 93 "mul v5.8h, v1.8h, v29.8h \n" \ 94 "mul v6.8h, v2.8h, v30.8h \n" \ 95 "mul v7.8h, v2.8h, v28.8h \n" \ 96 "sqadd v6.8h, v6.8h, v5.8h \n" \ 97 "sqadd " #vB \ 98 ".8h, v24.8h, v0.8h \n" /* B */ \ 99 "sqadd " #vG \ 100 ".8h, v25.8h, v0.8h \n" /* G */ \ 101 "sqadd " #vR \ 102 ".8h, v26.8h, v0.8h \n" /* R */ \ 103 "sqadd " #vB ".8h, " #vB \ 104 ".8h, v3.8h \n" /* B */ \ 105 "sqsub " #vG ".8h, " #vG \ 106 ".8h, v6.8h \n" /* G */ \ 107 "sqadd " #vR ".8h, " #vR \ 108 ".8h, v7.8h \n" /* R */ \ 109 "sqshrun " #vB ".8b, " #vB \ 110 ".8h, #6 \n" /* B */ \ 111 "sqshrun " #vG ".8b, " #vG \ 112 ".8h, #6 \n" /* G */ \ 113 "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ 129 114 130 115 void I444ToARGBRow_NEON(const uint8* src_y, … … 141 126 YUVTORGB(v22, v21, v20) 142 127 "subs %w4, %w4, #8 \n" 143 MEMACCESS(3)144 128 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 145 129 "b.gt 1b \n" … … 171 155 YUVTORGB(v22, v21, v20) 172 156 "subs %w4, %w4, #8 \n" 173 MEMACCESS(3)174 157 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 175 158 "b.gt 1b \n" … … 200 183 READYUV422 201 184 YUVTORGB(v22, v21, v20) 202 MEMACCESS(3)203 185 "ld1 {v23.8b}, [%3], #8 \n" 204 186 "subs %w5, %w5, #8 \n" 205 MEMACCESS(4)206 187 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" 207 188 "b.gt 1b \n" … … 221 202 } 222 203 223 void I411ToARGBRow_NEON(const uint8* src_y,224 const uint8* src_u,225 const uint8* src_v,226 uint8* dst_argb,227 const struct YuvConstants* yuvconstants,228 int width) {229 asm volatile (230 YUVTORGB_SETUP231 "movi v23.8b, #255 \n" /* A */232 "1: \n"233 READYUV411234 YUVTORGB(v22, v21, v20)235 "subs %w4, %w4, #8 \n"236 MEMACCESS(3)237 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"238 "b.gt 1b \n"239 : "+r"(src_y), // %0240 "+r"(src_u), // %1241 "+r"(src_v), // %2242 "+r"(dst_argb), // %3243 "+r"(width) // %4244 : [kUVToRB]"r"(&yuvconstants->kUVToRB),245 [kUVToG]"r"(&yuvconstants->kUVToG),246 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),247 [kYToRgb]"r"(&yuvconstants->kYToRgb)248 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",249 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"250 );251 }252 253 204 void I422ToRGBARow_NEON(const uint8* src_y, 254 205 const uint8* src_u, … … 264 215 YUVTORGB(v23, v22, v21) 265 216 "subs %w4, %w4, #8 \n" 266 MEMACCESS(3)267 217 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 268 218 "b.gt 1b \n" … … 293 243 YUVTORGB(v22, v21, v20) 294 244 "subs %w4, %w4, #8 \n" 295 MEMACCESS(3)296 245 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" 297 246 "b.gt 1b \n" … … 310 259 } 311 260 312 #define ARGBTORGB565 313 "shll v0.8h, v22.8b, #8 \n"/* R */ \314 "shll v21.8h, v21.8b, #8 \n"/* G */ \315 "shll v20.8h, v20.8b, #8 \n"/* B */ \316 "sri v0.8h, v21.8h, #5 \n"/* RG */ \317 "sri v0.8h, v20.8h, #11 \n"/* RGB */261 #define ARGBTORGB565 \ 262 "shll v0.8h, v22.8b, #8 \n" /* R */ \ 263 "shll v21.8h, v21.8b, #8 \n" /* G */ \ 264 "shll v20.8h, v20.8b, #8 \n" /* B */ \ 265 "sri v0.8h, v21.8h, #5 \n" /* RG */ \ 266 "sri v0.8h, v20.8h, #11 \n" /* RGB */ 318 267 319 268 void I422ToRGB565Row_NEON(const uint8* src_y, … … 323 272 const struct YuvConstants* yuvconstants, 324 273 int width) { 325 asm volatile ( 326 YUVTORGB_SETUP 327 "1: \n" 328 READYUV422 329 YUVTORGB(v22, v21, v20) 330 "subs %w4, %w4, #8 \n" 331 ARGBTORGB565 332 MEMACCESS(3) 333 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 334 "b.gt 1b \n" 335 : "+r"(src_y), // %0 336 "+r"(src_u), // %1 337 "+r"(src_v), // %2 338 "+r"(dst_rgb565), // %3 339 "+r"(width) // %4 340 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 341 [kUVToG]"r"(&yuvconstants->kUVToG), 342 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 343 [kYToRgb]"r"(&yuvconstants->kYToRgb) 344 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 345 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 346 ); 347 } 348 349 #define ARGBTOARGB1555 \ 350 "shll v0.8h, v23.8b, #8 \n" /* A */ \ 351 "shll v22.8h, v22.8b, #8 \n" /* R */ \ 352 "shll v21.8h, v21.8b, #8 \n" /* G */ \ 353 "shll v20.8h, v20.8b, #8 \n" /* B */ \ 354 "sri v0.8h, v22.8h, #1 \n" /* AR */ \ 355 "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ 356 "sri v0.8h, v20.8h, #11 \n" /* ARGB */ 274 asm volatile( 275 YUVTORGB_SETUP 276 "1: \n" READYUV422 YUVTORGB( 277 v22, v21, 278 v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 279 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels 280 // RGB565. 281 "b.gt 1b \n" 282 : "+r"(src_y), // %0 283 "+r"(src_u), // %1 284 "+r"(src_v), // %2 285 "+r"(dst_rgb565), // %3 286 "+r"(width) // %4 287 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 288 [kUVToG] "r"(&yuvconstants->kUVToG), 289 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 290 [kYToRgb] "r"(&yuvconstants->kYToRgb) 291 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 292 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); 293 } 294 295 #define ARGBTOARGB1555 \ 296 "shll v0.8h, v23.8b, #8 \n" /* A */ \ 297 "shll v22.8h, v22.8b, #8 \n" /* R */ \ 298 "shll v21.8h, v21.8b, #8 \n" /* G */ \ 299 "shll v20.8h, v20.8b, #8 \n" /* B */ \ 300 "sri v0.8h, v22.8h, #1 \n" /* AR */ \ 301 "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ 302 "sri v0.8h, v20.8h, #11 \n" /* ARGB */ 357 303 358 304 void I422ToARGB1555Row_NEON(const uint8* src_y, … … 362 308 const struct YuvConstants* yuvconstants, 363 309 int width) { 364 asm volatile ( 365 YUVTORGB_SETUP 366 "movi v23.8b, #255 \n" 367 "1: \n" 368 READYUV422 369 YUVTORGB(v22, v21, v20) 370 "subs %w4, %w4, #8 \n" 371 ARGBTOARGB1555 372 MEMACCESS(3) 373 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 374 "b.gt 1b \n" 375 : "+r"(src_y), // %0 376 "+r"(src_u), // %1 377 "+r"(src_v), // %2 378 "+r"(dst_argb1555), // %3 379 "+r"(width) // %4 380 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 381 [kUVToG]"r"(&yuvconstants->kUVToG), 382 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 383 [kYToRgb]"r"(&yuvconstants->kYToRgb) 384 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 385 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 386 ); 387 } 388 389 #define ARGBTOARGB4444 \ 390 /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ 391 "ushr v20.8b, v20.8b, #4 \n" /* B */ \ 392 "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ 393 "ushr v22.8b, v22.8b, #4 \n" /* R */ \ 394 "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ 395 "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ 396 "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ 397 "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ 310 asm volatile( 311 YUVTORGB_SETUP 312 "movi v23.8b, #255 \n" 313 "1: \n" READYUV422 YUVTORGB( 314 v22, v21, 315 v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 316 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels 317 // RGB565. 318 "b.gt 1b \n" 319 : "+r"(src_y), // %0 320 "+r"(src_u), // %1 321 "+r"(src_v), // %2 322 "+r"(dst_argb1555), // %3 323 "+r"(width) // %4 324 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 325 [kUVToG] "r"(&yuvconstants->kUVToG), 326 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 327 [kYToRgb] "r"(&yuvconstants->kYToRgb) 328 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 329 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); 330 } 331 332 #define ARGBTOARGB4444 \ 333 /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ 334 "ushr v20.8b, v20.8b, #4 \n" /* B */ \ 335 "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ 336 "ushr v22.8b, v22.8b, #4 \n" /* R */ \ 337 "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ 338 "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ 339 "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ 340 "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ 398 341 399 342 void I422ToARGB4444Row_NEON(const uint8* src_y, … … 412 355 "movi v23.8b, #255 \n" 413 356 ARGBTOARGB4444 414 MEMACCESS(3)415 357 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. 416 358 "b.gt 1b \n" … … 429 371 } 430 372 431 void I400ToARGBRow_NEON(const uint8* src_y, 432 uint8* dst_argb, 433 int width) { 373 void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { 434 374 asm volatile ( 435 375 YUVTORGB_SETUP … … 439 379 YUVTORGB(v22, v21, v20) 440 380 "subs %w2, %w2, #8 \n" 441 MEMACCESS(1)442 381 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 443 382 "b.gt 1b \n" … … 454 393 } 455 394 456 void J400ToARGBRow_NEON(const uint8* src_y, 457 uint8* dst_argb, 458 int width) { 459 asm volatile ( 460 "movi v23.8b, #255 \n" 461 "1: \n" 462 MEMACCESS(0) 463 "ld1 {v20.8b}, [%0], #8 \n" 464 "orr v21.8b, v20.8b, v20.8b \n" 465 "orr v22.8b, v20.8b, v20.8b \n" 466 "subs %w2, %w2, #8 \n" 467 MEMACCESS(1) 468 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 469 "b.gt 1b \n" 470 : "+r"(src_y), // %0 471 "+r"(dst_argb), // %1 472 "+r"(width) // %2 473 : 474 : "cc", "memory", "v20", "v21", "v22", "v23" 475 ); 395 void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { 396 asm volatile( 397 "movi v23.8b, #255 \n" 398 "1: \n" 399 "ld1 {v20.8b}, [%0], #8 \n" 400 "orr v21.8b, v20.8b, v20.8b \n" 401 "orr v22.8b, v20.8b, v20.8b \n" 402 "subs %w2, %w2, #8 \n" 403 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 404 "b.gt 1b \n" 405 : "+r"(src_y), // %0 406 "+r"(dst_argb), // %1 407 "+r"(width) // %2 408 : 409 : "cc", "memory", "v20", "v21", "v22", "v23"); 476 410 } 477 411 … … 488 422 YUVTORGB(v22, v21, v20) 489 423 "subs %w3, %w3, #8 \n" 490 MEMACCESS(2)491 424 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" 492 425 "b.gt 1b \n" … … 516 449 YUVTORGB(v22, v21, v20) 517 450 "subs %w3, %w3, #8 \n" 518 MEMACCESS(2)519 451 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" 520 452 "b.gt 1b \n" … … 537 469 const struct YuvConstants* yuvconstants, 538 470 int width) { 539 asm volatile ( 540 YUVTORGB_SETUP 541 "1: \n" 542 READNV12 543 YUVTORGB(v22, v21, v20) 544 "subs %w3, %w3, #8 \n" 545 ARGBTORGB565 546 MEMACCESS(2) 547 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 548 "b.gt 1b \n" 549 : "+r"(src_y), // %0 550 "+r"(src_uv), // %1 551 "+r"(dst_rgb565), // %2 552 "+r"(width) // %3 553 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 554 [kUVToG]"r"(&yuvconstants->kUVToG), 555 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 556 [kYToRgb]"r"(&yuvconstants->kYToRgb) 557 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 558 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 559 ); 471 asm volatile( 472 YUVTORGB_SETUP 473 "1: \n" READNV12 YUVTORGB( 474 v22, v21, 475 v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 476 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels 477 // RGB565. 478 "b.gt 1b \n" 479 : "+r"(src_y), // %0 480 "+r"(src_uv), // %1 481 "+r"(dst_rgb565), // %2 482 "+r"(width) // %3 483 : [kUVToRB] "r"(&yuvconstants->kUVToRB), 484 [kUVToG] "r"(&yuvconstants->kUVToG), 485 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), 486 [kYToRgb] "r"(&yuvconstants->kYToRgb) 487 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 488 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); 560 489 } 561 490 … … 571 500 YUVTORGB(v22, v21, v20) 572 501 "subs %w2, %w2, #8 \n" 573 MEMACCESS(1)574 502 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 575 503 "b.gt 1b \n" … … 597 525 YUVTORGB(v22, v21, v20) 598 526 "subs %w2, %w2, #8 \n" 599 MEMACCESS(1)600 527 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" 601 528 "b.gt 1b \n" … … 613 540 614 541 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. 615 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 542 void SplitUVRow_NEON(const uint8* src_uv, 543 uint8* dst_u, 544 uint8* dst_v, 616 545 int width) { 617 asm volatile ( 618 "1: \n" 619 MEMACCESS(0) 620 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV 621 "subs %w3, %w3, #16 \n" // 16 processed per loop 622 MEMACCESS(1) 623 "st1 {v0.16b}, [%1], #16 \n" // store U 624 MEMACCESS(2) 625 "st1 {v1.16b}, [%2], #16 \n" // store V 626 "b.gt 1b \n" 627 : "+r"(src_uv), // %0 628 "+r"(dst_u), // %1 629 "+r"(dst_v), // %2 630 "+r"(width) // %3 // Output registers 631 : // Input registers 632 : "cc", "memory", "v0", "v1" // Clobber List 633 ); 546 asm volatile( 547 "1: \n" 548 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV 549 "subs %w3, %w3, #16 \n" // 16 processed per loop 550 "st1 {v0.16b}, [%1], #16 \n" // store U 551 "st1 {v1.16b}, [%2], #16 \n" // store V 552 "b.gt 1b \n" 553 : "+r"(src_uv), // %0 554 "+r"(dst_u), // %1 555 "+r"(dst_v), // %2 556 "+r"(width) // %3 // Output registers 557 : // Input registers 558 : "cc", "memory", "v0", "v1" // Clobber List 559 ); 634 560 } 635 561 636 562 // Reads 16 U's and V's and writes out 16 pairs of UV. 637 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 563 void MergeUVRow_NEON(const uint8* src_u, 564 const uint8* src_v, 565 uint8* dst_uv, 638 566 int width) { 639 asm volatile ( 640 "1: \n" 641 MEMACCESS(0) 642 "ld1 {v0.16b}, [%0], #16 \n" // load U 643 MEMACCESS(1) 644 "ld1 {v1.16b}, [%1], #16 \n" // load V 645 "subs %w3, %w3, #16 \n" // 16 processed per loop 646 MEMACCESS(2) 647 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV 648 "b.gt 1b \n" 649 : 650 "+r"(src_u), // %0 651 "+r"(src_v), // %1 652 "+r"(dst_uv), // %2 653 "+r"(width) // %3 // Output registers 654 : // Input registers 655 : "cc", "memory", "v0", "v1" // Clobber List 656 ); 567 asm volatile( 568 "1: \n" 569 "ld1 {v0.16b}, [%0], #16 \n" // load U 570 "ld1 {v1.16b}, [%1], #16 \n" // load V 571 "subs %w3, %w3, #16 \n" // 16 processed per loop 572 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV 573 "b.gt 1b \n" 574 : "+r"(src_u), // %0 575 "+r"(src_v), // %1 576 "+r"(dst_uv), // %2 577 "+r"(width) // %3 // Output registers 578 : // Input registers 579 : "cc", "memory", "v0", "v1" // Clobber List 580 ); 657 581 } 658 582 659 583 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 660 584 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { 661 asm volatile ( 662 "1: \n" 663 MEMACCESS(0) 664 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 665 "subs %w2, %w2, #32 \n" // 32 processed per loop 666 MEMACCESS(1) 667 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 668 "b.gt 1b \n" 669 : "+r"(src), // %0 670 "+r"(dst), // %1 671 "+r"(count) // %2 // Output registers 672 : // Input registers 673 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 674 ); 585 asm volatile( 586 "1: \n" 587 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 588 "subs %w2, %w2, #32 \n" // 32 processed per loop 589 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 590 "b.gt 1b \n" 591 : "+r"(src), // %0 592 "+r"(dst), // %1 593 "+r"(count) // %2 // Output registers 594 : // Input registers 595 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 596 ); 675 597 } 676 598 677 599 // SetRow writes 'count' bytes using an 8 bit value repeated. 678 600 void SetRow_NEON(uint8* dst, uint8 v8, int count) { 679 asm volatile ( 680 "dup v0.16b, %w2 \n" // duplicate 16 bytes 681 "1: \n" 682 "subs %w1, %w1, #16 \n" // 16 bytes per loop 683 MEMACCESS(0) 684 "st1 {v0.16b}, [%0], #16 \n" // store 685 "b.gt 1b \n" 686 : "+r"(dst), // %0 687 "+r"(count) // %1 688 : "r"(v8) // %2 689 : "cc", "memory", "v0" 690 ); 601 asm volatile( 602 "dup v0.16b, %w2 \n" // duplicate 16 bytes 603 "1: \n" 604 "subs %w1, %w1, #16 \n" // 16 bytes per loop 605 "st1 {v0.16b}, [%0], #16 \n" // store 606 "b.gt 1b \n" 607 : "+r"(dst), // %0 608 "+r"(count) // %1 609 : "r"(v8) // %2 610 : "cc", "memory", "v0"); 691 611 } 692 612 693 613 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { 694 asm volatile ( 695 "dup v0.4s, %w2 \n" // duplicate 4 ints 696 "1: \n" 697 "subs %w1, %w1, #4 \n" // 4 ints per loop 698 MEMACCESS(0) 699 "st1 {v0.16b}, [%0], #16 \n" // store 700 "b.gt 1b \n" 701 : "+r"(dst), // %0 702 "+r"(count) // %1 703 : "r"(v32) // %2 704 : "cc", "memory", "v0" 705 ); 614 asm volatile( 615 "dup v0.4s, %w2 \n" // duplicate 4 ints 616 "1: \n" 617 "subs %w1, %w1, #4 \n" // 4 ints per loop 618 "st1 {v0.16b}, [%0], #16 \n" // store 619 "b.gt 1b \n" 620 : "+r"(dst), // %0 621 "+r"(count) // %1 622 : "r"(v32) // %2 623 : "cc", "memory", "v0"); 706 624 } 707 625 708 626 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { 709 asm volatile ( 710 // Start at end of source row. 711 "add %0, %0, %w2, sxtw \n" 712 "sub %0, %0, #16 \n" 713 "1: \n" 714 MEMACCESS(0) 715 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 716 "subs %w2, %w2, #16 \n" // 16 pixels per loop. 717 "rev64 v0.16b, v0.16b \n" 718 MEMACCESS(1) 719 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 720 MEMACCESS(1) 721 "st1 {v0.D}[0], [%1], #8 \n" 722 "b.gt 1b \n" 723 : "+r"(src), // %0 724 "+r"(dst), // %1 725 "+r"(width) // %2 726 : "r"((ptrdiff_t)-16) // %3 727 : "cc", "memory", "v0" 728 ); 729 } 730 731 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 627 asm volatile( 628 // Start at end of source row. 629 "add %0, %0, %w2, sxtw \n" 630 "sub %0, %0, #16 \n" 631 "1: \n" 632 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 633 "subs %w2, %w2, #16 \n" // 16 pixels per loop. 634 "rev64 v0.16b, v0.16b \n" 635 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 636 "st1 {v0.D}[0], [%1], #8 \n" 637 "b.gt 1b \n" 638 : "+r"(src), // %0 639 "+r"(dst), // %1 640 "+r"(width) // %2 641 : "r"((ptrdiff_t)-16) // %3 642 : "cc", "memory", "v0"); 643 } 644 645 void MirrorUVRow_NEON(const uint8* src_uv, 646 uint8* dst_u, 647 uint8* dst_v, 732 648 int width) { 733 asm volatile ( 734 // Start at end of source row. 735 "add %0, %0, %w3, sxtw #1 \n" 736 "sub %0, %0, #16 \n" 737 "1: \n" 738 MEMACCESS(0) 739 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 740 "subs %w3, %w3, #8 \n" // 8 pixels per loop. 741 "rev64 v0.8b, v0.8b \n" 742 "rev64 v1.8b, v1.8b \n" 743 MEMACCESS(1) 744 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 745 MEMACCESS(2) 746 "st1 {v1.8b}, [%2], #8 \n" 747 "b.gt 1b \n" 748 : "+r"(src_uv), // %0 749 "+r"(dst_u), // %1 750 "+r"(dst_v), // %2 751 "+r"(width) // %3 752 : "r"((ptrdiff_t)-16) // %4 753 : "cc", "memory", "v0", "v1" 754 ); 649 asm volatile( 650 // Start at end of source row. 651 "add %0, %0, %w3, sxtw #1 \n" 652 "sub %0, %0, #16 \n" 653 "1: \n" 654 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 655 "subs %w3, %w3, #8 \n" // 8 pixels per loop. 656 "rev64 v0.8b, v0.8b \n" 657 "rev64 v1.8b, v1.8b \n" 658 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 659 "st1 {v1.8b}, [%2], #8 \n" 660 "b.gt 1b \n" 661 : "+r"(src_uv), // %0 662 "+r"(dst_u), // %1 663 "+r"(dst_v), // %2 664 "+r"(width) // %3 665 : "r"((ptrdiff_t)-16) // %4 666 : "cc", "memory", "v0", "v1"); 755 667 } 756 668 757 669 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { 758 asm volatile ( 759 // Start at end of source row. 760 "add %0, %0, %w2, sxtw #2 \n" 761 "sub %0, %0, #16 \n" 762 "1: \n" 763 MEMACCESS(0) 764 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 765 "subs %w2, %w2, #4 \n" // 4 pixels per loop. 766 "rev64 v0.4s, v0.4s \n" 767 MEMACCESS(1) 768 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 769 MEMACCESS(1) 770 "st1 {v0.D}[0], [%1], #8 \n" 771 "b.gt 1b \n" 772 : "+r"(src), // %0 773 "+r"(dst), // %1 774 "+r"(width) // %2 775 : "r"((ptrdiff_t)-16) // %3 776 : "cc", "memory", "v0" 777 ); 670 asm volatile( 671 // Start at end of source row. 672 "add %0, %0, %w2, sxtw #2 \n" 673 "sub %0, %0, #16 \n" 674 "1: \n" 675 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 676 "subs %w2, %w2, #4 \n" // 4 pixels per loop. 677 "rev64 v0.4s, v0.4s \n" 678 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 679 "st1 {v0.D}[0], [%1], #8 \n" 680 "b.gt 1b \n" 681 : "+r"(src), // %0 682 "+r"(dst), // %1 683 "+r"(width) // %2 684 : "r"((ptrdiff_t)-16) // %3 685 : "cc", "memory", "v0"); 778 686 } 779 687 780 688 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { 781 asm volatile ( 782 "movi v4.8b, #255 \n" // Alpha 783 "1: \n" 784 MEMACCESS(0) 785 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. 786 "subs %w2, %w2, #8 \n" // 8 processed per loop. 787 MEMACCESS(1) 788 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels 789 "b.gt 1b \n" 790 : "+r"(src_rgb24), // %0 791 "+r"(dst_argb), // %1 792 "+r"(width) // %2 793 : 794 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 795 ); 689 asm volatile( 690 "movi v4.8b, #255 \n" // Alpha 691 "1: \n" 692 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. 693 "subs %w2, %w2, #8 \n" // 8 processed per loop. 694 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB 695 // pixels 696 "b.gt 1b \n" 697 : "+r"(src_rgb24), // %0 698 "+r"(dst_argb), // %1 699 "+r"(width) // %2 700 : 701 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 702 ); 796 703 } 797 704 798 705 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { 799 asm volatile ( 800 "movi v5.8b, #255 \n" // Alpha 801 "1: \n" 802 MEMACCESS(0) 803 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 804 "subs %w2, %w2, #8 \n" // 8 processed per loop. 805 "orr v3.8b, v1.8b, v1.8b \n" // move g 806 "orr v4.8b, v0.8b, v0.8b \n" // move r 807 MEMACCESS(1) 808 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a 809 "b.gt 1b \n" 810 : "+r"(src_raw), // %0 811 "+r"(dst_argb), // %1 812 "+r"(width) // %2 813 : 814 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List 815 ); 706 asm volatile( 707 "movi v5.8b, #255 \n" // Alpha 708 "1: \n" 709 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 710 "subs %w2, %w2, #8 \n" // 8 processed per loop. 711 "orr v3.8b, v1.8b, v1.8b \n" // move g 712 "orr v4.8b, v0.8b, v0.8b \n" // move r 713 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a 714 "b.gt 1b \n" 715 : "+r"(src_raw), // %0 716 "+r"(dst_argb), // %1 717 "+r"(width) // %2 718 : 719 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List 720 ); 816 721 } 817 722 818 723 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { 819 asm volatile ( 820 "1: \n" 821 MEMACCESS(0) 822 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 823 "subs %w2, %w2, #8 \n" // 8 processed per loop. 824 "orr v3.8b, v1.8b, v1.8b \n" // move g 825 "orr v4.8b, v0.8b, v0.8b \n" // move r 826 MEMACCESS(1) 827 "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r 828 "b.gt 1b \n" 829 : "+r"(src_raw), // %0 830 "+r"(dst_rgb24), // %1 831 "+r"(width) // %2 832 : 833 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List 834 ); 835 } 836 837 #define RGB565TOARGB \ 838 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ 839 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ 840 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ 841 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ 842 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 843 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ 844 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ 845 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ 846 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ 847 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ 848 "dup v2.2D, v0.D[1] \n" /* R */ 724 asm volatile( 725 "1: \n" 726 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 727 "subs %w2, %w2, #8 \n" // 8 processed per loop. 728 "orr v3.8b, v1.8b, v1.8b \n" // move g 729 "orr v4.8b, v0.8b, v0.8b \n" // move r 730 "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r 731 "b.gt 1b \n" 732 : "+r"(src_raw), // %0 733 "+r"(dst_rgb24), // %1 734 "+r"(width) // %2 735 : 736 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List 737 ); 738 } 739 740 #define RGB565TOARGB \ 741 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ 742 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ 743 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ 744 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ 745 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 746 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ 747 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ 748 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ 749 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ 750 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ 751 "dup v2.2D, v0.D[1] \n" /* R */ 849 752 850 753 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { 851 asm volatile ( 852 "movi v3.8b, #255 \n" // Alpha 853 "1: \n" 854 MEMACCESS(0) 855 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 856 "subs %w2, %w2, #8 \n" // 8 processed per loop. 857 RGB565TOARGB 858 MEMACCESS(1) 859 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 860 "b.gt 1b \n" 861 : "+r"(src_rgb565), // %0 862 "+r"(dst_argb), // %1 863 "+r"(width) // %2 864 : 865 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List 866 ); 867 } 868 869 #define ARGB1555TOARGB \ 870 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ 871 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ 872 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ 873 \ 874 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ 875 "xtn2 v3.16b, v2.8h \n" \ 876 \ 877 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 878 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ 879 \ 880 "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ 881 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ 882 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ 883 \ 884 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ 885 "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ 886 "dup v1.2D, v0.D[1] \n" \ 887 "dup v3.2D, v2.D[1] \n" 754 asm volatile( 755 "movi v3.8b, #255 \n" // Alpha 756 "1: \n" 757 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 758 "subs %w2, %w2, #8 \n" // 8 processed per loop. 759 RGB565TOARGB 760 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB 761 // pixels 762 "b.gt 1b \n" 763 : "+r"(src_rgb565), // %0 764 "+r"(dst_argb), // %1 765 "+r"(width) // %2 766 : 767 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List 768 ); 769 } 770 771 #define ARGB1555TOARGB \ 772 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ 773 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ 774 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ 775 \ 776 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ 777 "xtn2 v3.16b, v2.8h \n" \ 778 \ 779 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 780 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ 781 \ 782 "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ 783 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ 784 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ 785 \ 786 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ 787 "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ 788 "dup v1.2D, v0.D[1] \n" \ 789 "dup v3.2D, v2.D[1] \n" 888 790 889 791 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. 890 #define RGB555TOARGB \ 891 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ 892 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ 893 "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ 894 \ 895 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 896 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ 897 \ 898 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ 899 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ 900 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ 901 \ 902 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ 903 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ 904 "dup v1.2D, v0.D[1] \n" /* G */ \ 905 906 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, 792 #define RGB555TOARGB \ 793 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ 794 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ 795 "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ 796 \ 797 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 798 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ 799 \ 800 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ 801 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ 802 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ 803 \ 804 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ 805 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ 806 "dup v1.2D, v0.D[1] \n" /* G */ 807 808 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, 809 uint8* dst_argb, 907 810 int width) { 908 asm volatile 909 "movi v3.8b, #255 \n" // Alpha910 "1: \n"911 MEMACCESS(0)912 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.913 "subs %w2, %w2, #8 \n" // 8 processed per loop.914 ARGB1555TOARGB915 MEMACCESS(1)916 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels917 "b.gt 1b \n"918 : "+r"(src_argb1555), // %0919 "+r"(dst_argb), // %1920 "+r"(width) // %2921 :922 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List923 ); 924 } 925 926 #define ARGB4444TOARGB\927 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR*/ \928 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB*/ \929 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000*/ \930 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG*/ \931 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB*/ \932 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000*/ \933 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB*/ \934 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */\935 "dup v0.2D, v2.D[1] \n" \936 "dup v1.2D, v3.D[1] \n" 937 938 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,uint8* dst_argb,811 asm volatile( 812 "movi v3.8b, #255 \n" // Alpha 813 "1: \n" 814 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 815 "subs %w2, %w2, #8 \n" // 8 processed per loop. 816 ARGB1555TOARGB 817 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB 818 // pixels 819 "b.gt 1b \n" 820 : "+r"(src_argb1555), // %0 821 "+r"(dst_argb), // %1 822 "+r"(width) // %2 823 : 824 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 825 ); 826 } 827 828 #define ARGB4444TOARGB \ 829 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ 830 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ 831 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ 832 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ 833 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ 834 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ 835 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ 836 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ 837 "dup v0.2D, v2.D[1] \n" \ 838 "dup v1.2D, v3.D[1] \n" 839 840 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, 841 uint8* dst_argb, 939 842 int width) { 940 asm volatile ( 941 "1: \n" 942 MEMACCESS(0) 943 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 944 "subs %w2, %w2, #8 \n" // 8 processed per loop. 945 ARGB4444TOARGB 946 MEMACCESS(1) 947 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 948 "b.gt 1b \n" 949 : "+r"(src_argb4444), // %0 950 "+r"(dst_argb), // %1 951 "+r"(width) // %2 952 : 953 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List 954 ); 843 asm volatile( 844 "1: \n" 845 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 846 "subs %w2, %w2, #8 \n" // 8 processed per loop. 847 ARGB4444TOARGB 848 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB 849 // pixels 850 "b.gt 1b \n" 851 : "+r"(src_argb4444), // %0 852 "+r"(dst_argb), // %1 853 "+r"(width) // %2 854 : 855 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List 856 ); 955 857 } 956 858 957 859 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { 958 asm volatile 959 "1: \n"960 MEMACCESS(0)961 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGBpixels962 "subs %w2, %w2, #8 \n" // 8 processed per loop.963 MEMACCESS(1)964 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels ofRGB24.965 "b.gt 1b \n"966 : "+r"(src_argb), // %0967 "+r"(dst_rgb24), // %1968 "+r"(width)// %2969 :970 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List971 );860 asm volatile( 861 "1: \n" 862 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB 863 // pixels 864 "subs %w2, %w2, #8 \n" // 8 processed per loop. 865 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of 866 // RGB24. 867 "b.gt 1b \n" 868 : "+r"(src_argb), // %0 869 "+r"(dst_rgb24), // %1 870 "+r"(width) // %2 871 : 872 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 873 ); 972 874 } 973 875 974 876 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { 975 asm volatile ( 976 "1: \n" 977 MEMACCESS(0) 978 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a 979 "subs %w2, %w2, #8 \n" // 8 processed per loop. 980 "orr v4.8b, v2.8b, v2.8b \n" // mov g 981 "orr v5.8b, v1.8b, v1.8b \n" // mov b 982 MEMACCESS(1) 983 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b 984 "b.gt 1b \n" 985 : "+r"(src_argb), // %0 986 "+r"(dst_raw), // %1 987 "+r"(width) // %2 988 : 989 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List 990 ); 877 asm volatile( 878 "1: \n" 879 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a 880 "subs %w2, %w2, #8 \n" // 8 processed per loop. 881 "orr v4.8b, v2.8b, v2.8b \n" // mov g 882 "orr v5.8b, v1.8b, v1.8b \n" // mov b 883 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b 884 "b.gt 1b \n" 885 : "+r"(src_argb), // %0 886 "+r"(dst_raw), // %1 887 "+r"(width) // %2 888 : 889 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List 890 ); 991 891 } 992 892 993 893 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { 994 asm volatile ( 995 "1: \n" 996 MEMACCESS(0) 997 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. 998 "subs %w2, %w2, #16 \n" // 16 processed per loop. 999 MEMACCESS(1) 1000 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. 1001 "b.gt 1b \n" 1002 : "+r"(src_yuy2), // %0 1003 "+r"(dst_y), // %1 1004 "+r"(width) // %2 1005 : 1006 : "cc", "memory", "v0", "v1" // Clobber List 1007 ); 894 asm volatile( 895 "1: \n" 896 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. 897 "subs %w2, %w2, #16 \n" // 16 processed per loop. 898 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. 899 "b.gt 1b \n" 900 : "+r"(src_yuy2), // %0 901 "+r"(dst_y), // %1 902 "+r"(width) // %2 903 : 904 : "cc", "memory", "v0", "v1" // Clobber List 905 ); 1008 906 } 1009 907 1010 908 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { 1011 asm volatile 1012 "1: \n"1013 MEMACCESS(0)1014 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.1015 "subs %w2, %w2, #16 \n" // 16 processed per loop.1016 MEMACCESS(1)1017 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.1018 "b.gt 1b \n"1019 : "+r"(src_uyvy), // %01020 "+r"(dst_y), // %11021 "+r"(width) // %21022 :1023 : "cc", "memory", "v0", "v1" // Clobber List 1024 ); 1025 } 1026 1027 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u,uint8* dst_v,909 asm volatile( 910 "1: \n" 911 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. 912 "subs %w2, %w2, #16 \n" // 16 processed per loop. 913 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. 914 "b.gt 1b \n" 915 : "+r"(src_uyvy), // %0 916 "+r"(dst_y), // %1 917 "+r"(width) // %2 918 : 919 : "cc", "memory", "v0", "v1" // Clobber List 920 ); 921 } 922 923 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, 924 uint8* dst_u, 925 uint8* dst_v, 1028 926 int width) { 1029 asm volatile 1030 "1: \n"1031 MEMACCESS(0)1032 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2pixels1033 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.1034 MEMACCESS(1)1035 "st1 {v1.8b}, [%1], #8 \n" // store 8 U.1036 MEMACCESS(2)1037 "st1 {v3.8b}, [%2], #8 \n" // store 8 V.1038 "b.gt 1b \n"1039 : "+r"(src_yuy2), // %01040 "+r"(dst_u), // %11041 "+r"(dst_v), // %21042 "+r"(width) // %31043 :1044 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1045 ); 1046 } 1047 1048 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u,uint8* dst_v,927 asm volatile( 928 "1: \n" 929 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 930 // pixels 931 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 932 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. 933 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. 934 "b.gt 1b \n" 935 : "+r"(src_yuy2), // %0 936 "+r"(dst_u), // %1 937 "+r"(dst_v), // %2 938 "+r"(width) // %3 939 : 940 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 941 ); 942 } 943 944 void UYVYToUV422Row_NEON(const uint8* src_uyvy, 945 uint8* dst_u, 946 uint8* dst_v, 1049 947 int width) { 1050 asm volatile ( 1051 "1: \n" 1052 MEMACCESS(0) 1053 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels 1054 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 1055 MEMACCESS(1) 1056 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. 1057 MEMACCESS(2) 1058 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. 1059 "b.gt 1b \n" 1060 : "+r"(src_uyvy), // %0 1061 "+r"(dst_u), // %1 1062 "+r"(dst_v), // %2 1063 "+r"(width) // %3 1064 : 1065 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1066 ); 1067 } 1068 1069 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, 1070 uint8* dst_u, uint8* dst_v, int width) { 948 asm volatile( 949 "1: \n" 950 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY 951 // pixels 952 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 953 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. 954 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. 955 "b.gt 1b \n" 956 : "+r"(src_uyvy), // %0 957 "+r"(dst_u), // %1 958 "+r"(dst_v), // %2 959 "+r"(width) // %3 960 : 961 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 962 ); 963 } 964 965 void YUY2ToUVRow_NEON(const uint8* src_yuy2, 966 int stride_yuy2, 967 uint8* dst_u, 968 uint8* dst_v, 969 int width) { 1071 970 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; 1072 asm volatile ( 1073 "1: \n" 1074 MEMACCESS(0) 1075 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1076 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. 1077 MEMACCESS(1) 1078 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row 1079 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U 1080 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V 1081 MEMACCESS(2) 1082 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. 1083 MEMACCESS(3) 1084 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 1085 "b.gt 1b \n" 1086 : "+r"(src_yuy2), // %0 1087 "+r"(src_yuy2b), // %1 1088 "+r"(dst_u), // %2 1089 "+r"(dst_v), // %3 1090 "+r"(width) // %4 1091 : 1092 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1093 "v5", "v6", "v7" // Clobber List 1094 ); 1095 } 1096 1097 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, 1098 uint8* dst_u, uint8* dst_v, int width) { 971 asm volatile( 972 "1: \n" 973 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 974 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. 975 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row 976 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U 977 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V 978 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. 979 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 980 "b.gt 1b \n" 981 : "+r"(src_yuy2), // %0 982 "+r"(src_yuy2b), // %1 983 "+r"(dst_u), // %2 984 "+r"(dst_v), // %3 985 "+r"(width) // %4 986 : 987 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 988 "v7" // Clobber List 989 ); 990 } 991 992 void UYVYToUVRow_NEON(const uint8* src_uyvy, 993 int stride_uyvy, 994 uint8* dst_u, 995 uint8* dst_v, 996 int width) { 1099 997 const uint8* src_uyvyb = src_uyvy + stride_uyvy; 1100 asm volatile ( 1101 "1: \n" 1102 MEMACCESS(0) 1103 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1104 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. 1105 MEMACCESS(1) 1106 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row 1107 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U 1108 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V 1109 MEMACCESS(2) 1110 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. 1111 MEMACCESS(3) 1112 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. 1113 "b.gt 1b \n" 1114 : "+r"(src_uyvy), // %0 1115 "+r"(src_uyvyb), // %1 1116 "+r"(dst_u), // %2 1117 "+r"(dst_v), // %3 1118 "+r"(width) // %4 1119 : 1120 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1121 "v5", "v6", "v7" // Clobber List 1122 ); 998 asm volatile( 999 "1: \n" 1000 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1001 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. 1002 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row 1003 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U 1004 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V 1005 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. 1006 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. 1007 "b.gt 1b \n" 1008 : "+r"(src_uyvy), // %0 1009 "+r"(src_uyvyb), // %1 1010 "+r"(dst_u), // %2 1011 "+r"(dst_v), // %3 1012 "+r"(width) // %4 1013 : 1014 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 1015 "v7" // Clobber List 1016 ); 1123 1017 } 1124 1018 1125 1019 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 1126 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, 1127 const uint8* shuffler, int width) { 1128 asm volatile ( 1129 MEMACCESS(3) 1130 "ld1 {v2.16b}, [%3] \n" // shuffler 1131 "1: \n" 1132 MEMACCESS(0) 1133 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 1134 "subs %w2, %w2, #4 \n" // 4 processed per loop 1135 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels 1136 MEMACCESS(1) 1137 "st1 {v1.16b}, [%1], #16 \n" // store 4. 1138 "b.gt 1b \n" 1139 : "+r"(src_argb), // %0 1140 "+r"(dst_argb), // %1 1141 "+r"(width) // %2 1142 : "r"(shuffler) // %3 1143 : "cc", "memory", "v0", "v1", "v2" // Clobber List 1144 ); 1020 void ARGBShuffleRow_NEON(const uint8* src_argb, 1021 uint8* dst_argb, 1022 const uint8* shuffler, 1023 int width) { 1024 asm volatile( 1025 "ld1 {v2.16b}, [%3] \n" // shuffler 1026 "1: \n" 1027 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 1028 "subs %w2, %w2, #4 \n" // 4 processed per loop 1029 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels 1030 "st1 {v1.16b}, [%1], #16 \n" // store 4. 1031 "b.gt 1b \n" 1032 : "+r"(src_argb), // %0 1033 "+r"(dst_argb), // %1 1034 "+r"(width) // %2 1035 : "r"(shuffler) // %3 1036 : "cc", "memory", "v0", "v1", "v2" // Clobber List 1037 ); 1145 1038 } 1146 1039 … … 1148 1041 const uint8* src_u, 1149 1042 const uint8* src_v, 1150 uint8* dst_yuy2, int width) { 1151 asm volatile ( 1152 "1: \n" 1153 MEMACCESS(0) 1154 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys 1155 "orr v2.8b, v1.8b, v1.8b \n" 1156 MEMACCESS(1) 1157 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us 1158 MEMACCESS(2) 1159 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs 1160 "subs %w4, %w4, #16 \n" // 16 pixels 1161 MEMACCESS(3) 1162 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 1163 "b.gt 1b \n" 1164 : "+r"(src_y), // %0 1165 "+r"(src_u), // %1 1166 "+r"(src_v), // %2 1167 "+r"(dst_yuy2), // %3 1168 "+r"(width) // %4 1169 : 1170 : "cc", "memory", "v0", "v1", "v2", "v3" 1171 ); 1043 uint8* dst_yuy2, 1044 int width) { 1045 asm volatile( 1046 "1: \n" 1047 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys 1048 "orr v2.8b, v1.8b, v1.8b \n" 1049 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us 1050 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs 1051 "subs %w4, %w4, #16 \n" // 16 pixels 1052 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 1053 "b.gt 1b \n" 1054 : "+r"(src_y), // %0 1055 "+r"(src_u), // %1 1056 "+r"(src_v), // %2 1057 "+r"(dst_yuy2), // %3 1058 "+r"(width) // %4 1059 : 1060 : "cc", "memory", "v0", "v1", "v2", "v3"); 1172 1061 } 1173 1062 … … 1175 1064 const uint8* src_u, 1176 1065 const uint8* src_v, 1177 uint8* dst_uyvy, int width) { 1178 asm volatile ( 1179 "1: \n" 1180 MEMACCESS(0) 1181 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys 1182 "orr v3.8b, v2.8b, v2.8b \n" 1183 MEMACCESS(1) 1184 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us 1185 MEMACCESS(2) 1186 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs 1187 "subs %w4, %w4, #16 \n" // 16 pixels 1188 MEMACCESS(3) 1189 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 1190 "b.gt 1b \n" 1191 : "+r"(src_y), // %0 1192 "+r"(src_u), // %1 1193 "+r"(src_v), // %2 1194 "+r"(dst_uyvy), // %3 1195 "+r"(width) // %4 1196 : 1197 : "cc", "memory", "v0", "v1", "v2", "v3" 1198 ); 1066 uint8* dst_uyvy, 1067 int width) { 1068 asm volatile( 1069 "1: \n" 1070 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys 1071 "orr v3.8b, v2.8b, v2.8b \n" 1072 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us 1073 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs 1074 "subs %w4, %w4, #16 \n" // 16 pixels 1075 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 1076 "b.gt 1b \n" 1077 : "+r"(src_y), // %0 1078 "+r"(src_u), // %1 1079 "+r"(src_v), // %2 1080 "+r"(dst_uyvy), // %3 1081 "+r"(width) // %4 1082 : 1083 : "cc", "memory", "v0", "v1", "v2", "v3"); 1199 1084 } 1200 1085 1201 1086 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { 1202 asm volatile ( 1203 "1: \n" 1204 MEMACCESS(0) 1205 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1206 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1207 ARGBTORGB565 1208 MEMACCESS(1) 1209 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. 1210 "b.gt 1b \n" 1211 : "+r"(src_argb), // %0 1212 "+r"(dst_rgb565), // %1 1213 "+r"(width) // %2 1214 : 1215 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" 1216 ); 1217 } 1218 1219 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, 1220 const uint32 dither4, int width) { 1221 asm volatile ( 1222 "dup v1.4s, %w2 \n" // dither4 1223 "1: \n" 1224 MEMACCESS(1) 1225 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels 1226 "subs %w3, %w3, #8 \n" // 8 processed per loop. 1227 "uqadd v20.8b, v20.8b, v1.8b \n" 1228 "uqadd v21.8b, v21.8b, v1.8b \n" 1229 "uqadd v22.8b, v22.8b, v1.8b \n" 1230 ARGBTORGB565 1231 MEMACCESS(0) 1232 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. 1233 "b.gt 1b \n" 1234 : "+r"(dst_rgb) // %0 1235 : "r"(src_argb), // %1 1236 "r"(dither4), // %2 1237 "r"(width) // %3 1238 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" 1239 ); 1240 } 1241 1242 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, 1087 asm volatile( 1088 "1: \n" 1089 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1090 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1091 ARGBTORGB565 1092 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. 1093 "b.gt 1b \n" 1094 : "+r"(src_argb), // %0 1095 "+r"(dst_rgb565), // %1 1096 "+r"(width) // %2 1097 : 1098 : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); 1099 } 1100 1101 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, 1102 uint8* dst_rgb, 1103 const uint32 dither4, 1104 int width) { 1105 asm volatile( 1106 "dup v1.4s, %w2 \n" // dither4 1107 "1: \n" 1108 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels 1109 "subs %w3, %w3, #8 \n" // 8 processed per loop. 1110 "uqadd v20.8b, v20.8b, v1.8b \n" 1111 "uqadd v21.8b, v21.8b, v1.8b \n" 1112 "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 1113 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. 1114 "b.gt 1b \n" 1115 : "+r"(dst_rgb) // %0 1116 : "r"(src_argb), // %1 1117 "r"(dither4), // %2 1118 "r"(width) // %3 1119 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"); 1120 } 1121 1122 void ARGBToARGB1555Row_NEON(const uint8* src_argb, 1123 uint8* dst_argb1555, 1243 1124 int width) { 1244 asm volatile ( 1245 "1: \n" 1246 MEMACCESS(0) 1247 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1248 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1249 ARGBTOARGB1555 1250 MEMACCESS(1) 1251 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. 1252 "b.gt 1b \n" 1253 : "+r"(src_argb), // %0 1254 "+r"(dst_argb1555), // %1 1255 "+r"(width) // %2 1256 : 1257 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" 1258 ); 1259 } 1260 1261 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, 1125 asm volatile( 1126 "1: \n" 1127 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1128 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1129 ARGBTOARGB1555 1130 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels 1131 // ARGB1555. 1132 "b.gt 1b \n" 1133 : "+r"(src_argb), // %0 1134 "+r"(dst_argb1555), // %1 1135 "+r"(width) // %2 1136 : 1137 : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); 1138 } 1139 1140 void ARGBToARGB4444Row_NEON(const uint8* src_argb, 1141 uint8* dst_argb4444, 1262 1142 int width) { 1263 asm volatile ( 1264 "movi v4.16b, #0x0f \n" // bits to clear with vbic. 1265 "1: \n" 1266 MEMACCESS(0) 1267 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1268 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1269 ARGBTOARGB4444 1270 MEMACCESS(1) 1271 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. 1272 "b.gt 1b \n" 1273 : "+r"(src_argb), // %0 1274 "+r"(dst_argb4444), // %1 1275 "+r"(width) // %2 1276 : 1277 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" 1278 ); 1143 asm volatile( 1144 "movi v4.16b, #0x0f \n" // bits to clear with 1145 // vbic. 1146 "1: \n" 1147 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1148 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1149 ARGBTOARGB4444 1150 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels 1151 // ARGB4444. 1152 "b.gt 1b \n" 1153 : "+r"(src_argb), // %0 1154 "+r"(dst_argb4444), // %1 1155 "+r"(width) // %2 1156 : 1157 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"); 1279 1158 } 1280 1159 1281 1160 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { 1282 asm volatile ( 1283 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1284 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1285 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1286 "movi v7.8b, #16 \n" // Add 16 constant 1287 "1: \n" 1288 MEMACCESS(0) 1289 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1290 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1291 "umull v3.8h, v0.8b, v4.8b \n" // B 1292 "umlal v3.8h, v1.8b, v5.8b \n" // G 1293 "umlal v3.8h, v2.8b, v6.8b \n" // R 1294 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1295 "uqadd v0.8b, v0.8b, v7.8b \n" 1296 MEMACCESS(1) 1297 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1298 "b.gt 1b \n" 1299 : "+r"(src_argb), // %0 1300 "+r"(dst_y), // %1 1301 "+r"(width) // %2 1302 : 1303 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 1304 ); 1161 asm volatile( 1162 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1163 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1164 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1165 "movi v7.8b, #16 \n" // Add 16 constant 1166 "1: \n" 1167 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 1168 // pixels. 1169 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1170 "umull v3.8h, v0.8b, v4.8b \n" // B 1171 "umlal v3.8h, v1.8b, v5.8b \n" // G 1172 "umlal v3.8h, v2.8b, v6.8b \n" // R 1173 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1174 "uqadd v0.8b, v0.8b, v7.8b \n" 1175 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1176 "b.gt 1b \n" 1177 : "+r"(src_argb), // %0 1178 "+r"(dst_y), // %1 1179 "+r"(width) // %2 1180 : 1181 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); 1305 1182 } 1306 1183 1307 1184 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { 1308 asm volatile ( 1309 "1: \n" 1310 MEMACCESS(0) 1311 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels 1312 "subs %w2, %w2, #16 \n" // 16 processed per loop 1313 MEMACCESS(1) 1314 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. 1315 "b.gt 1b \n" 1316 : "+r"(src_argb), // %0 1317 "+r"(dst_a), // %1 1318 "+r"(width) // %2 1319 : 1320 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1321 ); 1185 asm volatile( 1186 "1: \n" 1187 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 1188 // pixels 1189 "subs %w2, %w2, #16 \n" // 16 processed per loop 1190 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. 1191 "b.gt 1b \n" 1192 : "+r"(src_argb), // %0 1193 "+r"(dst_a), // %1 1194 "+r"(width) // %2 1195 : 1196 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1197 ); 1322 1198 } 1323 1199 1324 1200 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { 1325 asm volatile ( 1326 "movi v4.8b, #15 \n" // B * 0.11400 coefficient 1327 "movi v5.8b, #75 \n" // G * 0.58700 coefficient 1328 "movi v6.8b, #38 \n" // R * 0.29900 coefficient 1329 "1: \n" 1330 MEMACCESS(0) 1331 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1332 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1333 "umull v3.8h, v0.8b, v4.8b \n" // B 1334 "umlal v3.8h, v1.8b, v5.8b \n" // G 1335 "umlal v3.8h, v2.8b, v6.8b \n" // R 1336 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y 1337 MEMACCESS(1) 1338 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1339 "b.gt 1b \n" 1340 : "+r"(src_argb), // %0 1341 "+r"(dst_y), // %1 1342 "+r"(width) // %2 1343 : 1344 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 1345 ); 1201 asm volatile( 1202 "movi v4.8b, #15 \n" // B * 0.11400 coefficient 1203 "movi v5.8b, #75 \n" // G * 0.58700 coefficient 1204 "movi v6.8b, #38 \n" // R * 0.29900 coefficient 1205 "1: \n" 1206 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 1207 // pixels. 1208 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1209 "umull v3.8h, v0.8b, v4.8b \n" // B 1210 "umlal v3.8h, v1.8b, v5.8b \n" // G 1211 "umlal v3.8h, v2.8b, v6.8b \n" // R 1212 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y 1213 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1214 "b.gt 1b \n" 1215 : "+r"(src_argb), // %0 1216 "+r"(dst_y), // %1 1217 "+r"(width) // %2 1218 : 1219 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); 1346 1220 } 1347 1221 1348 1222 // 8x1 pixels. 1349 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1223 void ARGBToUV444Row_NEON(const uint8* src_argb, 1224 uint8* dst_u, 1225 uint8* dst_v, 1350 1226 int width) { 1351 asm volatile ( 1352 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient 1353 "movi v25.8b, #74 \n" // UG -0.5781 coefficient 1354 "movi v26.8b, #38 \n" // UR -0.2969 coefficient 1355 "movi v27.8b, #18 \n" // VB -0.1406 coefficient 1356 "movi v28.8b, #94 \n" // VG -0.7344 coefficient 1357 "movi v29.16b,#0x80 \n" // 128.5 1358 "1: \n" 1359 MEMACCESS(0) 1360 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1361 "subs %w3, %w3, #8 \n" // 8 processed per loop. 1362 "umull v4.8h, v0.8b, v24.8b \n" // B 1363 "umlsl v4.8h, v1.8b, v25.8b \n" // G 1364 "umlsl v4.8h, v2.8b, v26.8b \n" // R 1365 "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned 1366 1367 "umull v3.8h, v2.8b, v24.8b \n" // R 1368 "umlsl v3.8h, v1.8b, v28.8b \n" // G 1369 "umlsl v3.8h, v0.8b, v27.8b \n" // B 1370 "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned 1371 1372 "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U 1373 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V 1374 1375 MEMACCESS(1) 1376 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. 1377 MEMACCESS(2) 1378 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. 1379 "b.gt 1b \n" 1380 : "+r"(src_argb), // %0 1381 "+r"(dst_u), // %1 1382 "+r"(dst_v), // %2 1383 "+r"(width) // %3 1384 : 1385 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1386 "v24", "v25", "v26", "v27", "v28", "v29" 1387 ); 1388 } 1389 1390 #define RGBTOUV_SETUP_REG \ 1391 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ 1392 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ 1393 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ 1394 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ 1395 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ 1396 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ 1397 1398 // 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. 1399 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1400 int width) { 1227 asm volatile( 1228 "movi v24.8b, #112 \n" // UB / VR 0.875 1229 // coefficient 1230 "movi v25.8b, #74 \n" // UG -0.5781 coefficient 1231 "movi v26.8b, #38 \n" // UR -0.2969 coefficient 1232 "movi v27.8b, #18 \n" // VB -0.1406 coefficient 1233 "movi v28.8b, #94 \n" // VG -0.7344 coefficient 1234 "movi v29.16b,#0x80 \n" // 128.5 1235 "1: \n" 1236 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 1237 // pixels. 1238 "subs %w3, %w3, #8 \n" // 8 processed per loop. 1239 "umull v4.8h, v0.8b, v24.8b \n" // B 1240 "umlsl v4.8h, v1.8b, v25.8b \n" // G 1241 "umlsl v4.8h, v2.8b, v26.8b \n" // R 1242 "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned 1243 1244 "umull v3.8h, v2.8b, v24.8b \n" // R 1245 "umlsl v3.8h, v1.8b, v28.8b \n" // G 1246 "umlsl v3.8h, v0.8b, v27.8b \n" // B 1247 "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned 1248 1249 "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U 1250 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V 1251 1252 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. 1253 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. 1254 "b.gt 1b \n" 1255 : "+r"(src_argb), // %0 1256 "+r"(dst_u), // %1 1257 "+r"(dst_v), // %2 1258 "+r"(width) // %3 1259 : 1260 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", 1261 "v27", "v28", "v29"); 1262 } 1263 1264 #define RGBTOUV_SETUP_REG \ 1265 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ 1266 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ 1267 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ 1268 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ 1269 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ 1270 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ 1271 1272 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 1273 #define RGBTOUV(QB, QG, QR) \ 1274 "mul v3.8h, " #QB \ 1275 ",v20.8h \n" /* B */ \ 1276 "mul v4.8h, " #QR \ 1277 ",v20.8h \n" /* R */ \ 1278 "mls v3.8h, " #QG \ 1279 ",v21.8h \n" /* G */ \ 1280 "mls v4.8h, " #QG \ 1281 ",v24.8h \n" /* G */ \ 1282 "mls v3.8h, " #QR \ 1283 ",v22.8h \n" /* R */ \ 1284 "mls v4.8h, " #QB \ 1285 ",v23.8h \n" /* B */ \ 1286 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ 1287 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ 1288 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ 1289 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ 1290 1291 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. 1292 // TODO(fbarchard): consider ptrdiff_t for all strides. 1293 1294 void ARGBToUVRow_NEON(const uint8* src_argb, 1295 int src_stride_argb, 1296 uint8* dst_u, 1297 uint8* dst_v, 1298 int width) { 1299 const uint8* src_argb_1 = src_argb + src_stride_argb; 1401 1300 asm volatile ( 1402 1301 RGBTOUV_SETUP_REG 1403 1302 "1: \n" 1404 MEMACCESS(0)1405 1303 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1406 1304 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1407 1305 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1408 1306 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 1409 MEMACCESS(0) 1410 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. 1411 "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts. 1412 "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts. 1413 "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. 1414 1415 "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. 1416 "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. 1417 "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. 1418 1419 "urshr v0.8h, v0.8h, #1 \n" // 2x average 1420 "urshr v1.8h, v1.8h, #1 \n" 1421 "urshr v2.8h, v2.8h, #1 \n" 1422 1423 "subs %w3, %w3, #32 \n" // 32 processed per loop. 1424 "mul v3.8h, v0.8h, v20.8h \n" // B 1425 "mls v3.8h, v1.8h, v21.8h \n" // G 1426 "mls v3.8h, v2.8h, v22.8h \n" // R 1427 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned 1428 "mul v4.8h, v2.8h, v20.8h \n" // R 1429 "mls v4.8h, v1.8h, v24.8h \n" // G 1430 "mls v4.8h, v0.8h, v23.8h \n" // B 1431 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned 1432 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U 1433 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V 1434 MEMACCESS(1) 1435 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. 1436 MEMACCESS(2) 1437 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. 1438 "b.gt 1b \n" 1439 : "+r"(src_argb), // %0 1440 "+r"(dst_u), // %1 1441 "+r"(dst_v), // %2 1442 "+r"(width) // %3 1443 : 1444 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1445 "v20", "v21", "v22", "v23", "v24", "v25" 1446 ); 1447 } 1448 1449 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 1450 #define RGBTOUV(QB, QG, QR) \ 1451 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ 1452 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ 1453 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ 1454 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ 1455 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ 1456 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ 1457 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ 1458 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ 1459 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ 1460 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ 1461 1462 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. 1463 // TODO(fbarchard): consider ptrdiff_t for all strides. 1464 1465 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, 1466 uint8* dst_u, uint8* dst_v, int width) { 1467 const uint8* src_argb_1 = src_argb + src_stride_argb; 1468 asm volatile ( 1469 RGBTOUV_SETUP_REG 1470 "1: \n" 1471 MEMACCESS(0) 1472 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1473 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1474 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1475 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 1476 1477 MEMACCESS(1) 1307 1478 1308 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 1479 1309 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. … … 1487 1317 "subs %w4, %w4, #16 \n" // 32 processed per loop. 1488 1318 RGBTOUV(v0.8h, v1.8h, v2.8h) 1489 MEMACCESS(2)1490 1319 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1491 MEMACCESS(3)1492 1320 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1493 1321 "b.gt 1b \n" … … 1504 1332 1505 1333 // TODO(fbarchard): Subsample match C code. 1506 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, 1507 uint8* dst_u, uint8* dst_v, int width) { 1334 void ARGBToUVJRow_NEON(const uint8* src_argb, 1335 int src_stride_argb, 1336 uint8* dst_u, 1337 uint8* dst_v, 1338 int width) { 1508 1339 const uint8* src_argb_1 = src_argb + src_stride_argb; 1509 1340 asm volatile ( … … 1515 1346 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) 1516 1347 "1: \n" 1517 MEMACCESS(0)1518 1348 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1519 1349 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1520 1350 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1521 1351 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 1522 MEMACCESS(1)1523 1352 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 1524 1353 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. … … 1532 1361 "subs %w4, %w4, #16 \n" // 32 processed per loop. 1533 1362 RGBTOUV(v0.8h, v1.8h, v2.8h) 1534 MEMACCESS(2)1535 1363 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1536 MEMACCESS(3)1537 1364 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1538 1365 "b.gt 1b \n" … … 1548 1375 } 1549 1376 1550 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, 1551 uint8* dst_u, uint8* dst_v, int width) { 1377 void BGRAToUVRow_NEON(const uint8* src_bgra, 1378 int src_stride_bgra, 1379 uint8* dst_u, 1380 uint8* dst_v, 1381 int width) { 1552 1382 const uint8* src_bgra_1 = src_bgra + src_stride_bgra; 1553 1383 asm volatile ( 1554 1384 RGBTOUV_SETUP_REG 1555 1385 "1: \n" 1556 MEMACCESS(0)1557 1386 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1558 1387 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. 1559 1388 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. 1560 1389 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. 1561 MEMACCESS(1)1562 1390 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more 1563 1391 "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. … … 1571 1399 "subs %w4, %w4, #16 \n" // 32 processed per loop. 1572 1400 RGBTOUV(v0.8h, v1.8h, v2.8h) 1573 MEMACCESS(2)1574 1401 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1575 MEMACCESS(3)1576 1402 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1577 1403 "b.gt 1b \n" … … 1587 1413 } 1588 1414 1589 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, 1590 uint8* dst_u, uint8* dst_v, int width) { 1415 void ABGRToUVRow_NEON(const uint8* src_abgr, 1416 int src_stride_abgr, 1417 uint8* dst_u, 1418 uint8* dst_v, 1419 int width) { 1591 1420 const uint8* src_abgr_1 = src_abgr + src_stride_abgr; 1592 1421 asm volatile ( 1593 1422 RGBTOUV_SETUP_REG 1594 1423 "1: \n" 1595 MEMACCESS(0)1596 1424 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1597 1425 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. 1598 1426 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1599 1427 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. 1600 MEMACCESS(1)1601 1428 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. 1602 1429 "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. … … 1610 1437 "subs %w4, %w4, #16 \n" // 32 processed per loop. 1611 1438 RGBTOUV(v0.8h, v2.8h, v1.8h) 1612 MEMACCESS(2)1613 1439 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1614 MEMACCESS(3)1615 1440 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1616 1441 "b.gt 1b \n" … … 1626 1451 } 1627 1452 1628 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, 1629 uint8* dst_u, uint8* dst_v, int width) { 1453 void RGBAToUVRow_NEON(const uint8* src_rgba, 1454 int src_stride_rgba, 1455 uint8* dst_u, 1456 uint8* dst_v, 1457 int width) { 1630 1458 const uint8* src_rgba_1 = src_rgba + src_stride_rgba; 1631 1459 asm volatile ( 1632 1460 RGBTOUV_SETUP_REG 1633 1461 "1: \n" 1634 MEMACCESS(0)1635 1462 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1636 1463 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. 1637 1464 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. 1638 1465 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. 1639 MEMACCESS(1)1640 1466 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. 1641 1467 "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. … … 1649 1475 "subs %w4, %w4, #16 \n" // 32 processed per loop. 1650 1476 RGBTOUV(v0.8h, v1.8h, v2.8h) 1651 MEMACCESS(2)1652 1477 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1653 MEMACCESS(3)1654 1478 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1655 1479 "b.gt 1b \n" … … 1665 1489 } 1666 1490 1667 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, 1668 uint8* dst_u, uint8* dst_v, int width) { 1491 void RGB24ToUVRow_NEON(const uint8* src_rgb24, 1492 int src_stride_rgb24, 1493 uint8* dst_u, 1494 uint8* dst_v, 1495 int width) { 1669 1496 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; 1670 1497 asm volatile ( 1671 1498 RGBTOUV_SETUP_REG 1672 1499 "1: \n" 1673 MEMACCESS(0)1674 1500 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. 1675 1501 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1676 1502 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1677 1503 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 1678 MEMACCESS(1)1679 1504 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. 1680 1505 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. … … 1688 1513 "subs %w4, %w4, #16 \n" // 32 processed per loop. 1689 1514 RGBTOUV(v0.8h, v1.8h, v2.8h) 1690 MEMACCESS(2)1691 1515 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1692 MEMACCESS(3)1693 1516 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1694 1517 "b.gt 1b \n" … … 1704 1527 } 1705 1528 1706 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, 1707 uint8* dst_u, uint8* dst_v, int width) { 1529 void RAWToUVRow_NEON(const uint8* src_raw, 1530 int src_stride_raw, 1531 uint8* dst_u, 1532 uint8* dst_v, 1533 int width) { 1708 1534 const uint8* src_raw_1 = src_raw + src_stride_raw; 1709 1535 asm volatile ( 1710 1536 RGBTOUV_SETUP_REG 1711 1537 "1: \n" 1712 MEMACCESS(0)1713 1538 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. 1714 1539 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. 1715 1540 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1716 1541 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. 1717 MEMACCESS(1)1718 1542 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels 1719 1543 "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. … … 1727 1551 "subs %w4, %w4, #16 \n" // 32 processed per loop. 1728 1552 RGBTOUV(v2.8h, v1.8h, v0.8h) 1729 MEMACCESS(2)1730 1553 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1731 MEMACCESS(3)1732 1554 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1733 1555 "b.gt 1b \n" … … 1744 1566 1745 1567 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 1746 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, 1747 uint8* dst_u, uint8* dst_v, int width) { 1568 void RGB565ToUVRow_NEON(const uint8* src_rgb565, 1569 int src_stride_rgb565, 1570 uint8* dst_u, 1571 uint8* dst_v, 1572 int width) { 1748 1573 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; 1749 asm volatile ( 1750 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 1751 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 1752 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 1753 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 1754 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 1755 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) 1756 "1: \n" 1757 MEMACCESS(0) 1758 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 1759 RGB565TOARGB 1760 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1761 "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1762 "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1763 MEMACCESS(0) 1764 "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. 1765 RGB565TOARGB 1766 "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1767 "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1768 "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1769 1770 MEMACCESS(1) 1771 "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. 1772 RGB565TOARGB 1773 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1774 "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1775 "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1776 MEMACCESS(1) 1777 "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. 1778 RGB565TOARGB 1779 "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1780 "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1781 "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1782 1783 "ins v16.D[1], v17.D[0] \n" 1784 "ins v18.D[1], v19.D[0] \n" 1785 "ins v20.D[1], v21.D[0] \n" 1786 1787 "urshr v4.8h, v16.8h, #1 \n" // 2x average 1788 "urshr v5.8h, v18.8h, #1 \n" 1789 "urshr v6.8h, v20.8h, #1 \n" 1790 1791 "subs %w4, %w4, #16 \n" // 16 processed per loop. 1792 "mul v16.8h, v4.8h, v22.8h \n" // B 1793 "mls v16.8h, v5.8h, v23.8h \n" // G 1794 "mls v16.8h, v6.8h, v24.8h \n" // R 1795 "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned 1796 "mul v17.8h, v6.8h, v22.8h \n" // R 1797 "mls v17.8h, v5.8h, v26.8h \n" // G 1798 "mls v17.8h, v4.8h, v25.8h \n" // B 1799 "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned 1800 "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U 1801 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V 1802 MEMACCESS(2) 1803 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1804 MEMACCESS(3) 1805 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1806 "b.gt 1b \n" 1807 : "+r"(src_rgb565), // %0 1808 "+r"(src_rgb565_1), // %1 1809 "+r"(dst_u), // %2 1810 "+r"(dst_v), // %3 1811 "+r"(width) // %4 1812 : 1813 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1814 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", 1815 "v25", "v26", "v27" 1816 ); 1574 asm volatile( 1575 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 1576 // 2 1577 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 1578 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 1579 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 1580 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 1581 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 1582 // 16-bit) 1583 "1: \n" 1584 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 1585 RGB565TOARGB 1586 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1587 "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1588 "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1589 "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. 1590 RGB565TOARGB 1591 "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1592 "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1593 "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1594 1595 "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. 1596 RGB565TOARGB 1597 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1598 "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1599 "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1600 "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. 1601 RGB565TOARGB 1602 "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1603 "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1604 "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1605 1606 "ins v16.D[1], v17.D[0] \n" 1607 "ins v18.D[1], v19.D[0] \n" 1608 "ins v20.D[1], v21.D[0] \n" 1609 1610 "urshr v4.8h, v16.8h, #1 \n" // 2x average 1611 "urshr v5.8h, v18.8h, #1 \n" 1612 "urshr v6.8h, v20.8h, #1 \n" 1613 1614 "subs %w4, %w4, #16 \n" // 16 processed per loop. 1615 "mul v16.8h, v4.8h, v22.8h \n" // B 1616 "mls v16.8h, v5.8h, v23.8h \n" // G 1617 "mls v16.8h, v6.8h, v24.8h \n" // R 1618 "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned 1619 "mul v17.8h, v6.8h, v22.8h \n" // R 1620 "mls v17.8h, v5.8h, v26.8h \n" // G 1621 "mls v17.8h, v4.8h, v25.8h \n" // B 1622 "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned 1623 "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U 1624 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V 1625 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1626 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1627 "b.gt 1b \n" 1628 : "+r"(src_rgb565), // %0 1629 "+r"(src_rgb565_1), // %1 1630 "+r"(dst_u), // %2 1631 "+r"(dst_v), // %3 1632 "+r"(width) // %4 1633 : 1634 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", 1635 "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", 1636 "v27"); 1817 1637 } 1818 1638 1819 1639 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 1820 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, 1821 uint8* dst_u, uint8* dst_v, int width) { 1640 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, 1641 int src_stride_argb1555, 1642 uint8* dst_u, 1643 uint8* dst_v, 1644 int width) { 1822 1645 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; 1823 asm volatile ( 1824 RGBTOUV_SETUP_REG 1825 "1: \n" 1826 MEMACCESS(0) 1827 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 1828 RGB555TOARGB 1829 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1830 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1831 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1832 MEMACCESS(0) 1833 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. 1834 RGB555TOARGB 1835 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1836 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1837 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1838 1839 MEMACCESS(1) 1840 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. 1841 RGB555TOARGB 1842 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1843 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1844 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1845 MEMACCESS(1) 1846 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. 1847 RGB555TOARGB 1848 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1849 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1850 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1851 1852 "ins v16.D[1], v26.D[0] \n" 1853 "ins v17.D[1], v27.D[0] \n" 1854 "ins v18.D[1], v28.D[0] \n" 1855 1856 "urshr v4.8h, v16.8h, #1 \n" // 2x average 1857 "urshr v5.8h, v17.8h, #1 \n" 1858 "urshr v6.8h, v18.8h, #1 \n" 1859 1860 "subs %w4, %w4, #16 \n" // 16 processed per loop. 1861 "mul v2.8h, v4.8h, v20.8h \n" // B 1862 "mls v2.8h, v5.8h, v21.8h \n" // G 1863 "mls v2.8h, v6.8h, v22.8h \n" // R 1864 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned 1865 "mul v3.8h, v6.8h, v20.8h \n" // R 1866 "mls v3.8h, v5.8h, v24.8h \n" // G 1867 "mls v3.8h, v4.8h, v23.8h \n" // B 1868 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned 1869 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U 1870 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V 1871 MEMACCESS(2) 1872 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1873 MEMACCESS(3) 1874 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1875 "b.gt 1b \n" 1876 : "+r"(src_argb1555), // %0 1877 "+r"(src_argb1555_1), // %1 1878 "+r"(dst_u), // %2 1879 "+r"(dst_v), // %3 1880 "+r"(width) // %4 1881 : 1882 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 1883 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 1884 "v26", "v27", "v28" 1885 ); 1646 asm volatile( 1647 RGBTOUV_SETUP_REG 1648 "1: \n" 1649 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 1650 RGB555TOARGB 1651 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1652 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1653 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1654 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. 1655 RGB555TOARGB 1656 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1657 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1658 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1659 1660 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. 1661 RGB555TOARGB 1662 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1663 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1664 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1665 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. 1666 RGB555TOARGB 1667 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1668 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1669 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1670 1671 "ins v16.D[1], v26.D[0] \n" 1672 "ins v17.D[1], v27.D[0] \n" 1673 "ins v18.D[1], v28.D[0] \n" 1674 1675 "urshr v4.8h, v16.8h, #1 \n" // 2x average 1676 "urshr v5.8h, v17.8h, #1 \n" 1677 "urshr v6.8h, v18.8h, #1 \n" 1678 1679 "subs %w4, %w4, #16 \n" // 16 processed per loop. 1680 "mul v2.8h, v4.8h, v20.8h \n" // B 1681 "mls v2.8h, v5.8h, v21.8h \n" // G 1682 "mls v2.8h, v6.8h, v22.8h \n" // R 1683 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned 1684 "mul v3.8h, v6.8h, v20.8h \n" // R 1685 "mls v3.8h, v5.8h, v24.8h \n" // G 1686 "mls v3.8h, v4.8h, v23.8h \n" // B 1687 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned 1688 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U 1689 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V 1690 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1691 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1692 "b.gt 1b \n" 1693 : "+r"(src_argb1555), // %0 1694 "+r"(src_argb1555_1), // %1 1695 "+r"(dst_u), // %2 1696 "+r"(dst_v), // %3 1697 "+r"(width) // %4 1698 : 1699 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", 1700 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", 1701 "v28"); 1886 1702 } 1887 1703 1888 1704 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 1889 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, 1890 uint8* dst_u, uint8* dst_v, int width) { 1705 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, 1706 int src_stride_argb4444, 1707 uint8* dst_u, 1708 uint8* dst_v, 1709 int width) { 1891 1710 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; 1892 asm volatile ( 1893 RGBTOUV_SETUP_REG 1894 "1: \n" 1895 MEMACCESS(0) 1896 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 1897 ARGB4444TOARGB 1898 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1899 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1900 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1901 MEMACCESS(0) 1902 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. 1903 ARGB4444TOARGB 1904 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1905 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1906 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1907 1908 MEMACCESS(1) 1909 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. 1910 ARGB4444TOARGB 1911 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1912 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1913 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1914 MEMACCESS(1) 1915 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. 1916 ARGB4444TOARGB 1917 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1918 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1919 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1920 1921 "ins v16.D[1], v26.D[0] \n" 1922 "ins v17.D[1], v27.D[0] \n" 1923 "ins v18.D[1], v28.D[0] \n" 1924 1925 "urshr v4.8h, v16.8h, #1 \n" // 2x average 1926 "urshr v5.8h, v17.8h, #1 \n" 1927 "urshr v6.8h, v18.8h, #1 \n" 1928 1929 "subs %w4, %w4, #16 \n" // 16 processed per loop. 1930 "mul v2.8h, v4.8h, v20.8h \n" // B 1931 "mls v2.8h, v5.8h, v21.8h \n" // G 1932 "mls v2.8h, v6.8h, v22.8h \n" // R 1933 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned 1934 "mul v3.8h, v6.8h, v20.8h \n" // R 1935 "mls v3.8h, v5.8h, v24.8h \n" // G 1936 "mls v3.8h, v4.8h, v23.8h \n" // B 1937 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned 1938 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U 1939 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V 1940 MEMACCESS(2) 1941 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1942 MEMACCESS(3) 1943 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1944 "b.gt 1b \n" 1945 : "+r"(src_argb4444), // %0 1946 "+r"(src_argb4444_1), // %1 1947 "+r"(dst_u), // %2 1948 "+r"(dst_v), // %3 1949 "+r"(width) // %4 1950 : 1951 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 1952 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 1953 "v26", "v27", "v28" 1954 1955 ); 1711 asm volatile( 1712 RGBTOUV_SETUP_REG 1713 "1: \n" 1714 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 1715 ARGB4444TOARGB 1716 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1717 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1718 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1719 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. 1720 ARGB4444TOARGB 1721 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1722 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1723 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1724 1725 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. 1726 ARGB4444TOARGB 1727 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1728 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1729 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1730 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. 1731 ARGB4444TOARGB 1732 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1733 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1734 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1735 1736 "ins v16.D[1], v26.D[0] \n" 1737 "ins v17.D[1], v27.D[0] \n" 1738 "ins v18.D[1], v28.D[0] \n" 1739 1740 "urshr v4.8h, v16.8h, #1 \n" // 2x average 1741 "urshr v5.8h, v17.8h, #1 \n" 1742 "urshr v6.8h, v18.8h, #1 \n" 1743 1744 "subs %w4, %w4, #16 \n" // 16 processed per loop. 1745 "mul v2.8h, v4.8h, v20.8h \n" // B 1746 "mls v2.8h, v5.8h, v21.8h \n" // G 1747 "mls v2.8h, v6.8h, v22.8h \n" // R 1748 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned 1749 "mul v3.8h, v6.8h, v20.8h \n" // R 1750 "mls v3.8h, v5.8h, v24.8h \n" // G 1751 "mls v3.8h, v4.8h, v23.8h \n" // B 1752 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned 1753 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U 1754 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V 1755 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1756 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1757 "b.gt 1b \n" 1758 : "+r"(src_argb4444), // %0 1759 "+r"(src_argb4444_1), // %1 1760 "+r"(dst_u), // %2 1761 "+r"(dst_v), // %3 1762 "+r"(width) // %4 1763 : 1764 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", 1765 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", 1766 "v28" 1767 1768 ); 1956 1769 } 1957 1770 1958 1771 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { 1959 asm volatile ( 1960 "movi v24.8b, #13 \n" // B * 0.1016 coefficient 1961 "movi v25.8b, #65 \n" // G * 0.5078 coefficient 1962 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 1963 "movi v27.8b, #16 \n" // Add 16 constant 1964 "1: \n" 1965 MEMACCESS(0) 1966 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 1967 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1968 RGB565TOARGB 1969 "umull v3.8h, v0.8b, v24.8b \n" // B 1970 "umlal v3.8h, v1.8b, v25.8b \n" // G 1971 "umlal v3.8h, v2.8b, v26.8b \n" // R 1972 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1973 "uqadd v0.8b, v0.8b, v27.8b \n" 1974 MEMACCESS(1) 1975 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1976 "b.gt 1b \n" 1977 : "+r"(src_rgb565), // %0 1978 "+r"(dst_y), // %1 1979 "+r"(width) // %2 1980 : 1981 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", 1982 "v24", "v25", "v26", "v27" 1983 ); 1772 asm volatile( 1773 "movi v24.8b, #13 \n" // B * 0.1016 coefficient 1774 "movi v25.8b, #65 \n" // G * 0.5078 coefficient 1775 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 1776 "movi v27.8b, #16 \n" // Add 16 constant 1777 "1: \n" 1778 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 1779 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1780 RGB565TOARGB 1781 "umull v3.8h, v0.8b, v24.8b \n" // B 1782 "umlal v3.8h, v1.8b, v25.8b \n" // G 1783 "umlal v3.8h, v2.8b, v26.8b \n" // R 1784 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1785 "uqadd v0.8b, v0.8b, v27.8b \n" 1786 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1787 "b.gt 1b \n" 1788 : "+r"(src_rgb565), // %0 1789 "+r"(dst_y), // %1 1790 "+r"(width) // %2 1791 : 1792 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26", 1793 "v27"); 1984 1794 } 1985 1795 1986 1796 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { 1987 asm volatile ( 1988 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1989 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1990 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1991 "movi v7.8b, #16 \n" // Add 16 constant 1992 "1: \n" 1993 MEMACCESS(0) 1994 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 1995 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1996 ARGB1555TOARGB 1997 "umull v3.8h, v0.8b, v4.8b \n" // B 1998 "umlal v3.8h, v1.8b, v5.8b \n" // G 1999 "umlal v3.8h, v2.8b, v6.8b \n" // R 2000 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 2001 "uqadd v0.8b, v0.8b, v7.8b \n" 2002 MEMACCESS(1) 2003 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2004 "b.gt 1b \n" 2005 : "+r"(src_argb1555), // %0 2006 "+r"(dst_y), // %1 2007 "+r"(width) // %2 2008 : 2009 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2010 ); 1797 asm volatile( 1798 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1799 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1800 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1801 "movi v7.8b, #16 \n" // Add 16 constant 1802 "1: \n" 1803 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 1804 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1805 ARGB1555TOARGB 1806 "umull v3.8h, v0.8b, v4.8b \n" // B 1807 "umlal v3.8h, v1.8b, v5.8b \n" // G 1808 "umlal v3.8h, v2.8b, v6.8b \n" // R 1809 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1810 "uqadd v0.8b, v0.8b, v7.8b \n" 1811 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1812 "b.gt 1b \n" 1813 : "+r"(src_argb1555), // %0 1814 "+r"(dst_y), // %1 1815 "+r"(width) // %2 1816 : 1817 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); 2011 1818 } 2012 1819 2013 1820 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { 2014 asm volatile ( 2015 "movi v24.8b, #13 \n" // B * 0.1016 coefficient 2016 "movi v25.8b, #65 \n" // G * 0.5078 coefficient 2017 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 2018 "movi v27.8b, #16 \n" // Add 16 constant 2019 "1: \n" 2020 MEMACCESS(0) 2021 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 2022 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2023 ARGB4444TOARGB 2024 "umull v3.8h, v0.8b, v24.8b \n" // B 2025 "umlal v3.8h, v1.8b, v25.8b \n" // G 2026 "umlal v3.8h, v2.8b, v26.8b \n" // R 2027 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 2028 "uqadd v0.8b, v0.8b, v27.8b \n" 2029 MEMACCESS(1) 2030 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2031 "b.gt 1b \n" 2032 : "+r"(src_argb4444), // %0 2033 "+r"(dst_y), // %1 2034 "+r"(width) // %2 2035 : 2036 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" 2037 ); 1821 asm volatile( 1822 "movi v24.8b, #13 \n" // B * 0.1016 coefficient 1823 "movi v25.8b, #65 \n" // G * 0.5078 coefficient 1824 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 1825 "movi v27.8b, #16 \n" // Add 16 constant 1826 "1: \n" 1827 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 1828 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1829 ARGB4444TOARGB 1830 "umull v3.8h, v0.8b, v24.8b \n" // B 1831 "umlal v3.8h, v1.8b, v25.8b \n" // G 1832 "umlal v3.8h, v2.8b, v26.8b \n" // R 1833 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1834 "uqadd v0.8b, v0.8b, v27.8b \n" 1835 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1836 "b.gt 1b \n" 1837 : "+r"(src_argb4444), // %0 1838 "+r"(dst_y), // %1 1839 "+r"(width) // %2 1840 : 1841 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); 2038 1842 } 2039 1843 2040 1844 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { 2041 asm volatile ( 2042 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 2043 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2044 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 2045 "movi v7.8b, #16 \n" // Add 16 constant 2046 "1: \n" 2047 MEMACCESS(0) 2048 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 2049 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2050 "umull v16.8h, v1.8b, v4.8b \n" // R 2051 "umlal v16.8h, v2.8b, v5.8b \n" // G 2052 "umlal v16.8h, v3.8b, v6.8b \n" // B 2053 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2054 "uqadd v0.8b, v0.8b, v7.8b \n" 2055 MEMACCESS(1) 2056 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2057 "b.gt 1b \n" 2058 : "+r"(src_bgra), // %0 2059 "+r"(dst_y), // %1 2060 "+r"(width) // %2 2061 : 2062 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2063 ); 1845 asm volatile( 1846 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 1847 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1848 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 1849 "movi v7.8b, #16 \n" // Add 16 constant 1850 "1: \n" 1851 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 1852 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1853 "umull v16.8h, v1.8b, v4.8b \n" // R 1854 "umlal v16.8h, v2.8b, v5.8b \n" // G 1855 "umlal v16.8h, v3.8b, v6.8b \n" // B 1856 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 1857 "uqadd v0.8b, v0.8b, v7.8b \n" 1858 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1859 "b.gt 1b \n" 1860 : "+r"(src_bgra), // %0 1861 "+r"(dst_y), // %1 1862 "+r"(width) // %2 1863 : 1864 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); 2064 1865 } 2065 1866 2066 1867 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { 2067 asm volatile ( 2068 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 2069 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2070 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 2071 "movi v7.8b, #16 \n" // Add 16 constant 2072 "1: \n" 2073 MEMACCESS(0) 2074 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 2075 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2076 "umull v16.8h, v0.8b, v4.8b \n" // R 2077 "umlal v16.8h, v1.8b, v5.8b \n" // G 2078 "umlal v16.8h, v2.8b, v6.8b \n" // B 2079 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2080 "uqadd v0.8b, v0.8b, v7.8b \n" 2081 MEMACCESS(1) 2082 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2083 "b.gt 1b \n" 2084 : "+r"(src_abgr), // %0 2085 "+r"(dst_y), // %1 2086 "+r"(width) // %2 2087 : 2088 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2089 ); 1868 asm volatile( 1869 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 1870 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1871 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 1872 "movi v7.8b, #16 \n" // Add 16 constant 1873 "1: \n" 1874 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 1875 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1876 "umull v16.8h, v0.8b, v4.8b \n" // R 1877 "umlal v16.8h, v1.8b, v5.8b \n" // G 1878 "umlal v16.8h, v2.8b, v6.8b \n" // B 1879 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 1880 "uqadd v0.8b, v0.8b, v7.8b \n" 1881 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1882 "b.gt 1b \n" 1883 : "+r"(src_abgr), // %0 1884 "+r"(dst_y), // %1 1885 "+r"(width) // %2 1886 : 1887 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); 2090 1888 } 2091 1889 2092 1890 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { 2093 asm volatile ( 2094 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 2095 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2096 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 2097 "movi v7.8b, #16 \n" // Add 16 constant 2098 "1: \n" 2099 MEMACCESS(0) 2100 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 2101 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2102 "umull v16.8h, v1.8b, v4.8b \n" // B 2103 "umlal v16.8h, v2.8b, v5.8b \n" // G 2104 "umlal v16.8h, v3.8b, v6.8b \n" // R 2105 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2106 "uqadd v0.8b, v0.8b, v7.8b \n" 2107 MEMACCESS(1) 2108 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2109 "b.gt 1b \n" 2110 : "+r"(src_rgba), // %0 2111 "+r"(dst_y), // %1 2112 "+r"(width) // %2 2113 : 2114 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2115 ); 1891 asm volatile( 1892 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1893 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1894 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1895 "movi v7.8b, #16 \n" // Add 16 constant 1896 "1: \n" 1897 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 1898 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1899 "umull v16.8h, v1.8b, v4.8b \n" // B 1900 "umlal v16.8h, v2.8b, v5.8b \n" // G 1901 "umlal v16.8h, v3.8b, v6.8b \n" // R 1902 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 1903 "uqadd v0.8b, v0.8b, v7.8b \n" 1904 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1905 "b.gt 1b \n" 1906 : "+r"(src_rgba), // %0 1907 "+r"(dst_y), // %1 1908 "+r"(width) // %2 1909 : 1910 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); 2116 1911 } 2117 1912 2118 1913 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { 2119 asm volatile ( 2120 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 2121 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2122 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 2123 "movi v7.8b, #16 \n" // Add 16 constant 2124 "1: \n" 2125 MEMACCESS(0) 2126 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 2127 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2128 "umull v16.8h, v0.8b, v4.8b \n" // B 2129 "umlal v16.8h, v1.8b, v5.8b \n" // G 2130 "umlal v16.8h, v2.8b, v6.8b \n" // R 2131 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2132 "uqadd v0.8b, v0.8b, v7.8b \n" 2133 MEMACCESS(1) 2134 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2135 "b.gt 1b \n" 2136 : "+r"(src_rgb24), // %0 2137 "+r"(dst_y), // %1 2138 "+r"(width) // %2 2139 : 2140 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2141 ); 1914 asm volatile( 1915 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1916 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1917 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1918 "movi v7.8b, #16 \n" // Add 16 constant 1919 "1: \n" 1920 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 1921 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1922 "umull v16.8h, v0.8b, v4.8b \n" // B 1923 "umlal v16.8h, v1.8b, v5.8b \n" // G 1924 "umlal v16.8h, v2.8b, v6.8b \n" // R 1925 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 1926 "uqadd v0.8b, v0.8b, v7.8b \n" 1927 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1928 "b.gt 1b \n" 1929 : "+r"(src_rgb24), // %0 1930 "+r"(dst_y), // %1 1931 "+r"(width) // %2 1932 : 1933 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); 2142 1934 } 2143 1935 2144 1936 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { 2145 asm volatile ( 2146 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 2147 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2148 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 2149 "movi v7.8b, #16 \n" // Add 16 constant 2150 "1: \n" 2151 MEMACCESS(0) 2152 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 2153 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2154 "umull v16.8h, v0.8b, v4.8b \n" // B 2155 "umlal v16.8h, v1.8b, v5.8b \n" // G 2156 "umlal v16.8h, v2.8b, v6.8b \n" // R 2157 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2158 "uqadd v0.8b, v0.8b, v7.8b \n" 2159 MEMACCESS(1) 2160 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2161 "b.gt 1b \n" 2162 : "+r"(src_raw), // %0 2163 "+r"(dst_y), // %1 2164 "+r"(width) // %2 2165 : 2166 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2167 ); 1937 asm volatile( 1938 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 1939 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1940 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 1941 "movi v7.8b, #16 \n" // Add 16 constant 1942 "1: \n" 1943 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 1944 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1945 "umull v16.8h, v0.8b, v4.8b \n" // B 1946 "umlal v16.8h, v1.8b, v5.8b \n" // G 1947 "umlal v16.8h, v2.8b, v6.8b \n" // R 1948 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 1949 "uqadd v0.8b, v0.8b, v7.8b \n" 1950 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1951 "b.gt 1b \n" 1952 : "+r"(src_raw), // %0 1953 "+r"(dst_y), // %1 1954 "+r"(width) // %2 1955 : 1956 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); 2168 1957 } 2169 1958 2170 1959 // Bilinear filter 16x2 -> 16x1 2171 1960 void InterpolateRow_NEON(uint8* dst_ptr, 2172 const uint8* src_ptr, ptrdiff_t src_stride, 2173 int dst_width, int source_y_fraction) { 1961 const uint8* src_ptr, 1962 ptrdiff_t src_stride, 1963 int dst_width, 1964 int source_y_fraction) { 2174 1965 int y1_fraction = source_y_fraction; 2175 1966 int y0_fraction = 256 - y1_fraction; 2176 1967 const uint8* src_ptr1 = src_ptr + src_stride; 2177 asm volatile ( 2178 "cmp %w4, #0 \n" 2179 "b.eq 100f \n" 2180 "cmp %w4, #128 \n" 2181 "b.eq 50f \n" 2182 2183 "dup v5.16b, %w4 \n" 2184 "dup v4.16b, %w5 \n" 2185 // General purpose row blend. 2186 "1: \n" 2187 MEMACCESS(1) 2188 "ld1 {v0.16b}, [%1], #16 \n" 2189 MEMACCESS(2) 2190 "ld1 {v1.16b}, [%2], #16 \n" 2191 "subs %w3, %w3, #16 \n" 2192 "umull v2.8h, v0.8b, v4.8b \n" 2193 "umull2 v3.8h, v0.16b, v4.16b \n" 2194 "umlal v2.8h, v1.8b, v5.8b \n" 2195 "umlal2 v3.8h, v1.16b, v5.16b \n" 2196 "rshrn v0.8b, v2.8h, #8 \n" 2197 "rshrn2 v0.16b, v3.8h, #8 \n" 2198 MEMACCESS(0) 2199 "st1 {v0.16b}, [%0], #16 \n" 2200 "b.gt 1b \n" 2201 "b 99f \n" 2202 2203 // Blend 50 / 50. 2204 "50: \n" 2205 MEMACCESS(1) 2206 "ld1 {v0.16b}, [%1], #16 \n" 2207 MEMACCESS(2) 2208 "ld1 {v1.16b}, [%2], #16 \n" 2209 "subs %w3, %w3, #16 \n" 2210 "urhadd v0.16b, v0.16b, v1.16b \n" 2211 MEMACCESS(0) 2212 "st1 {v0.16b}, [%0], #16 \n" 2213 "b.gt 50b \n" 2214 "b 99f \n" 2215 2216 // Blend 100 / 0 - Copy row unchanged. 2217 "100: \n" 2218 MEMACCESS(1) 2219 "ld1 {v0.16b}, [%1], #16 \n" 2220 "subs %w3, %w3, #16 \n" 2221 MEMACCESS(0) 2222 "st1 {v0.16b}, [%0], #16 \n" 2223 "b.gt 100b \n" 2224 2225 "99: \n" 2226 : "+r"(dst_ptr), // %0 2227 "+r"(src_ptr), // %1 2228 "+r"(src_ptr1), // %2 2229 "+r"(dst_width), // %3 2230 "+r"(y1_fraction), // %4 2231 "+r"(y0_fraction) // %5 2232 : 2233 : "cc", "memory", "v0", "v1", "v3", "v4", "v5" 2234 ); 1968 asm volatile( 1969 "cmp %w4, #0 \n" 1970 "b.eq 100f \n" 1971 "cmp %w4, #128 \n" 1972 "b.eq 50f \n" 1973 1974 "dup v5.16b, %w4 \n" 1975 "dup v4.16b, %w5 \n" 1976 // General purpose row blend. 1977 "1: \n" 1978 "ld1 {v0.16b}, [%1], #16 \n" 1979 "ld1 {v1.16b}, [%2], #16 \n" 1980 "subs %w3, %w3, #16 \n" 1981 "umull v2.8h, v0.8b, v4.8b \n" 1982 "umull2 v3.8h, v0.16b, v4.16b \n" 1983 "umlal v2.8h, v1.8b, v5.8b \n" 1984 "umlal2 v3.8h, v1.16b, v5.16b \n" 1985 "rshrn v0.8b, v2.8h, #8 \n" 1986 "rshrn2 v0.16b, v3.8h, #8 \n" 1987 "st1 {v0.16b}, [%0], #16 \n" 1988 "b.gt 1b \n" 1989 "b 99f \n" 1990 1991 // Blend 50 / 50. 1992 "50: \n" 1993 "ld1 {v0.16b}, [%1], #16 \n" 1994 "ld1 {v1.16b}, [%2], #16 \n" 1995 "subs %w3, %w3, #16 \n" 1996 "urhadd v0.16b, v0.16b, v1.16b \n" 1997 "st1 {v0.16b}, [%0], #16 \n" 1998 "b.gt 50b \n" 1999 "b 99f \n" 2000 2001 // Blend 100 / 0 - Copy row unchanged. 2002 "100: \n" 2003 "ld1 {v0.16b}, [%1], #16 \n" 2004 "subs %w3, %w3, #16 \n" 2005 "st1 {v0.16b}, [%0], #16 \n" 2006 "b.gt 100b \n" 2007 2008 "99: \n" 2009 : "+r"(dst_ptr), // %0 2010 "+r"(src_ptr), // %1 2011 "+r"(src_ptr1), // %2 2012 "+r"(dst_width), // %3 2013 "+r"(y1_fraction), // %4 2014 "+r"(y0_fraction) // %5 2015 : 2016 : "cc", "memory", "v0", "v1", "v3", "v4", "v5"); 2235 2017 } 2236 2018 2237 2019 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr 2238 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2239 uint8* dst_argb, int width) { 2240 asm volatile ( 2241 "subs %w3, %w3, #8 \n" 2242 "b.lt 89f \n" 2243 // Blend 8 pixels. 2244 "8: \n" 2245 MEMACCESS(0) 2246 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels 2247 MEMACCESS(1) 2248 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels 2249 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2250 "umull v16.8h, v4.8b, v3.8b \n" // db * a 2251 "umull v17.8h, v5.8b, v3.8b \n" // dg * a 2252 "umull v18.8h, v6.8b, v3.8b \n" // dr * a 2253 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 2254 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 2255 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 2256 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) 2257 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) 2258 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) 2259 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb 2260 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg 2261 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr 2262 "movi v3.8b, #255 \n" // a = 255 2263 MEMACCESS(2) 2264 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2265 "b.ge 8b \n" 2266 2267 "89: \n" 2268 "adds %w3, %w3, #8-1 \n" 2269 "b.lt 99f \n" 2270 2271 // Blend 1 pixels. 2272 "1: \n" 2273 MEMACCESS(0) 2274 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. 2275 MEMACCESS(1) 2276 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. 2277 "subs %w3, %w3, #1 \n" // 1 processed per loop. 2278 "umull v16.8h, v4.8b, v3.8b \n" // db * a 2279 "umull v17.8h, v5.8b, v3.8b \n" // dg * a 2280 "umull v18.8h, v6.8b, v3.8b \n" // dr * a 2281 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 2282 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 2283 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 2284 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) 2285 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) 2286 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) 2287 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb 2288 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg 2289 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr 2290 "movi v3.8b, #255 \n" // a = 255 2291 MEMACCESS(2) 2292 "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. 2293 "b.ge 1b \n" 2294 2295 "99: \n" 2296 2297 : "+r"(src_argb0), // %0 2298 "+r"(src_argb1), // %1 2299 "+r"(dst_argb), // %2 2300 "+r"(width) // %3 2301 : 2302 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 2303 "v16", "v17", "v18" 2304 ); 2020 void ARGBBlendRow_NEON(const uint8* src_argb0, 2021 const uint8* src_argb1, 2022 uint8* dst_argb, 2023 int width) { 2024 asm volatile( 2025 "subs %w3, %w3, #8 \n" 2026 "b.lt 89f \n" 2027 // Blend 8 pixels. 2028 "8: \n" 2029 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 2030 // pixels 2031 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 2032 // pixels 2033 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2034 "umull v16.8h, v4.8b, v3.8b \n" // db * a 2035 "umull v17.8h, v5.8b, v3.8b \n" // dg * a 2036 "umull v18.8h, v6.8b, v3.8b \n" // dr * a 2037 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 2038 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 2039 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 2040 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) 2041 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) 2042 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) 2043 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb 2044 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg 2045 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr 2046 "movi v3.8b, #255 \n" // a = 255 2047 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2048 // pixels 2049 "b.ge 8b \n" 2050 2051 "89: \n" 2052 "adds %w3, %w3, #8-1 \n" 2053 "b.lt 99f \n" 2054 2055 // Blend 1 pixels. 2056 "1: \n" 2057 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. 2058 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. 2059 "subs %w3, %w3, #1 \n" // 1 processed per loop. 2060 "umull v16.8h, v4.8b, v3.8b \n" // db * a 2061 "umull v17.8h, v5.8b, v3.8b \n" // dg * a 2062 "umull v18.8h, v6.8b, v3.8b \n" // dr * a 2063 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 2064 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 2065 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 2066 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) 2067 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) 2068 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) 2069 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb 2070 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg 2071 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr 2072 "movi v3.8b, #255 \n" // a = 255 2073 "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. 2074 "b.ge 1b \n" 2075 2076 "99: \n" 2077 2078 : "+r"(src_argb0), // %0 2079 "+r"(src_argb1), // %1 2080 "+r"(dst_argb), // %2 2081 "+r"(width) // %3 2082 : 2083 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", 2084 "v17", "v18"); 2305 2085 } 2306 2086 2307 2087 // Attenuate 8 pixels at a time. 2308 2088 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { 2309 asm volatile ( 2310 // Attenuate 8 pixels. 2311 "1: \n" 2312 MEMACCESS(0) 2313 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels 2314 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2315 "umull v4.8h, v0.8b, v3.8b \n" // b * a 2316 "umull v5.8h, v1.8b, v3.8b \n" // g * a 2317 "umull v6.8h, v2.8b, v3.8b \n" // r * a 2318 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 2319 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 2320 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 2321 MEMACCESS(1) 2322 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 2323 "b.gt 1b \n" 2324 : "+r"(src_argb), // %0 2325 "+r"(dst_argb), // %1 2326 "+r"(width) // %2 2327 : 2328 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 2329 ); 2089 asm volatile( 2090 // Attenuate 8 pixels. 2091 "1: \n" 2092 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2093 // pixels 2094 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2095 "umull v4.8h, v0.8b, v3.8b \n" // b * a 2096 "umull v5.8h, v1.8b, v3.8b \n" // g * a 2097 "umull v6.8h, v2.8b, v3.8b \n" // r * a 2098 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 2099 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 2100 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 2101 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB 2102 // pixels 2103 "b.gt 1b \n" 2104 : "+r"(src_argb), // %0 2105 "+r"(dst_argb), // %1 2106 "+r"(width) // %2 2107 : 2108 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); 2330 2109 } 2331 2110 2332 2111 // Quantize 8 ARGB pixels (32 bytes). 2333 2112 // dst = (dst * scale >> 16) * interval_size + interval_offset; 2334 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, 2335 int interval_offset, int width) { 2336 asm volatile ( 2337 "dup v4.8h, %w2 \n" 2338 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 2339 "dup v5.8h, %w3 \n" // interval multiply. 2340 "dup v6.8h, %w4 \n" // interval add 2341 2342 // 8 pixel loop. 2343 "1: \n" 2344 MEMACCESS(0) 2345 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. 2346 "subs %w1, %w1, #8 \n" // 8 processed per loop. 2347 "uxtl v0.8h, v0.8b \n" // b (0 .. 255) 2348 "uxtl v1.8h, v1.8b \n" 2349 "uxtl v2.8h, v2.8b \n" 2350 "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale 2351 "sqdmulh v1.8h, v1.8h, v4.8h \n" // g 2352 "sqdmulh v2.8h, v2.8h, v4.8h \n" // r 2353 "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size 2354 "mul v1.8h, v1.8h, v5.8h \n" // g 2355 "mul v2.8h, v2.8h, v5.8h \n" // r 2356 "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset 2357 "add v1.8h, v1.8h, v6.8h \n" // g 2358 "add v2.8h, v2.8h, v6.8h \n" // r 2359 "uqxtn v0.8b, v0.8h \n" 2360 "uqxtn v1.8b, v1.8h \n" 2361 "uqxtn v2.8b, v2.8h \n" 2362 MEMACCESS(0) 2363 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels 2364 "b.gt 1b \n" 2365 : "+r"(dst_argb), // %0 2366 "+r"(width) // %1 2367 : "r"(scale), // %2 2368 "r"(interval_size), // %3 2369 "r"(interval_offset) // %4 2370 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 2371 ); 2113 void ARGBQuantizeRow_NEON(uint8* dst_argb, 2114 int scale, 2115 int interval_size, 2116 int interval_offset, 2117 int width) { 2118 asm volatile( 2119 "dup v4.8h, %w2 \n" 2120 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 2121 "dup v5.8h, %w3 \n" // interval multiply. 2122 "dup v6.8h, %w4 \n" // interval add 2123 2124 // 8 pixel loop. 2125 "1: \n" 2126 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of 2127 // ARGB. 2128 "subs %w1, %w1, #8 \n" // 8 processed per loop. 2129 "uxtl v0.8h, v0.8b \n" // b (0 .. 255) 2130 "uxtl v1.8h, v1.8b \n" 2131 "uxtl v2.8h, v2.8b \n" 2132 "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale 2133 "sqdmulh v1.8h, v1.8h, v4.8h \n" // g 2134 "sqdmulh v2.8h, v2.8h, v4.8h \n" // r 2135 "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size 2136 "mul v1.8h, v1.8h, v5.8h \n" // g 2137 "mul v2.8h, v2.8h, v5.8h \n" // r 2138 "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset 2139 "add v1.8h, v1.8h, v6.8h \n" // g 2140 "add v2.8h, v2.8h, v6.8h \n" // r 2141 "uqxtn v0.8b, v0.8h \n" 2142 "uqxtn v1.8b, v1.8h \n" 2143 "uqxtn v2.8b, v2.8h \n" 2144 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB 2145 // pixels 2146 "b.gt 1b \n" 2147 : "+r"(dst_argb), // %0 2148 "+r"(width) // %1 2149 : "r"(scale), // %2 2150 "r"(interval_size), // %3 2151 "r"(interval_offset) // %4 2152 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); 2372 2153 } 2373 2154 … … 2375 2156 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. 2376 2157 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. 2377 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, 2158 void ARGBShadeRow_NEON(const uint8* src_argb, 2159 uint8* dst_argb, 2160 int width, 2378 2161 uint32 value) { 2379 asm volatile ( 2380 "dup v0.4s, %w3 \n" // duplicate scale value. 2381 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. 2382 "ushr v0.8h, v0.8h, #1 \n" // scale / 2. 2383 2384 // 8 pixel loop. 2385 "1: \n" 2386 MEMACCESS(0) 2387 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2388 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2389 "uxtl v4.8h, v4.8b \n" // b (0 .. 255) 2390 "uxtl v5.8h, v5.8b \n" 2391 "uxtl v6.8h, v6.8b \n" 2392 "uxtl v7.8h, v7.8b \n" 2393 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 2394 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g 2395 "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r 2396 "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a 2397 "uqxtn v4.8b, v4.8h \n" 2398 "uqxtn v5.8b, v5.8h \n" 2399 "uqxtn v6.8b, v6.8h \n" 2400 "uqxtn v7.8b, v7.8h \n" 2401 MEMACCESS(1) 2402 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels 2403 "b.gt 1b \n" 2404 : "+r"(src_argb), // %0 2405 "+r"(dst_argb), // %1 2406 "+r"(width) // %2 2407 : "r"(value) // %3 2408 : "cc", "memory", "v0", "v4", "v5", "v6", "v7" 2409 ); 2162 asm volatile( 2163 "dup v0.4s, %w3 \n" // duplicate scale value. 2164 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. 2165 "ushr v0.8h, v0.8h, #1 \n" // scale / 2. 2166 2167 // 8 pixel loop. 2168 "1: \n" 2169 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB 2170 // pixels. 2171 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2172 "uxtl v4.8h, v4.8b \n" // b (0 .. 255) 2173 "uxtl v5.8h, v5.8b \n" 2174 "uxtl v6.8h, v6.8b \n" 2175 "uxtl v7.8h, v7.8b \n" 2176 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 2177 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g 2178 "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r 2179 "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a 2180 "uqxtn v4.8b, v4.8h \n" 2181 "uqxtn v5.8b, v5.8h \n" 2182 "uqxtn v6.8b, v6.8h \n" 2183 "uqxtn v7.8b, v7.8h \n" 2184 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB 2185 // pixels 2186 "b.gt 1b \n" 2187 : "+r"(src_argb), // %0 2188 "+r"(dst_argb), // %1 2189 "+r"(width) // %2 2190 : "r"(value) // %3 2191 : "cc", "memory", "v0", "v4", "v5", "v6", "v7"); 2410 2192 } 2411 2193 … … 2414 2196 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; 2415 2197 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { 2416 asm volatile ( 2417 "movi v24.8b, #15 \n" // B * 0.11400 coefficient 2418 "movi v25.8b, #75 \n" // G * 0.58700 coefficient 2419 "movi v26.8b, #38 \n" // R * 0.29900 coefficient 2420 "1: \n" 2421 MEMACCESS(0) 2422 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2423 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2424 "umull v4.8h, v0.8b, v24.8b \n" // B 2425 "umlal v4.8h, v1.8b, v25.8b \n" // G 2426 "umlal v4.8h, v2.8b, v26.8b \n" // R 2427 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B 2428 "orr v1.8b, v0.8b, v0.8b \n" // G 2429 "orr v2.8b, v0.8b, v0.8b \n" // R 2430 MEMACCESS(1) 2431 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. 2432 "b.gt 1b \n" 2433 : "+r"(src_argb), // %0 2434 "+r"(dst_argb), // %1 2435 "+r"(width) // %2 2436 : 2437 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" 2438 ); 2198 asm volatile( 2199 "movi v24.8b, #15 \n" // B * 0.11400 coefficient 2200 "movi v25.8b, #75 \n" // G * 0.58700 coefficient 2201 "movi v26.8b, #38 \n" // R * 0.29900 coefficient 2202 "1: \n" 2203 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2204 // pixels. 2205 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2206 "umull v4.8h, v0.8b, v24.8b \n" // B 2207 "umlal v4.8h, v1.8b, v25.8b \n" // G 2208 "umlal v4.8h, v2.8b, v26.8b \n" // R 2209 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B 2210 "orr v1.8b, v0.8b, v0.8b \n" // G 2211 "orr v2.8b, v0.8b, v0.8b \n" // R 2212 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. 2213 "b.gt 1b \n" 2214 : "+r"(src_argb), // %0 2215 "+r"(dst_argb), // %1 2216 "+r"(width) // %2 2217 : 2218 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"); 2439 2219 } 2440 2220 … … 2445 2225 2446 2226 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { 2447 asm volatile ( 2448 "movi v20.8b, #17 \n" // BB coefficient 2449 "movi v21.8b, #68 \n" // BG coefficient 2450 "movi v22.8b, #35 \n" // BR coefficient 2451 "movi v24.8b, #22 \n" // GB coefficient 2452 "movi v25.8b, #88 \n" // GG coefficient 2453 "movi v26.8b, #45 \n" // GR coefficient 2454 "movi v28.8b, #24 \n" // BB coefficient 2455 "movi v29.8b, #98 \n" // BG coefficient 2456 "movi v30.8b, #50 \n" // BR coefficient 2457 "1: \n" 2458 MEMACCESS(0) 2459 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. 2460 "subs %w1, %w1, #8 \n" // 8 processed per loop. 2461 "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B 2462 "umlal v4.8h, v1.8b, v21.8b \n" // G 2463 "umlal v4.8h, v2.8b, v22.8b \n" // R 2464 "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G 2465 "umlal v5.8h, v1.8b, v25.8b \n" // G 2466 "umlal v5.8h, v2.8b, v26.8b \n" // R 2467 "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R 2468 "umlal v6.8h, v1.8b, v29.8b \n" // G 2469 "umlal v6.8h, v2.8b, v30.8b \n" // R 2470 "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B 2471 "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G 2472 "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R 2473 MEMACCESS(0) 2474 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. 2475 "b.gt 1b \n" 2476 : "+r"(dst_argb), // %0 2477 "+r"(width) // %1 2478 : 2479 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 2480 "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" 2481 ); 2227 asm volatile( 2228 "movi v20.8b, #17 \n" // BB coefficient 2229 "movi v21.8b, #68 \n" // BG coefficient 2230 "movi v22.8b, #35 \n" // BR coefficient 2231 "movi v24.8b, #22 \n" // GB coefficient 2232 "movi v25.8b, #88 \n" // GG coefficient 2233 "movi v26.8b, #45 \n" // GR coefficient 2234 "movi v28.8b, #24 \n" // BB coefficient 2235 "movi v29.8b, #98 \n" // BG coefficient 2236 "movi v30.8b, #50 \n" // BR coefficient 2237 "1: \n" 2238 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. 2239 "subs %w1, %w1, #8 \n" // 8 processed per loop. 2240 "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B 2241 "umlal v4.8h, v1.8b, v21.8b \n" // G 2242 "umlal v4.8h, v2.8b, v22.8b \n" // R 2243 "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G 2244 "umlal v5.8h, v1.8b, v25.8b \n" // G 2245 "umlal v5.8h, v2.8b, v26.8b \n" // R 2246 "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R 2247 "umlal v6.8h, v1.8b, v29.8b \n" // G 2248 "umlal v6.8h, v2.8b, v30.8b \n" // R 2249 "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B 2250 "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G 2251 "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R 2252 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. 2253 "b.gt 1b \n" 2254 : "+r"(dst_argb), // %0 2255 "+r"(width) // %1 2256 : 2257 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 2258 "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"); 2482 2259 } 2483 2260 … … 2485 2262 // TODO(fbarchard): Was same as Sepia except matrix is provided. This function 2486 2263 // needs to saturate. Consider doing a non-saturating version. 2487 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,2488 const int8* matrix_argb, int width) {2489 asm volatile (2490 MEMACCESS(3)2491 "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.2492 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.2493 "sxtl2 v1.8h, v2.16b \n" // R,Acoefficients s16.2494 2495 "1: \n" 2496 MEMACCESS(0)2497 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.2498 "subs %w2, %w2, #8 \n" // 8 processed per loop.2499 "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit2500 "uxtl v17.8h, v17.8b \n" // g2501 "uxtl v18.8h, v18.8b \n" // r2502 "uxtl v19.8h, v19.8b \n" // a2503 "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B2504 "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G2505 "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R2506 "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A2507 "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B2508 "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G2509 "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R2510 "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A2511 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B2512 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G2513 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R2514 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A2515 "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B2516 "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G2517 "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R2518 "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A2519 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B2520 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G2521 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R2522 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A2523 "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B2524 "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G2525 "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R2526 "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A2527 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B2528 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G2529 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R2530 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A2531 "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B2532 "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G2533 "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R2534 "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A2535 MEMACCESS(1)2536 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels.2537 "b.gt 1b \n"2538 : "+r"(src_argb), // %02539 "+r"(dst_argb), // %12540 "+r"(width) // %22541 : "r"(matrix_argb) // %32542 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",2543 "v18", "v19", "v22", "v23", "v24", "v25"2544 );2264 void ARGBColorMatrixRow_NEON(const uint8* src_argb, 2265 uint8* dst_argb, 2266 const int8* matrix_argb, 2267 int width) { 2268 asm volatile( 2269 "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. 2270 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. 2271 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. 2272 2273 "1: \n" 2274 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 2275 // pixels. 2276 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2277 "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit 2278 "uxtl v17.8h, v17.8b \n" // g 2279 "uxtl v18.8h, v18.8b \n" // r 2280 "uxtl v19.8h, v19.8b \n" // a 2281 "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B 2282 "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G 2283 "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R 2284 "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A 2285 "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B 2286 "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G 2287 "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R 2288 "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A 2289 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B 2290 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G 2291 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R 2292 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A 2293 "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B 2294 "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G 2295 "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R 2296 "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A 2297 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B 2298 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G 2299 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R 2300 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A 2301 "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B 2302 "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G 2303 "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R 2304 "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A 2305 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B 2306 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G 2307 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R 2308 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A 2309 "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B 2310 "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G 2311 "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R 2312 "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A 2313 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 2314 // pixels. 2315 "b.gt 1b \n" 2316 : "+r"(src_argb), // %0 2317 "+r"(dst_argb), // %1 2318 "+r"(width) // %2 2319 : "r"(matrix_argb) // %3 2320 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", 2321 "v17", "v18", "v19", "v22", "v23", "v24", "v25"); 2545 2322 } 2546 2323 2547 2324 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. 2548 2325 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 2549 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2550 uint8* dst_argb, int width) { 2551 asm volatile ( 2552 // 8 pixel loop. 2553 "1: \n" 2554 MEMACCESS(0) 2555 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2556 MEMACCESS(1) 2557 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. 2558 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2559 "umull v0.8h, v0.8b, v4.8b \n" // multiply B 2560 "umull v1.8h, v1.8b, v5.8b \n" // multiply G 2561 "umull v2.8h, v2.8b, v6.8b \n" // multiply R 2562 "umull v3.8h, v3.8b, v7.8b \n" // multiply A 2563 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B 2564 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G 2565 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R 2566 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A 2567 MEMACCESS(2) 2568 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2569 "b.gt 1b \n" 2570 2571 : "+r"(src_argb0), // %0 2572 "+r"(src_argb1), // %1 2573 "+r"(dst_argb), // %2 2574 "+r"(width) // %3 2575 : 2576 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2577 ); 2326 void ARGBMultiplyRow_NEON(const uint8* src_argb0, 2327 const uint8* src_argb1, 2328 uint8* dst_argb, 2329 int width) { 2330 asm volatile( 2331 // 8 pixel loop. 2332 "1: \n" 2333 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2334 // pixels. 2335 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more 2336 // pixels. 2337 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2338 "umull v0.8h, v0.8b, v4.8b \n" // multiply B 2339 "umull v1.8h, v1.8b, v5.8b \n" // multiply G 2340 "umull v2.8h, v2.8b, v6.8b \n" // multiply R 2341 "umull v3.8h, v3.8b, v7.8b \n" // multiply A 2342 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B 2343 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G 2344 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R 2345 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A 2346 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2347 // pixels 2348 "b.gt 1b \n" 2349 2350 : "+r"(src_argb0), // %0 2351 "+r"(src_argb1), // %1 2352 "+r"(dst_argb), // %2 2353 "+r"(width) // %3 2354 : 2355 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); 2578 2356 } 2579 2357 2580 2358 // Add 2 rows of ARGB pixels together, 8 pixels at a time. 2581 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2582 uint8* dst_argb, int width) { 2583 asm volatile ( 2584 // 8 pixel loop. 2585 "1: \n" 2586 MEMACCESS(0) 2587 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2588 MEMACCESS(1) 2589 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. 2590 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2591 "uqadd v0.8b, v0.8b, v4.8b \n" 2592 "uqadd v1.8b, v1.8b, v5.8b \n" 2593 "uqadd v2.8b, v2.8b, v6.8b \n" 2594 "uqadd v3.8b, v3.8b, v7.8b \n" 2595 MEMACCESS(2) 2596 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2597 "b.gt 1b \n" 2598 2599 : "+r"(src_argb0), // %0 2600 "+r"(src_argb1), // %1 2601 "+r"(dst_argb), // %2 2602 "+r"(width) // %3 2603 : 2604 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2605 ); 2359 void ARGBAddRow_NEON(const uint8* src_argb0, 2360 const uint8* src_argb1, 2361 uint8* dst_argb, 2362 int width) { 2363 asm volatile( 2364 // 8 pixel loop. 2365 "1: \n" 2366 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2367 // pixels. 2368 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more 2369 // pixels. 2370 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2371 "uqadd v0.8b, v0.8b, v4.8b \n" 2372 "uqadd v1.8b, v1.8b, v5.8b \n" 2373 "uqadd v2.8b, v2.8b, v6.8b \n" 2374 "uqadd v3.8b, v3.8b, v7.8b \n" 2375 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2376 // pixels 2377 "b.gt 1b \n" 2378 2379 : "+r"(src_argb0), // %0 2380 "+r"(src_argb1), // %1 2381 "+r"(dst_argb), // %2 2382 "+r"(width) // %3 2383 : 2384 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); 2606 2385 } 2607 2386 2608 2387 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. 2609 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2610 uint8* dst_argb, int width) { 2611 asm volatile ( 2612 // 8 pixel loop. 2613 "1: \n" 2614 MEMACCESS(0) 2615 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2616 MEMACCESS(1) 2617 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. 2618 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2619 "uqsub v0.8b, v0.8b, v4.8b \n" 2620 "uqsub v1.8b, v1.8b, v5.8b \n" 2621 "uqsub v2.8b, v2.8b, v6.8b \n" 2622 "uqsub v3.8b, v3.8b, v7.8b \n" 2623 MEMACCESS(2) 2624 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2625 "b.gt 1b \n" 2626 2627 : "+r"(src_argb0), // %0 2628 "+r"(src_argb1), // %1 2629 "+r"(dst_argb), // %2 2630 "+r"(width) // %3 2631 : 2632 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2633 ); 2388 void ARGBSubtractRow_NEON(const uint8* src_argb0, 2389 const uint8* src_argb1, 2390 uint8* dst_argb, 2391 int width) { 2392 asm volatile( 2393 // 8 pixel loop. 2394 "1: \n" 2395 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB 2396 // pixels. 2397 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more 2398 // pixels. 2399 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2400 "uqsub v0.8b, v0.8b, v4.8b \n" 2401 "uqsub v1.8b, v1.8b, v5.8b \n" 2402 "uqsub v2.8b, v2.8b, v6.8b \n" 2403 "uqsub v3.8b, v3.8b, v7.8b \n" 2404 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2405 // pixels 2406 "b.gt 1b \n" 2407 2408 : "+r"(src_argb0), // %0 2409 "+r"(src_argb1), // %1 2410 "+r"(dst_argb), // %2 2411 "+r"(width) // %3 2412 : 2413 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); 2634 2414 } 2635 2415 … … 2639 2419 // G = Sobel 2640 2420 // B = Sobel 2641 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 2642 uint8* dst_argb, int width) { 2643 asm volatile ( 2644 "movi v3.8b, #255 \n" // alpha 2645 // 8 pixel loop. 2646 "1: \n" 2647 MEMACCESS(0) 2648 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. 2649 MEMACCESS(1) 2650 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. 2651 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2652 "uqadd v0.8b, v0.8b, v1.8b \n" // add 2653 "orr v1.8b, v0.8b, v0.8b \n" 2654 "orr v2.8b, v0.8b, v0.8b \n" 2655 MEMACCESS(2) 2656 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2657 "b.gt 1b \n" 2658 : "+r"(src_sobelx), // %0 2659 "+r"(src_sobely), // %1 2660 "+r"(dst_argb), // %2 2661 "+r"(width) // %3 2662 : 2663 : "cc", "memory", "v0", "v1", "v2", "v3" 2664 ); 2421 void SobelRow_NEON(const uint8* src_sobelx, 2422 const uint8* src_sobely, 2423 uint8* dst_argb, 2424 int width) { 2425 asm volatile( 2426 "movi v3.8b, #255 \n" // alpha 2427 // 8 pixel loop. 2428 "1: \n" 2429 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. 2430 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. 2431 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2432 "uqadd v0.8b, v0.8b, v1.8b \n" // add 2433 "orr v1.8b, v0.8b, v0.8b \n" 2434 "orr v2.8b, v0.8b, v0.8b \n" 2435 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2436 // pixels 2437 "b.gt 1b \n" 2438 : "+r"(src_sobelx), // %0 2439 "+r"(src_sobely), // %1 2440 "+r"(dst_argb), // %2 2441 "+r"(width) // %3 2442 : 2443 : "cc", "memory", "v0", "v1", "v2", "v3"); 2665 2444 } 2666 2445 2667 2446 // Adds Sobel X and Sobel Y and stores Sobel into plane. 2668 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 2669 uint8* dst_y, int width) { 2670 asm volatile ( 2671 // 16 pixel loop. 2672 "1: \n" 2673 MEMACCESS(0) 2674 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. 2675 MEMACCESS(1) 2676 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. 2677 "subs %w3, %w3, #16 \n" // 16 processed per loop. 2678 "uqadd v0.16b, v0.16b, v1.16b \n" // add 2679 MEMACCESS(2) 2680 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. 2681 "b.gt 1b \n" 2682 : "+r"(src_sobelx), // %0 2683 "+r"(src_sobely), // %1 2684 "+r"(dst_y), // %2 2685 "+r"(width) // %3 2686 : 2687 : "cc", "memory", "v0", "v1" 2688 ); 2447 void SobelToPlaneRow_NEON(const uint8* src_sobelx, 2448 const uint8* src_sobely, 2449 uint8* dst_y, 2450 int width) { 2451 asm volatile( 2452 // 16 pixel loop. 2453 "1: \n" 2454 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. 2455 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. 2456 "subs %w3, %w3, #16 \n" // 16 processed per loop. 2457 "uqadd v0.16b, v0.16b, v1.16b \n" // add 2458 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. 2459 "b.gt 1b \n" 2460 : "+r"(src_sobelx), // %0 2461 "+r"(src_sobely), // %1 2462 "+r"(dst_y), // %2 2463 "+r"(width) // %3 2464 : 2465 : "cc", "memory", "v0", "v1"); 2689 2466 } 2690 2467 … … 2694 2471 // G = Sobel 2695 2472 // B = Sobel Y 2696 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 2697 uint8* dst_argb, int width) { 2698 asm volatile ( 2699 "movi v3.8b, #255 \n" // alpha 2700 // 8 pixel loop. 2701 "1: \n" 2702 MEMACCESS(0) 2703 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. 2704 MEMACCESS(1) 2705 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. 2706 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2707 "uqadd v1.8b, v0.8b, v2.8b \n" // add 2708 MEMACCESS(2) 2709 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2710 "b.gt 1b \n" 2711 : "+r"(src_sobelx), // %0 2712 "+r"(src_sobely), // %1 2713 "+r"(dst_argb), // %2 2714 "+r"(width) // %3 2715 : 2716 : "cc", "memory", "v0", "v1", "v2", "v3" 2717 ); 2473 void SobelXYRow_NEON(const uint8* src_sobelx, 2474 const uint8* src_sobely, 2475 uint8* dst_argb, 2476 int width) { 2477 asm volatile( 2478 "movi v3.8b, #255 \n" // alpha 2479 // 8 pixel loop. 2480 "1: \n" 2481 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. 2482 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. 2483 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2484 "uqadd v1.8b, v0.8b, v2.8b \n" // add 2485 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB 2486 // pixels 2487 "b.gt 1b \n" 2488 : "+r"(src_sobelx), // %0 2489 "+r"(src_sobely), // %1 2490 "+r"(dst_argb), // %2 2491 "+r"(width) // %3 2492 : 2493 : "cc", "memory", "v0", "v1", "v2", "v3"); 2718 2494 } 2719 2495 … … 2722 2498 // -2 0 2 2723 2499 // -1 0 1 2724 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, 2725 const uint8* src_y2, uint8* dst_sobelx, int width) { 2726 asm volatile ( 2727 "1: \n" 2728 MEMACCESS(0) 2729 "ld1 {v0.8b}, [%0],%5 \n" // top 2730 MEMACCESS(0) 2731 "ld1 {v1.8b}, [%0],%6 \n" 2732 "usubl v0.8h, v0.8b, v1.8b \n" 2733 MEMACCESS(1) 2734 "ld1 {v2.8b}, [%1],%5 \n" // center * 2 2735 MEMACCESS(1) 2736 "ld1 {v3.8b}, [%1],%6 \n" 2737 "usubl v1.8h, v2.8b, v3.8b \n" 2738 "add v0.8h, v0.8h, v1.8h \n" 2739 "add v0.8h, v0.8h, v1.8h \n" 2740 MEMACCESS(2) 2741 "ld1 {v2.8b}, [%2],%5 \n" // bottom 2742 MEMACCESS(2) 2743 "ld1 {v3.8b}, [%2],%6 \n" 2744 "subs %w4, %w4, #8 \n" // 8 pixels 2745 "usubl v1.8h, v2.8b, v3.8b \n" 2746 "add v0.8h, v0.8h, v1.8h \n" 2747 "abs v0.8h, v0.8h \n" 2748 "uqxtn v0.8b, v0.8h \n" 2749 MEMACCESS(3) 2750 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx 2751 "b.gt 1b \n" 2752 : "+r"(src_y0), // %0 2753 "+r"(src_y1), // %1 2754 "+r"(src_y2), // %2 2755 "+r"(dst_sobelx), // %3 2756 "+r"(width) // %4 2757 : "r"(2LL), // %5 2758 "r"(6LL) // %6 2759 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 2760 ); 2500 void SobelXRow_NEON(const uint8* src_y0, 2501 const uint8* src_y1, 2502 const uint8* src_y2, 2503 uint8* dst_sobelx, 2504 int width) { 2505 asm volatile( 2506 "1: \n" 2507 "ld1 {v0.8b}, [%0],%5 \n" // top 2508 "ld1 {v1.8b}, [%0],%6 \n" 2509 "usubl v0.8h, v0.8b, v1.8b \n" 2510 "ld1 {v2.8b}, [%1],%5 \n" // center * 2 2511 "ld1 {v3.8b}, [%1],%6 \n" 2512 "usubl v1.8h, v2.8b, v3.8b \n" 2513 "add v0.8h, v0.8h, v1.8h \n" 2514 "add v0.8h, v0.8h, v1.8h \n" 2515 "ld1 {v2.8b}, [%2],%5 \n" // bottom 2516 "ld1 {v3.8b}, [%2],%6 \n" 2517 "subs %w4, %w4, #8 \n" // 8 pixels 2518 "usubl v1.8h, v2.8b, v3.8b \n" 2519 "add v0.8h, v0.8h, v1.8h \n" 2520 "abs v0.8h, v0.8h \n" 2521 "uqxtn v0.8b, v0.8h \n" 2522 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx 2523 "b.gt 1b \n" 2524 : "+r"(src_y0), // %0 2525 "+r"(src_y1), // %1 2526 "+r"(src_y2), // %2 2527 "+r"(dst_sobelx), // %3 2528 "+r"(width) // %4 2529 : "r"(2LL), // %5 2530 "r"(6LL) // %6 2531 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 2532 ); 2761 2533 } 2762 2534 … … 2765 2537 // 0 0 0 2766 2538 // 1 2 1 2767 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, 2768 uint8* dst_sobely, int width) { 2769 asm volatile ( 2770 "1: \n" 2771 MEMACCESS(0) 2772 "ld1 {v0.8b}, [%0],%4 \n" // left 2773 MEMACCESS(1) 2774 "ld1 {v1.8b}, [%1],%4 \n" 2775 "usubl v0.8h, v0.8b, v1.8b \n" 2776 MEMACCESS(0) 2777 "ld1 {v2.8b}, [%0],%4 \n" // center * 2 2778 MEMACCESS(1) 2779 "ld1 {v3.8b}, [%1],%4 \n" 2780 "usubl v1.8h, v2.8b, v3.8b \n" 2781 "add v0.8h, v0.8h, v1.8h \n" 2782 "add v0.8h, v0.8h, v1.8h \n" 2783 MEMACCESS(0) 2784 "ld1 {v2.8b}, [%0],%5 \n" // right 2785 MEMACCESS(1) 2786 "ld1 {v3.8b}, [%1],%5 \n" 2787 "subs %w3, %w3, #8 \n" // 8 pixels 2788 "usubl v1.8h, v2.8b, v3.8b \n" 2789 "add v0.8h, v0.8h, v1.8h \n" 2790 "abs v0.8h, v0.8h \n" 2791 "uqxtn v0.8b, v0.8h \n" 2792 MEMACCESS(2) 2793 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely 2794 "b.gt 1b \n" 2795 : "+r"(src_y0), // %0 2796 "+r"(src_y1), // %1 2797 "+r"(dst_sobely), // %2 2798 "+r"(width) // %3 2799 : "r"(1LL), // %4 2800 "r"(6LL) // %5 2801 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 2802 ); 2803 } 2539 void SobelYRow_NEON(const uint8* src_y0, 2540 const uint8* src_y1, 2541 uint8* dst_sobely, 2542 int width) { 2543 asm volatile( 2544 "1: \n" 2545 "ld1 {v0.8b}, [%0],%4 \n" // left 2546 "ld1 {v1.8b}, [%1],%4 \n" 2547 "usubl v0.8h, v0.8b, v1.8b \n" 2548 "ld1 {v2.8b}, [%0],%4 \n" // center * 2 2549 "ld1 {v3.8b}, [%1],%4 \n" 2550 "usubl v1.8h, v2.8b, v3.8b \n" 2551 "add v0.8h, v0.8h, v1.8h \n" 2552 "add v0.8h, v0.8h, v1.8h \n" 2553 "ld1 {v2.8b}, [%0],%5 \n" // right 2554 "ld1 {v3.8b}, [%1],%5 \n" 2555 "subs %w3, %w3, #8 \n" // 8 pixels 2556 "usubl v1.8h, v2.8b, v3.8b \n" 2557 "add v0.8h, v0.8h, v1.8h \n" 2558 "abs v0.8h, v0.8h \n" 2559 "uqxtn v0.8b, v0.8h \n" 2560 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely 2561 "b.gt 1b \n" 2562 : "+r"(src_y0), // %0 2563 "+r"(src_y1), // %1 2564 "+r"(dst_sobely), // %2 2565 "+r"(width) // %3 2566 : "r"(1LL), // %4 2567 "r"(6LL) // %5 2568 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 2569 ); 2570 } 2571 2572 // Caveat - rounds float to half float whereas scaling version truncates. 2573 void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { 2574 asm volatile( 2575 "1: \n" 2576 "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts 2577 "subs %w2, %w2, #8 \n" // 8 pixels per loop 2578 "uxtl v2.4s, v1.4h \n" // 8 int's 2579 "uxtl2 v3.4s, v1.8h \n" 2580 "scvtf v2.4s, v2.4s \n" // 8 floats 2581 "scvtf v3.4s, v3.4s \n" 2582 "fcvtn v1.4h, v2.4s \n" // 8 half floats 2583 "fcvtn2 v1.8h, v3.4s \n" 2584 "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts 2585 "b.gt 1b \n" 2586 : "+r"(src), // %0 2587 "+r"(dst), // %1 2588 "+r"(width) // %2 2589 : 2590 : "cc", "memory", "v1", "v2", "v3"); 2591 } 2592 2593 void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { 2594 asm volatile( 2595 "1: \n" 2596 "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts 2597 "subs %w2, %w2, #8 \n" // 8 pixels per loop 2598 "uxtl v2.4s, v1.4h \n" // 8 int's 2599 "uxtl2 v3.4s, v1.8h \n" 2600 "scvtf v2.4s, v2.4s \n" // 8 floats 2601 "scvtf v3.4s, v3.4s \n" 2602 "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent 2603 "fmul v3.4s, v3.4s, %3.s[0] \n" 2604 "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat 2605 "uqshrn2 v1.8h, v3.4s, #13 \n" 2606 "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts 2607 "b.gt 1b \n" 2608 : "+r"(src), // %0 2609 "+r"(dst), // %1 2610 "+r"(width) // %2 2611 : "w"(scale * 1.9259299444e-34f) // %3 2612 : "cc", "memory", "v1", "v2", "v3"); 2613 } 2614 2804 2615 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 2805 2616
Note: See TracChangeset
for help on using the changeset viewer.