Changeset 5699 for pjproject/trunk/third_party/yuv/source/scale_neon64.cc

Timestamp: Nov 21, 2017 9:25:11 AM
Files: 1 edited
pjproject/trunk/third_party/yuv/source/scale_neon64.cc
--- third_party/yuv/source/scale_neon64.cc (r5633)
+++ third_party/yuv/source/scale_neon64.cc (r5699)
…
                      int dst_width) {
   (void)src_stride;
-  asm volatile (
+  asm volatile(
     "1: \n"
     // load even pixels into v0, odd into v1
     "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
     "subs %w2, %w2, #16 \n"  // 16 processed per loop
     "st1 {v1.16b}, [%1], #16 \n"  // store odd pixels
     "b.gt 1b \n"
     : "+r"(src_ptr),   // %0
       "+r"(dst),       // %1
       "+r"(dst_width)  // %2
     :
     : "v0", "v1"  // Clobber List
   );
 }
…
                            int dst_width) {
   (void)src_stride;
-  asm volatile (
+  asm volatile(
     "1: \n"
-    "ld1 {v0.16b,v1.16b}, [%0], #32 \n"  // load pixels and post inc
+    // load even pixels into v0, odd into v1
+    "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
     "subs %w2, %w2, #16 \n"  // 16 processed per loop
-    "uaddlp v0.8h, v0.16b \n"  // add adjacent
-    "uaddlp v1.8h, v1.16b \n"
-    "rshrn v0.8b, v0.8h, #1 \n"  // downshift, round and pack
-    "rshrn2 v0.16b, v1.8h, #1 \n"
+    "urhadd v0.16b, v0.16b, v1.16b \n"  // rounding half add
     "st1 {v0.16b}, [%1], #16 \n"
     "b.gt 1b \n"
     : "+r"(src_ptr),   // %0
       "+r"(dst),       // %1
       "+r"(dst_width)  // %2
     :
     : "v0", "v1"  // Clobber List
   );
 }
…
                         uint8* dst,
                         int dst_width) {
-  asm volatile (
+  asm volatile(
     // change the stride to row 2 pointer
     "add %1, %1, %0 \n"
     "1: \n"
     "ld1 {v0.16b, v1.16b}, [%0], #32 \n"  // load row 1 and post inc
     "ld1 {v2.16b, v3.16b}, [%1], #32 \n"  // load row 2 and post inc
     "subs %w3, %w3, #16 \n"  // 16 processed per loop
     "uaddlp v0.8h, v0.16b \n"  // row 1 add adjacent
     "uaddlp v1.8h, v1.16b \n"
-    "uadalp v0.8h, v2.16b \n"  // row 2 add adjacent + row1
+    "uadalp v0.8h, v2.16b \n"  // += row 2 add adjacent
     "uadalp v1.8h, v3.16b \n"
-    "rshrn v0.8b, v0.8h, #2 \n"  // downshift, round and pack
+    "rshrn v0.8b, v0.8h, #2 \n"  // round and pack
     "rshrn2 v0.16b, v1.8h, #2 \n"
     "st1 {v0.16b}, [%2], #16 \n"
     "b.gt 1b \n"
     : "+r"(src_ptr),     // %0
       "+r"(src_stride),  // %1
       "+r"(dst),         // %2
       "+r"(dst_width)    // %3
     :
     : "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
…
                      int dst_width) {
   (void)src_stride;
-  asm volatile (
+  asm volatile(
     "1: \n"
     "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
     "subs %w2, %w2, #8 \n"  // 8 processed per loop
     "st1 {v2.8b}, [%1], #8 \n"
     "b.gt 1b \n"
     : "+r"(src_ptr),   // %0
       "+r"(dst_ptr),   // %1
       "+r"(dst_width)  // %2
     :
-    : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+    : "v0", "v1", "v2", "v3", "memory", "cc");
 }
…
   const uint8* src_ptr2 = src_ptr + src_stride * 2;
   const uint8* src_ptr3 = src_ptr + src_stride * 3;
-  asm volatile (
+  asm volatile(
     "1: \n"
     "ld1 {v0.16b}, [%0], #16 \n"  // load up 16x4
     "ld1 {v1.16b}, [%2], #16 \n"
     "ld1 {v2.16b}, [%3], #16 \n"
     "ld1 {v3.16b}, [%4], #16 \n"
     "subs %w5, %w5, #4 \n"
     "uaddlp v0.8h, v0.16b \n"
     "uadalp v0.8h, v1.16b \n"
     "uadalp v0.8h, v2.16b \n"
     "uadalp v0.8h, v3.16b \n"
     "addp v0.8h, v0.8h, v0.8h \n"
     "rshrn v0.8b, v0.8h, #4 \n"  // divide by 16 w/rounding
     "st1 {v0.s}[0], [%1], #4 \n"
     "b.gt 1b \n"
     : "+r"(src_ptr),   // %0
       "+r"(dst_ptr),   // %1
       "+r"(src_ptr1),  // %2
       "+r"(src_ptr2),  // %3
       "+r"(src_ptr3),  // %4
       "+r"(dst_width)  // %5
     :
-    : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+    : "v0", "v1", "v2", "v3", "memory", "cc");
 }
…
                         int dst_width) {
   (void)src_stride;
-  asm volatile (
+  asm volatile(
     "1: \n"
     "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
     "subs %w2, %w2, #24 \n"
-    "orr v2.16b, v3.16b, v3.16b \n"  // order v0, v1, v2
+    "orr v2.16b, v3.16b, v3.16b \n"  // order v0,v1,v2
     "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
     "b.gt 1b \n"
     : "+r"(src_ptr),   // %0
       "+r"(dst_ptr),   // %1
       "+r"(dst_width)  // %2
     :
-    : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+    : "v0", "v1", "v2", "v3", "memory", "cc");
 }
…
                                uint8* dst_ptr,
                                int dst_width) {
-  asm volatile (
+  asm volatile(
     "movi v20.8b, #3 \n"
     "add %3, %3, %0 \n"
     "1: \n"
     "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
     "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
     "subs %w2, %w2, #24 \n"

     // filter src line 0 with src line 1
     // expand chars to shorts to allow for room
     // when adding lines together
     "ushll v16.8h, v4.8b, #0 \n"
     "ushll v17.8h, v5.8b, #0 \n"
     "ushll v18.8h, v6.8b, #0 \n"
     "ushll v19.8h, v7.8b, #0 \n"

     // 3 * line_0 + line_1
     "umlal v16.8h, v0.8b, v20.8b \n"
     "umlal v17.8h, v1.8b, v20.8b \n"
     "umlal v18.8h, v2.8b, v20.8b \n"
     "umlal v19.8h, v3.8b, v20.8b \n"

     // (3 * line_0 + line_1) >> 2
     "uqrshrn v0.8b, v16.8h, #2 \n"
     "uqrshrn v1.8b, v17.8h, #2 \n"
     "uqrshrn v2.8b, v18.8h, #2 \n"
     "uqrshrn v3.8b, v19.8h, #2 \n"

     // a0 = (src[0] * 3 + s[1] * 1) >> 2
     "ushll v16.8h, v1.8b, #0 \n"
     "umlal v16.8h, v0.8b, v20.8b \n"
     "uqrshrn v0.8b, v16.8h, #2 \n"

     // a1 = (src[1] * 1 + s[2] * 1) >> 1
     "urhadd v1.8b, v1.8b, v2.8b \n"

     // a2 = (src[2] * 1 + s[3] * 3) >> 2
     "ushll v16.8h, v2.8b, #0 \n"
     "umlal v16.8h, v3.8b, v20.8b \n"
     "uqrshrn v2.8b, v16.8h, #2 \n"

     "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"

     "b.gt 1b \n"
     : "+r"(src_ptr),    // %0
       "+r"(dst_ptr),    // %1
       "+r"(dst_width),  // %2
       "+r"(src_stride)  // %3
     :
-    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
-      "v20", "memory", "cc"
-  );
+    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+      "v19", "v20", "memory", "cc");
 }
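For reference, the 1/2-width kernels above boil down to simple integer averaging. A minimal scalar sketch (the helper names are made up for illustration; this is not libyuv API):

{{{
#include <stdint.h>

/* ScaleRowDown2Linear: "urhadd" is a rounding half add, (a + b + 1) >> 1. */
static void HalveRowLinear_C(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    dst[i] = (uint8_t)((src[2 * i] + src[2 * i + 1] + 1) >> 1);
  }
}

/* ScaleRowDown2Box: 2x2 box filter with rounding, (sum + 2) >> 2. In the
 * asm, uaddlp widens and sums horizontal pairs, uadalp accumulates row 2,
 * and rshrn #2 is the rounded narrowing shift. */
static void HalveRowBox_C(const uint8_t* row0, const uint8_t* row1,
                          uint8_t* dst, int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    int sum = row0[2 * i] + row0[2 * i + 1] + row1[2 * i] + row1[2 * i + 1];
    dst[i] = (uint8_t)((sum + 2) >> 2);
  }
}
}}}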
…
                                uint8* dst_ptr,
                                int dst_width) {
-  asm volatile (
+  asm volatile(
     "movi v20.8b, #3 \n"
     "add %3, %3, %0 \n"
     "1: \n"
     "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
     "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
     "subs %w2, %w2, #24 \n"
     // average src line 0 with src line 1
     "urhadd v0.8b, v0.8b, v4.8b \n"
     "urhadd v1.8b, v1.8b, v5.8b \n"
     "urhadd v2.8b, v2.8b, v6.8b \n"
     "urhadd v3.8b, v3.8b, v7.8b \n"

     // a0 = (src[0] * 3 + s[1] * 1) >> 2
     "ushll v4.8h, v1.8b, #0 \n"
     "umlal v4.8h, v0.8b, v20.8b \n"
     "uqrshrn v0.8b, v4.8h, #2 \n"

     // a1 = (src[1] * 1 + s[2] * 1) >> 1
     "urhadd v1.8b, v1.8b, v2.8b \n"

     // a2 = (src[2] * 1 + s[3] * 3) >> 2
     "ushll v4.8h, v2.8b, #0 \n"
     "umlal v4.8h, v3.8b, v20.8b \n"
     "uqrshrn v2.8b, v4.8h, #2 \n"

     "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
     "b.gt 1b \n"
     : "+r"(src_ptr),    // %0
       "+r"(dst_ptr),    // %1
       "+r"(dst_width),  // %2
       "+r"(src_stride)  // %3
     :
-    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
-  );
+    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
 }
…
                           int dst_width) {
   (void)src_stride;
-  asm volatile (
+  asm volatile(
     "ld1 {v3.16b}, [%3] \n"
     "1: \n"
     "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
     "subs %w2, %w2, #12 \n"
     "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
     "st1 {v2.8b}, [%1], #8 \n"
     "st1 {v2.s}[2], [%1], #4 \n"
     "b.gt 1b \n"
     : "+r"(src_ptr),   // %0
       "+r"(dst_ptr),   // %1
       "+r"(dst_width)  // %2
     : "r"(&kShuf38)    // %3
-    : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+    : "v0", "v1", "v2", "v3", "memory", "cc");
 }
…
   ptrdiff_t tmp_src_stride = src_stride;

-  asm volatile (
+  asm volatile(
     "ld1 {v29.8h}, [%5] \n"
     "ld1 {v30.16b}, [%6] \n"
     "ld1 {v31.8h}, [%7] \n"
     "add %2, %2, %0 \n"
     "1: \n"

     // 00 40 01 41 02 42 03 43
     // 10 50 11 51 12 52 13 53
     // 20 60 21 61 22 62 23 63
     // 30 70 31 71 32 72 33 73
     "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
     "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
     "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
     "subs %w4, %w4, #12 \n"

     // Shuffle the input data around to get align the data
     // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
     // 00 10 01 11 02 12 03 13
     // 40 50 41 51 42 52 43 53
     "trn1 v20.8b, v0.8b, v1.8b \n"
     "trn2 v21.8b, v0.8b, v1.8b \n"
     "trn1 v22.8b, v4.8b, v5.8b \n"
     "trn2 v23.8b, v4.8b, v5.8b \n"
     "trn1 v24.8b, v16.8b, v17.8b \n"
     "trn2 v25.8b, v16.8b, v17.8b \n"

     // 20 30 21 31 22 32 23 33
     // 60 70 61 71 62 72 63 73
     "trn1 v0.8b, v2.8b, v3.8b \n"
     "trn2 v1.8b, v2.8b, v3.8b \n"
     "trn1 v4.8b, v6.8b, v7.8b \n"
     "trn2 v5.8b, v6.8b, v7.8b \n"
     "trn1 v16.8b, v18.8b, v19.8b \n"
     "trn2 v17.8b, v18.8b, v19.8b \n"

     // 00+10 01+11 02+12 03+13
     // 40+50 41+51 42+52 43+53
     "uaddlp v20.4h, v20.8b \n"
     "uaddlp v21.4h, v21.8b \n"
     "uaddlp v22.4h, v22.8b \n"
     "uaddlp v23.4h, v23.8b \n"
     "uaddlp v24.4h, v24.8b \n"
     "uaddlp v25.4h, v25.8b \n"

     // 60+70 61+71 62+72 63+73
     "uaddlp v1.4h, v1.8b \n"
     "uaddlp v5.4h, v5.8b \n"
     "uaddlp v17.4h, v17.8b \n"

     // combine source lines
     "add v20.4h, v20.4h, v22.4h \n"
     "add v21.4h, v21.4h, v23.4h \n"
     "add v20.4h, v20.4h, v24.4h \n"
     "add v21.4h, v21.4h, v25.4h \n"
     "add v2.4h, v1.4h, v5.4h \n"
     "add v2.4h, v2.4h, v17.4h \n"

     // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
     //            + s[6 + st * 1] + s[7 + st * 1]
     //            + s[6 + st * 2] + s[7 + st * 2]) / 6
     "sqrdmulh v2.8h, v2.8h, v29.8h \n"
     "xtn v2.8b, v2.8h \n"

     // Shuffle 2,3 reg around so that 2 can be added to the
     // 0,1 reg and 3 can be added to the 4,5 reg. This
     // requires expanding from u8 to u16 as the 0,1 and 4,5
     // registers are already expanded. Then do transposes
     // to get aligned.
     // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
     "ushll v16.8h, v16.8b, #0 \n"
     "uaddl v0.8h, v0.8b, v4.8b \n"

     // combine source lines
     "add v0.8h, v0.8h, v16.8h \n"

     // xx 20 xx 21 xx 22 xx 23
     // xx 30 xx 31 xx 32 xx 33
     "trn1 v1.8h, v0.8h, v0.8h \n"
     "trn2 v4.8h, v0.8h, v0.8h \n"
     "xtn v0.4h, v1.4s \n"
     "xtn v4.4h, v4.4s \n"

     // 0+1+2, 3+4+5
     "add v20.8h, v20.8h, v0.8h \n"
     "add v21.8h, v21.8h, v4.8h \n"

     // Need to divide, but can't downshift as the the value
     // isn't a power of 2. So multiply by 65536 / n
     // and take the upper 16 bits.
     "sqrdmulh v0.8h, v20.8h, v31.8h \n"
     "sqrdmulh v1.8h, v21.8h, v31.8h \n"

-    // Align for table lookup, vtbl requires registers to
-    // be adjacent
+    // Align for table lookup, vtbl requires registers to be adjacent
     "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"

     "st1 {v3.8b}, [%1], #8 \n"
     "st1 {v3.s}[2], [%1], #4 \n"
     "b.gt 1b \n"
     : "+r"(src_ptr),         // %0
       "+r"(dst_ptr),         // %1
       "+r"(tmp_src_stride),  // %2
       "+r"(src_ptr1),        // %3
       "+r"(dst_width)        // %4
     : "r"(&kMult38_Div6),    // %5
       "r"(&kShuf38_2),       // %6
       "r"(&kMult38_Div9)     // %7
-    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
-      "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
-      "v30", "v31", "memory", "cc"
-  );
+    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+      "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
+      "memory", "cc");
 }
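The "multiply by 65536 / n and take the upper 16 bits" comment above is the standard fixed-point substitute for division. A scalar model of the sqrdmulh step (sqrdmulh computes (2*a*b + 32768) >> 16 with saturation, so the multiplier is assumed to be half the reciprocal, 65536 / 12 — treat that constant as an assumption about kMult38_Div6, not a verified value):

{{{
#include <stdint.h>

/* Illustrative helper, not libyuv API: rounded divide-by-6 via
 * multiply-high, matching the sqrdmulh semantics described above. */
static uint8_t DivideBy6_Fixed(int16_t sum) {
  const int16_t kHalfRecip6 = 65536 / 12;  /* assumed table entry */
  return (uint8_t)((2 * sum * kHalfRecip6 + (1 << 15)) >> 16);
}
}}}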
…
   // TODO(fbarchard): use src_stride directly for clang 3.5+.
   ptrdiff_t tmp_src_stride = src_stride;
-  asm volatile (
+  asm volatile(
     "ld1 {v30.8h}, [%4] \n"
     "ld1 {v31.16b}, [%5] \n"
     "add %2, %2, %0 \n"
     "1: \n"

     // 00 40 01 41 02 42 03 43
     // 10 50 11 51 12 52 13 53
     // 20 60 21 61 22 62 23 63
     // 30 70 31 71 32 72 33 73
     "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
     "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
     "subs %w3, %w3, #12 \n"

     // Shuffle the input data around to get align the data
     // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
     // 00 10 01 11 02 12 03 13
     // 40 50 41 51 42 52 43 53
     "trn1 v16.8b, v0.8b, v1.8b \n"
     "trn2 v17.8b, v0.8b, v1.8b \n"
     "trn1 v18.8b, v4.8b, v5.8b \n"
     "trn2 v19.8b, v4.8b, v5.8b \n"

     // 20 30 21 31 22 32 23 33
     // 60 70 61 71 62 72 63 73
     "trn1 v0.8b, v2.8b, v3.8b \n"
     "trn2 v1.8b, v2.8b, v3.8b \n"
     "trn1 v4.8b, v6.8b, v7.8b \n"
     "trn2 v5.8b, v6.8b, v7.8b \n"

     // 00+10 01+11 02+12 03+13
     // 40+50 41+51 42+52 43+53
     "uaddlp v16.4h, v16.8b \n"
     "uaddlp v17.4h, v17.8b \n"
     "uaddlp v18.4h, v18.8b \n"
     "uaddlp v19.4h, v19.8b \n"

     // 60+70 61+71 62+72 63+73
     "uaddlp v1.4h, v1.8b \n"
     "uaddlp v5.4h, v5.8b \n"

     // combine source lines
     "add v16.4h, v16.4h, v18.4h \n"
     "add v17.4h, v17.4h, v19.4h \n"
     "add v2.4h, v1.4h, v5.4h \n"

     // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
     "uqrshrn v2.8b, v2.8h, #2 \n"

     // Shuffle 2,3 reg around so that 2 can be added to the
     // 0,1 reg and 3 can be added to the 4,5 reg. This
     // requires expanding from u8 to u16 as the 0,1 and 4,5
     // registers are already expanded. Then do transposes
     // to get aligned.
     // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33

     // combine source lines
     "uaddl v0.8h, v0.8b, v4.8b \n"

     // xx 20 xx 21 xx 22 xx 23
     // xx 30 xx 31 xx 32 xx 33
     "trn1 v1.8h, v0.8h, v0.8h \n"
     "trn2 v4.8h, v0.8h, v0.8h \n"
     "xtn v0.4h, v1.4s \n"
     "xtn v4.4h, v4.4s \n"

     // 0+1+2, 3+4+5
     "add v16.8h, v16.8h, v0.8h \n"
     "add v17.8h, v17.8h, v4.8h \n"

     // Need to divide, but can't downshift as the the value
     // isn't a power of 2. So multiply by 65536 / n
     // and take the upper 16 bits.
     "sqrdmulh v0.8h, v16.8h, v30.8h \n"
     "sqrdmulh v1.8h, v17.8h, v30.8h \n"

     // Align for table lookup, vtbl requires registers to
     // be adjacent

     "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"

     "st1 {v3.8b}, [%1], #8 \n"
     "st1 {v3.s}[2], [%1], #4 \n"
     "b.gt 1b \n"
     : "+r"(src_ptr),         // %0
       "+r"(dst_ptr),         // %1
       "+r"(tmp_src_stride),  // %2
       "+r"(dst_width)        // %3
     : "r"(&kMult38_Div6),    // %4
       "r"(&kShuf38_2)        // %5
-    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
-      "v18", "v19", "v30", "v31", "memory", "cc"
-  );
+    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+      "v19", "v30", "v31", "memory", "cc");
 }
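A scalar reference for the two-row 3/8 kernel above, patterned after the C versions in libyuv's scale_common.cc (treat the exact formula as an assumption): every 8 source columns yield 3 output pixels.

{{{
#include <stdint.h>

/* Hypothetical helper for illustration; s is row 0, t is row 1. */
static void ScaleRowDown38_2_Box_C(const uint8_t* s, const uint8_t* t,
                                   uint8_t* d, int dst_width) {
  for (int x = 0; x < dst_width; x += 3, s += 8, t += 8, d += 3) {
    d[0] = (uint8_t)((s[0] + s[1] + s[2] + t[0] + t[1] + t[2]) *
                     (65536 / 6) >> 16);
    d[1] = (uint8_t)((s[3] + s[4] + s[5] + t[3] + t[4] + t[5]) *
                     (65536 / 6) >> 16);
    d[2] = (uint8_t)((s[6] + s[7] + t[6] + t[7] + 2) >> 2);
  }
}
}}}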
\n" // store pixels 535 "add %1, %1, #16 \n" 536 "subs %w4, %w4, #16 \n" // 16 processed per loop 537 "b.gt 1b \n" 538 : "=&r"(src_tmp), // %0 539 "+r"(src_ptr), // %1 540 "+r"(dst_ptr), // %2 541 "+r"(src_stride), // %3 542 "+r"(src_width), // %4 543 "+r"(src_height) // %5 544 : 545 : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List 546 ); 547 } 548 561 549 // TODO(Yang Zhang): Investigate less load instructions for 562 550 // the x/dx stepping 563 #define LOAD2_DATA8_LANE(n) 564 "lsr %5, %3, #16 \n" 565 "add %6, %1, %5 \n" 566 "add %3, %3, %4 \n" 551 #define LOAD2_DATA8_LANE(n) \ 552 "lsr %5, %3, #16 \n" \ 553 "add %6, %1, %5 \n" \ 554 "add %3, %3, %4 \n" \ 567 555 "ld2 {v4.b, v5.b}[" #n "], [%6] \n" 568 // clang-format on569 556 570 557 // The NEON version mimics this formula (from row_common.cc): … … 580 567 int* tmp = dx_offset; 581 568 const uint8* src_tmp = src_ptr; 582 int64 x64 = (int64)x; 583 int64 dx64 = (int64)dx; 569 int64 x64 = (int64)x; // NOLINT 570 int64 dx64 = (int64)dx; // NOLINT 584 571 asm volatile ( 585 572 "dup v0.4s, %w3 \n" // x … … 645 632 int source_y_fraction) { 646 633 int y_fraction = 256 - source_y_fraction; 647 asm volatile ( 648 "cmp %w4, #0 \n" 649 "b.eq 100f \n" 650 "add %2, %2, %1 \n" 651 "cmp %w4, #64 \n" 652 "b.eq 75f \n" 653 "cmp %w4, #128 \n" 654 "b.eq 50f \n" 655 "cmp %w4, #192 \n" 656 "b.eq 25f \n" 657 658 "dup v5.8b, %w4 \n" 659 "dup v4.8b, %w5 \n" 660 // General purpose row blend. 661 "1: \n" 662 "ld1 {v0.16b}, [%1], #16 \n" 663 "ld1 {v1.16b}, [%2], #16 \n" 664 "subs %w3, %w3, #16 \n" 665 "umull v6.8h, v0.8b, v4.8b \n" 666 "umull2 v7.8h, v0.16b, v4.16b \n" 667 "umlal v6.8h, v1.8b, v5.8b \n" 668 "umlal2 v7.8h, v1.16b, v5.16b \n" 669 "rshrn v0.8b, v6.8h, #8 \n" 670 "rshrn2 v0.16b, v7.8h, #8 \n" 671 "st1 {v0.16b}, [%0], #16 \n" 672 "b.gt 1b \n" 673 "b 99f \n" 674 675 // Blend 25 / 75. 676 "25: \n" 677 "ld1 {v0.16b}, [%1], #16 \n" 678 "ld1 {v1.16b}, [%2], #16 \n" 679 "subs %w3, %w3, #16 \n" 680 "urhadd v0.16b, v0.16b, v1.16b \n" 681 "urhadd v0.16b, v0.16b, v1.16b \n" 682 "st1 {v0.16b}, [%0], #16 \n" 683 "b.gt 25b \n" 684 "b 99f \n" 685 686 // Blend 50 / 50. 687 "50: \n" 688 "ld1 {v0.16b}, [%1], #16 \n" 689 "ld1 {v1.16b}, [%2], #16 \n" 690 "subs %w3, %w3, #16 \n" 691 "urhadd v0.16b, v0.16b, v1.16b \n" 692 "st1 {v0.16b}, [%0], #16 \n" 693 "b.gt 50b \n" 694 "b 99f \n" 695 696 // Blend 75 / 25. 697 "75: \n" 698 "ld1 {v1.16b}, [%1], #16 \n" 699 "ld1 {v0.16b}, [%2], #16 \n" 700 "subs %w3, %w3, #16 \n" 701 "urhadd v0.16b, v0.16b, v1.16b \n" 702 "urhadd v0.16b, v0.16b, v1.16b \n" 703 "st1 {v0.16b}, [%0], #16 \n" 704 "b.gt 75b \n" 705 "b 99f \n" 706 707 // Blend 100 / 0 - Copy row unchanged. 708 "100: \n" 709 "ld1 {v0.16b}, [%1], #16 \n" 710 "subs %w3, %w3, #16 \n" 711 "st1 {v0.16b}, [%0], #16 \n" 712 "b.gt 100b \n" 713 714 "99: \n" 715 "st1 {v0.b}[15], [%0] \n" 716 : "+r"(dst_ptr), // %0 717 "+r"(src_ptr), // %1 718 "+r"(src_stride), // %2 719 "+r"(dst_width), // %3 720 "+r"(source_y_fraction),// %4 721 "+r"(y_fraction) // %5 722 : 723 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" 724 ); 634 asm volatile( 635 "cmp %w4, #0 \n" 636 "b.eq 100f \n" 637 "add %2, %2, %1 \n" 638 "cmp %w4, #64 \n" 639 "b.eq 75f \n" 640 "cmp %w4, #128 \n" 641 "b.eq 50f \n" 642 "cmp %w4, #192 \n" 643 "b.eq 25f \n" 644 645 "dup v5.8b, %w4 \n" 646 "dup v4.8b, %w5 \n" 647 // General purpose row blend. 
648 "1: \n" 649 "ld1 {v0.16b}, [%1], #16 \n" 650 "ld1 {v1.16b}, [%2], #16 \n" 651 "subs %w3, %w3, #16 \n" 652 "umull v6.8h, v0.8b, v4.8b \n" 653 "umull2 v7.8h, v0.16b, v4.16b \n" 654 "umlal v6.8h, v1.8b, v5.8b \n" 655 "umlal2 v7.8h, v1.16b, v5.16b \n" 656 "rshrn v0.8b, v6.8h, #8 \n" 657 "rshrn2 v0.16b, v7.8h, #8 \n" 658 "st1 {v0.16b}, [%0], #16 \n" 659 "b.gt 1b \n" 660 "b 99f \n" 661 662 // Blend 25 / 75. 663 "25: \n" 664 "ld1 {v0.16b}, [%1], #16 \n" 665 "ld1 {v1.16b}, [%2], #16 \n" 666 "subs %w3, %w3, #16 \n" 667 "urhadd v0.16b, v0.16b, v1.16b \n" 668 "urhadd v0.16b, v0.16b, v1.16b \n" 669 "st1 {v0.16b}, [%0], #16 \n" 670 "b.gt 25b \n" 671 "b 99f \n" 672 673 // Blend 50 / 50. 674 "50: \n" 675 "ld1 {v0.16b}, [%1], #16 \n" 676 "ld1 {v1.16b}, [%2], #16 \n" 677 "subs %w3, %w3, #16 \n" 678 "urhadd v0.16b, v0.16b, v1.16b \n" 679 "st1 {v0.16b}, [%0], #16 \n" 680 "b.gt 50b \n" 681 "b 99f \n" 682 683 // Blend 75 / 25. 684 "75: \n" 685 "ld1 {v1.16b}, [%1], #16 \n" 686 "ld1 {v0.16b}, [%2], #16 \n" 687 "subs %w3, %w3, #16 \n" 688 "urhadd v0.16b, v0.16b, v1.16b \n" 689 "urhadd v0.16b, v0.16b, v1.16b \n" 690 "st1 {v0.16b}, [%0], #16 \n" 691 "b.gt 75b \n" 692 "b 99f \n" 693 694 // Blend 100 / 0 - Copy row unchanged. 695 "100: \n" 696 "ld1 {v0.16b}, [%1], #16 \n" 697 "subs %w3, %w3, #16 \n" 698 "st1 {v0.16b}, [%0], #16 \n" 699 "b.gt 100b \n" 700 701 "99: \n" 702 "st1 {v0.b}[15], [%0] \n" 703 : "+r"(dst_ptr), // %0 704 "+r"(src_ptr), // %1 705 "+r"(src_stride), // %2 706 "+r"(dst_width), // %3 707 "+r"(source_y_fraction), // %4 708 "+r"(y_fraction) // %5 709 : 710 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); 725 711 } 726 712 … … 730 716 int dst_width) { 731 717 (void)src_stride; 732 asm volatile ( 733 "1: \n" 734 // load even pixels into q0, odd into q1 735 "ld2 {v0.4s, v1.4s}, [%0], #32 \n" 736 "ld2 {v2.4s, v3.4s}, [%0], #32 \n" 737 "subs %w2, %w2, #8 \n" // 8 processed per loop 738 "st1 {v1.16b}, [%1], #16 \n" // store odd pixels 739 "st1 {v3.16b}, [%1], #16 \n" 740 "b.gt 1b \n" 741 : "+r" (src_ptr), // %0 742 "+r" (dst), // %1 743 "+r" (dst_width) // %2 744 : 745 : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List 746 ); 718 asm volatile( 719 "1: \n" 720 // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 721 "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" 722 "subs %w2, %w2, #8 \n" // 8 processed per loop 723 "mov v2.16b, v3.16b \n" 724 "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels 725 "b.gt 1b \n" 726 : "+r"(src_ptr), // %0 727 "+r"(dst), // %1 728 "+r"(dst_width) // %2 729 : 730 : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List 731 ); 747 732 } 748 733 … … 752 737 int dst_width) { 753 738 (void)src_stride; 754 asm volatile ( 755 "1: \n" 756 // load 8 ARGB pixels. 757 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" 758 "subs %w2, %w2, #8 \n" // 8 processed per loop. 759 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 760 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 761 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 762 "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. 
763 "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack 764 "rshrn v1.8b, v1.8h, #1 \n" 765 "rshrn v2.8b, v2.8h, #1 \n" 766 "rshrn v3.8b, v3.8h, #1 \n" 767 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" 768 "b.gt 1b \n" 769 : "+r"(src_argb), // %0 770 "+r"(dst_argb), // %1 771 "+r"(dst_width) // %2 772 : 773 : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List 774 ); 739 asm volatile( 740 "1: \n" 741 // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 742 "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" 743 "subs %w2, %w2, #8 \n" // 8 processed per loop 744 745 "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add 746 "urhadd v1.16b, v2.16b, v3.16b \n" 747 "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels 748 "b.gt 1b \n" 749 : "+r"(src_argb), // %0 750 "+r"(dst_argb), // %1 751 "+r"(dst_width) // %2 752 : 753 : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List 754 ); 775 755 } 776 756 … … 779 759 uint8* dst, 780 760 int dst_width) { 781 asm volatile ( 782 // change the stride to row 2 pointer 783 "add %1, %1, %0 \n" 784 "1: \n" 785 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. 786 "subs %w3, %w3, #8 \n" // 8 processed per loop. 787 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 788 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 789 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 790 "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. 791 "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. 792 "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. 793 "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. 794 "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. 795 "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. 796 "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack 797 "rshrn v1.8b, v1.8h, #2 \n" 798 "rshrn v2.8b, v2.8h, #2 \n" 799 "rshrn v3.8b, v3.8h, #2 \n" 800 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" 801 "b.gt 1b \n" 802 : "+r" (src_ptr), // %0 803 "+r" (src_stride), // %1 804 "+r" (dst), // %2 805 "+r" (dst_width) // %3 806 : 807 : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19" 808 ); 761 asm volatile( 762 // change the stride to row 2 pointer 763 "add %1, %1, %0 \n" 764 "1: \n" 765 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB 766 "subs %w3, %w3, #8 \n" // 8 processed per loop. 767 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 768 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 769 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 770 "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. 771 "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 772 "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. 773 "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. 774 "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. 775 "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. 
776 "rshrn v0.8b, v0.8h, #2 \n" // round and pack 777 "rshrn v1.8b, v1.8h, #2 \n" 778 "rshrn v2.8b, v2.8h, #2 \n" 779 "rshrn v3.8b, v3.8h, #2 \n" 780 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" 781 "b.gt 1b \n" 782 : "+r"(src_ptr), // %0 783 "+r"(src_stride), // %1 784 "+r"(dst), // %2 785 "+r"(dst_width) // %3 786 : 787 : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); 809 788 } 810 789 … … 817 796 int dst_width) { 818 797 (void)src_stride; 819 asm volatile ( 820 "1: \n" 821 "ld1 {v0.s}[0], [%0], %3 \n" 822 "ld1 {v0.s}[1], [%0], %3 \n" 823 "ld1 {v0.s}[2], [%0], %3 \n" 824 "ld1 {v0.s}[3], [%0], %3 \n" 825 "subs %w2, %w2, #4 \n" // 4 pixels per loop. 826 "st1 {v0.16b}, [%1], #16 \n" 827 "b.gt 1b \n" 828 : "+r"(src_argb), // %0 829 "+r"(dst_argb), // %1 830 "+r"(dst_width) // %2 831 : "r"((int64)(src_stepx * 4)) // %3 832 : "memory", "cc", "v0" 833 ); 798 asm volatile( 799 "1: \n" 800 "ld1 {v0.s}[0], [%0], %3 \n" 801 "ld1 {v0.s}[1], [%0], %3 \n" 802 "ld1 {v0.s}[2], [%0], %3 \n" 803 "ld1 {v0.s}[3], [%0], %3 \n" 804 "subs %w2, %w2, #4 \n" // 4 pixels per loop. 805 "st1 {v0.16b}, [%1], #16 \n" 806 "b.gt 1b \n" 807 : "+r"(src_argb), // %0 808 "+r"(dst_argb), // %1 809 "+r"(dst_width) // %2 810 : "r"((int64)(src_stepx * 4)) // %3 811 : "memory", "cc", "v0"); 834 812 } 835 813 … … 843 821 uint8* dst_argb, 844 822 int dst_width) { 845 asm volatile ( 846 "add %1, %1, %0 \n" 847 "1: \n" 848 "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 849 "ld1 {v1.8b}, [%1], %4 \n" 850 "ld1 {v2.8b}, [%0], %4 \n" 851 "ld1 {v3.8b}, [%1], %4 \n" 852 "ld1 {v4.8b}, [%0], %4 \n" 853 "ld1 {v5.8b}, [%1], %4 \n" 854 "ld1 {v6.8b}, [%0], %4 \n" 855 "ld1 {v7.8b}, [%1], %4 \n" 856 "uaddl v0.8h, v0.8b, v1.8b \n" 857 "uaddl v2.8h, v2.8b, v3.8b \n" 858 "uaddl v4.8h, v4.8b, v5.8b \n" 859 "uaddl v6.8h, v6.8b, v7.8b \n" 860 "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd 861 "mov v0.d[1], v2.d[0] \n" 862 "mov v2.d[0], v16.d[1] \n" 863 "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh 864 "mov v4.d[1], v6.d[0] \n" 865 "mov v6.d[0], v16.d[1] \n" 866 "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) 867 "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) 868 "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. 869 "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. 870 "subs %w3, %w3, #4 \n" // 4 pixels per loop. 871 "st1 {v0.16b}, [%2], #16 \n" 872 "b.gt 1b \n" 873 : "+r"(src_argb), // %0 874 "+r"(src_stride), // %1 875 "+r"(dst_argb), // %2 876 "+r"(dst_width) // %3 877 : "r"((int64)(src_stepx * 4)) // %4 878 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 879 ); 880 } 881 882 // clang-format off 823 asm volatile( 824 "add %1, %1, %0 \n" 825 "1: \n" 826 "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 827 "ld1 {v1.8b}, [%1], %4 \n" 828 "ld1 {v2.8b}, [%0], %4 \n" 829 "ld1 {v3.8b}, [%1], %4 \n" 830 "ld1 {v4.8b}, [%0], %4 \n" 831 "ld1 {v5.8b}, [%1], %4 \n" 832 "ld1 {v6.8b}, [%0], %4 \n" 833 "ld1 {v7.8b}, [%1], %4 \n" 834 "uaddl v0.8h, v0.8b, v1.8b \n" 835 "uaddl v2.8h, v2.8b, v3.8b \n" 836 "uaddl v4.8h, v4.8b, v5.8b \n" 837 "uaddl v6.8h, v6.8b, v7.8b \n" 838 "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd 839 "mov v0.d[1], v2.d[0] \n" 840 "mov v2.d[0], v16.d[1] \n" 841 "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh 842 "mov v4.d[1], v6.d[0] \n" 843 "mov v6.d[0], v16.d[1] \n" 844 "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) 845 "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) 846 "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. 847 "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. 848 "subs %w3, %w3, #4 \n" // 4 pixels per loop. 
849 "st1 {v0.16b}, [%2], #16 \n" 850 "b.gt 1b \n" 851 : "+r"(src_argb), // %0 852 "+r"(src_stride), // %1 853 "+r"(dst_argb), // %2 854 "+r"(dst_width) // %3 855 : "r"((int64)(src_stepx * 4)) // %4 856 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); 857 } 858 883 859 // TODO(Yang Zhang): Investigate less load instructions for 884 860 // the x/dx stepping 885 #define LOAD1_DATA32_LANE(vn, n) 886 "lsr %5, %3, #16 \n" 887 "add %6, %1, %5, lsl #2 \n" 888 "add %3, %3, %4 \n" 861 #define LOAD1_DATA32_LANE(vn, n) \ 862 "lsr %5, %3, #16 \n" \ 863 "add %6, %1, %5, lsl #2 \n" \ 864 "add %3, %3, %4 \n" \ 889 865 "ld1 {" #vn ".s}[" #n "], [%6] \n" 890 // clang-format on891 866 892 867 void ScaleARGBCols_NEON(uint8* dst_argb, … … 896 871 int dx) { 897 872 const uint8* src_tmp = src_argb; 898 int64 x64 = (int64)x; 899 int64 dx64 = (int64)dx; 873 int64 x64 = (int64)x; // NOLINT 874 int64 dx64 = (int64)dx; // NOLINT 900 875 int64 tmp64; 901 asm volatile 902 "1:\n"903 LOAD1_DATA32_LANE(v0, 0)904 LOAD1_DATA32_LANE(v0, 1)905 LOAD1_DATA32_LANE(v0, 2)906 LOAD1_DATA32_LANE(v0, 3)907 LOAD1_DATA32_LANE(v1, 0)908 LOAD1_DATA32_LANE(v1, 1)909 LOAD1_DATA32_LANE(v1, 2)910 LOAD1_DATA32_LANE(v1, 3)911 912 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels913 "subs %w2, %w2, #8 \n" // 8 processed per loop914 "b.gt 1b \n"915 : "+r"(dst_argb), // %0916 "+r"(src_argb), // %1917 "+r"(dst_width), // %2918 "+r"(x64), // %3919 "+r"(dx64), // %4920 "=&r"(tmp64), // %5921 "+r"(src_tmp) // %6922 :923 : "memory", "cc", "v0", "v1"924 );876 asm volatile( 877 "1: \n" 878 // clang-format off 879 LOAD1_DATA32_LANE(v0, 0) 880 LOAD1_DATA32_LANE(v0, 1) 881 LOAD1_DATA32_LANE(v0, 2) 882 LOAD1_DATA32_LANE(v0, 3) 883 LOAD1_DATA32_LANE(v1, 0) 884 LOAD1_DATA32_LANE(v1, 1) 885 LOAD1_DATA32_LANE(v1, 2) 886 LOAD1_DATA32_LANE(v1, 3) 887 // clang-format on 888 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels 889 "subs %w2, %w2, #8 \n" // 8 processed per loop 890 "b.gt 1b \n" 891 : "+r"(dst_argb), // %0 892 "+r"(src_argb), // %1 893 "+r"(dst_width), // %2 894 "+r"(x64), // %3 895 "+r"(dx64), // %4 896 "=&r"(tmp64), // %5 897 "+r"(src_tmp) // %6 898 : 899 : "memory", "cc", "v0", "v1"); 925 900 } 926 901 927 902 #undef LOAD1_DATA32_LANE 928 903 929 // clang-format off930 904 // TODO(Yang Zhang): Investigate less load instructions for 931 905 // the x/dx stepping 932 #define LOAD2_DATA32_LANE(vn1, vn2, n) 933 "lsr %5, %3, #16 \n" 934 "add %6, %1, %5, lsl #2 \n" 935 "add %3, %3, %4 \n" 906 #define LOAD2_DATA32_LANE(vn1, vn2, n) \ 907 "lsr %5, %3, #16 \n" \ 908 "add %6, %1, %5, lsl #2 \n" \ 909 "add %3, %3, %4 \n" \ 936 910 "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" 937 // clang-format on938 911 939 912 void ScaleARGBFilterCols_NEON(uint8* dst_argb, … … 945 918 int* tmp = dx_offset; 946 919 const uint8* src_tmp = src_argb; 947 int64 x64 = (int64)x; 948 int64 dx64 = (int64)dx; 920 int64 x64 = (int64)x; // NOLINT 921 int64 dx64 = (int64)dx; // NOLINT 949 922 asm volatile ( 950 923 "dup v0.4s, %w3 \n" // x … … 1002 975 #undef LOAD2_DATA32_LANE 1003 976 977 // Read 16x2 average down and write 8x1. 
978 void ScaleRowDown2Box_16_NEON(const uint16* src_ptr, 979 ptrdiff_t src_stride, 980 uint16* dst, 981 int dst_width) { 982 asm volatile( 983 // change the stride to row 2 pointer 984 "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 985 "1: \n" 986 "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc 987 "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc 988 "subs %w3, %w3, #8 \n" // 8 processed per loop 989 "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent 990 "uaddlp v1.4s, v1.8h \n" 991 "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent 992 "uadalp v1.4s, v3.8h \n" 993 "rshrn v0.4h, v0.4s, #2 \n" // round and pack 994 "rshrn2 v0.8h, v1.4s, #2 \n" 995 "st1 {v0.8h}, [%2], #16 \n" 996 "b.gt 1b \n" 997 : "+r"(src_ptr), // %0 998 "+r"(src_stride), // %1 999 "+r"(dst), // %2 1000 "+r"(dst_width) // %3 1001 : 1002 : "v0", "v1", "v2", "v3" // Clobber List 1003 ); 1004 } 1005 1006 // Read 8x2 upsample with filtering and write 16x1. 1007 // Actually reads an extra pixel, so 9x2. 1008 void ScaleRowUp2_16_NEON(const uint16* src_ptr, 1009 ptrdiff_t src_stride, 1010 uint16* dst, 1011 int dst_width) { 1012 asm volatile( 1013 "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 1014 "movi v0.8h, #9 \n" // constants 1015 "movi v1.4s, #3 \n" 1016 1017 "1: \n" 1018 "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 1019 "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 1020 "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row 1021 "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 1022 "subs %w3, %w3, #16 \n" // 16 dst pixels per loop 1023 "umull v16.4s, v3.4h, v0.4h \n" 1024 "umull2 v7.4s, v3.8h, v0.8h \n" 1025 "umull v18.4s, v4.4h, v0.4h \n" 1026 "umull2 v17.4s, v4.8h, v0.8h \n" 1027 "uaddw v16.4s, v16.4s, v6.4h \n" 1028 "uaddl2 v19.4s, v6.8h, v3.8h \n" 1029 "uaddl v3.4s, v6.4h, v3.4h \n" 1030 "uaddw2 v6.4s, v7.4s, v6.8h \n" 1031 "uaddl2 v7.4s, v5.8h, v4.8h \n" 1032 "uaddl v4.4s, v5.4h, v4.4h \n" 1033 "uaddw v18.4s, v18.4s, v5.4h \n" 1034 "mla v16.4s, v4.4s, v1.4s \n" 1035 "mla v18.4s, v3.4s, v1.4s \n" 1036 "mla v6.4s, v7.4s, v1.4s \n" 1037 "uaddw2 v4.4s, v17.4s, v5.8h \n" 1038 "uqrshrn v16.4h, v16.4s, #4 \n" 1039 "mla v4.4s, v19.4s, v1.4s \n" 1040 "uqrshrn2 v16.8h, v6.4s, #4 \n" 1041 "uqrshrn v17.4h, v18.4s, #4 \n" 1042 "uqrshrn2 v17.8h, v4.4s, #4 \n" 1043 "st2 {v16.8h-v17.8h}, [%2], #32 \n" 1044 "b.gt 1b \n" 1045 : "+r"(src_ptr), // %0 1046 "+r"(src_stride), // %1 1047 "+r"(dst), // %2 1048 "+r"(dst_width) // %3 1049 : "r"(2LL), // %4 1050 "r"(14LL) // %5 1051 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", 1052 "v19" // Clobber List 1053 ); 1054 } 1055 1004 1056 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 1005 1057
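The newly added ScaleRowUp2_16_NEON mixes each 2x2 neighborhood with 9:3:3:1 bilinear weights. A scalar form of one output pair (illustrative helper, not libyuv API):

{{{
#include <stdint.h>

/* r0/r1 are the two source rows; "uqrshrn #4" is a saturating rounded >> 4.
 * The maximum weighted sum, 16 * 65535, still fits after the shift, so no
 * saturation occurs for valid 16-bit input. */
static void Up2Pair_C(const uint16_t* r0, const uint16_t* r1, int x,
                      uint16_t* dst) {
  uint32_t tl = r0[x], tr = r0[x + 1];
  uint32_t bl = r1[x], br = r1[x + 1];
  dst[0] = (uint16_t)((9 * tl + 3 * tr + 3 * bl + 1 * br + 8) >> 4);
  dst[1] = (uint16_t)((3 * tl + 9 * tr + 1 * bl + 3 * br + 8) >> 4);
}
}}}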