Changeset 5633 for pjproject/trunk/third_party/yuv/source/scale_win.cc
- Timestamp: Jul 28, 2017 2:51:44 AM (7 years ago)
- File: 1 edited
pjproject/trunk/third_party/yuv/source/scale_win.cc
r5358 r5633 21 21 22 22 // Offsets for source bytes 0 to 9 23 static uvec8 kShuf0 = 24 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128};23 static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, 24 128, 128, 128, 128, 128, 128, 128, 128}; 25 25 26 26 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. 27 static uvec8 kShuf1 = 28 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128};27 static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, 28 128, 128, 128, 128, 128, 128, 128, 128}; 29 29 30 30 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 31 static uvec8 kShuf2 = 32 { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128};31 static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, 32 128, 128, 128, 128, 128, 128, 128, 128}; 33 33 34 34 // Offsets for source bytes 0 to 10 35 static uvec8 kShuf01 = 36 { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; 35 static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; 37 36 38 37 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. 39 static uvec8 kShuf11 = 40 { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; 38 static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13}; 41 39 42 40 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 43 static uvec8 kShuf21 = 44 { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15};41 static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, 42 10, 11, 12, 13, 13, 14, 14, 15}; 45 43 46 44 // Coefficients for source bytes 0 to 10 47 static uvec8 kMadd01 = 48 { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; 45 static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; 49 46 50 47 // Coefficients for source bytes 10 to 21 51 static uvec8 kMadd11 = 52 { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; 48 static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; 53 49 54 50 // Coefficients for source bytes 21 to 31 55 static uvec8 kMadd21 = 56 { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; 51 static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; 57 52 58 53 // Coefficients for source bytes 21 to 31 59 static vec16 kRound34 = 60 { 2, 2, 2, 2, 2, 2, 2, 2 }; 61 62 static uvec8 kShuf38a = 63 { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; 64 65 static uvec8 kShuf38b = 66 { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; 54 static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; 55 56 static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, 57 128, 128, 128, 128, 128, 128, 128, 128}; 58 59 static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, 60 6, 8, 11, 14, 128, 128, 128, 128}; 67 61 68 62 // Arrange words 0,3,6 into 0,1,2 69 static uvec8 kShufAc = 70 { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};63 static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, 64 128, 128, 128, 128, 128, 128, 128, 128}; 71 65 72 66 // Arrange words 0,3,6 into 3,4,5 73 static uvec8 kShufAc3 = 74 { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128};67 static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, 68 6, 7, 12, 13, 128, 128, 128, 128}; 75 69 76 70 // Scaling values for boxes of 3x3 and 2x3 77 static uvec16 kScaleAc33 = 78 { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0};71 static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 72 65536 / 9, 65536 / 6, 0, 0}; 79 73 80 74 // Arrange first value for pixels 0,1,2,3,4,5 81 static 
uvec8 kShufAb0 = 82 { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128};75 static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, 76 11, 128, 14, 128, 128, 128, 128, 128}; 83 77 84 78 // Arrange second value for pixels 0,1,2,3,4,5 85 static uvec8 kShufAb1 = 86 { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128};79 static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, 80 12, 128, 15, 128, 128, 128, 128, 128}; 87 81 88 82 // Arrange third value for pixels 0,1,2,3,4,5 89 static uvec8 kShufAb2 = 90 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128};83 static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, 84 13, 128, 128, 128, 128, 128, 128, 128}; 91 85 92 86 // Scaling values for boxes of 3x2 and 2x2 93 static uvec16 kScaleAb2 = 94 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0};87 static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 88 65536 / 3, 65536 / 2, 0, 0}; 95 89 96 90 // Reads 32 pixels, throws half away and writes 16 pixels. 97 __declspec(naked) 98 void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 99 uint8* dst_ptr, int dst_width) { 100 __asm { 101 mov eax, [esp + 4] // src_ptr 102 // src_stride ignored 103 mov edx, [esp + 12] // dst_ptr 104 mov ecx, [esp + 16] // dst_width 91 __declspec(naked) void ScaleRowDown2_SSSE3(const uint8* src_ptr, 92 ptrdiff_t src_stride, 93 uint8* dst_ptr, 94 int dst_width) { 95 __asm { 96 mov eax, [esp + 4] // src_ptr 97 // src_stride ignored 98 mov edx, [esp + 12] // dst_ptr 99 mov ecx, [esp + 16] // dst_width 105 100 106 101 wloop: … … 108 103 movdqu xmm1, [eax + 16] 109 104 lea eax, [eax + 32] 110 psrlw xmm0, 8 105 psrlw xmm0, 8 // isolate odd pixels. 111 106 psrlw xmm1, 8 112 107 packuswb xmm0, xmm1 … … 121 116 122 117 // Blends 32x1 rectangle to 16x1. 123 __declspec(naked) 124 void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 125 uint8* dst_ptr, int dst_width) { 126 __asm { 127 mov eax, [esp + 4] // src_ptr 128 // src_stride 129 mov edx, [esp + 12] // dst_ptr 130 mov ecx, [esp + 16] // dst_width 131 132 pcmpeqb xmm4, xmm4 // constant 0x0101 118 __declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, 119 ptrdiff_t src_stride, 120 uint8* dst_ptr, 121 int dst_width) { 122 __asm { 123 mov eax, [esp + 4] // src_ptr 124 // src_stride 125 mov edx, [esp + 12] // dst_ptr 126 mov ecx, [esp + 16] // dst_width 127 128 pcmpeqb xmm4, xmm4 // constant 0x0101 133 129 psrlw xmm4, 15 134 130 packuswb xmm4, xmm4 135 pxor xmm5, xmm5 131 pxor xmm5, xmm5 // constant 0 136 132 137 133 wloop: … … 139 135 movdqu xmm1, [eax + 16] 140 136 lea eax, [eax + 32] 141 pmaddubsw xmm0, xmm4 137 pmaddubsw xmm0, xmm4 // horizontal add 142 138 pmaddubsw xmm1, xmm4 143 pavgw xmm0, xmm5 // (x + 1) / 2139 pavgw xmm0, xmm5 // (x + 1) / 2 144 140 pavgw xmm1, xmm5 145 141 packuswb xmm0, xmm1 … … 154 150 155 151 // Blends 32x2 rectangle to 16x1. 
156 __declspec(naked) 157 void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 158 uint8* dst_ptr, int dst_width) { 152 __declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, 153 ptrdiff_t src_stride, 154 uint8* dst_ptr, 155 int dst_width) { 159 156 __asm { 160 157 push esi 161 mov eax, [esp + 4 + 4] 162 mov esi, [esp + 4 + 8] 163 mov edx, [esp + 4 + 12] 164 mov ecx, [esp + 4 + 16] 165 166 pcmpeqb xmm4, xmm4 158 mov eax, [esp + 4 + 4] // src_ptr 159 mov esi, [esp + 4 + 8] // src_stride 160 mov edx, [esp + 4 + 12] // dst_ptr 161 mov ecx, [esp + 4 + 16] // dst_width 162 163 pcmpeqb xmm4, xmm4 // constant 0x0101 167 164 psrlw xmm4, 15 168 165 packuswb xmm4, xmm4 169 pxor xmm5, xmm5 166 pxor xmm5, xmm5 // constant 0 170 167 171 168 wloop: … … 175 172 movdqu xmm3, [eax + esi + 16] 176 173 lea eax, [eax + 32] 177 pmaddubsw xmm0, xmm4 174 pmaddubsw xmm0, xmm4 // horizontal add 178 175 pmaddubsw xmm1, xmm4 179 176 pmaddubsw xmm2, xmm4 180 177 pmaddubsw xmm3, xmm4 181 paddw xmm0, xmm2 178 paddw xmm0, xmm2 // vertical add 182 179 paddw xmm1, xmm3 183 180 psrlw xmm0, 1 184 181 psrlw xmm1, 1 185 pavgw xmm0, xmm5 182 pavgw xmm0, xmm5 // (x + 1) / 2 186 183 pavgw xmm1, xmm5 187 184 packuswb xmm0, xmm1 … … 198 195 #ifdef HAS_SCALEROWDOWN2_AVX2 199 196 // Reads 64 pixels, throws half away and writes 32 pixels. 200 __declspec(naked) 201 void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, 202 uint8* dst_ptr, int dst_width) { 203 __asm { 204 mov eax, [esp + 4] // src_ptr 205 // src_stride ignored 206 mov edx, [esp + 12] // dst_ptr 207 mov ecx, [esp + 16] // dst_width 197 __declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr, 198 ptrdiff_t src_stride, 199 uint8* dst_ptr, 200 int dst_width) { 201 __asm { 202 mov eax, [esp + 4] // src_ptr 203 // src_stride ignored 204 mov edx, [esp + 12] // dst_ptr 205 mov ecx, [esp + 16] // dst_width 208 206 209 207 wloop: … … 211 209 vmovdqu ymm1, [eax + 32] 212 210 lea eax, [eax + 64] 213 vpsrlw ymm0, ymm0, 8 211 vpsrlw ymm0, ymm0, 8 // isolate odd pixels. 214 212 vpsrlw ymm1, ymm1, 8 215 213 vpackuswb ymm0, ymm0, ymm1 216 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb214 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb 217 215 vmovdqu [edx], ymm0 218 216 lea edx, [edx + 32] … … 226 224 227 225 // Blends 64x1 rectangle to 32x1. 
228 __declspec(naked) 229 void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, 230 uint8* dst_ptr, int dst_width) { 231 __asm { 232 mov eax, [esp + 4] // src_ptr 233 // src_stride 234 mov edx, [esp + 12] // dst_ptr 235 mov ecx, [esp + 16] // dst_width 236 237 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b 226 __declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, 227 ptrdiff_t src_stride, 228 uint8* dst_ptr, 229 int dst_width) { 230 __asm { 231 mov eax, [esp + 4] // src_ptr 232 // src_stride 233 mov edx, [esp + 12] // dst_ptr 234 mov ecx, [esp + 16] // dst_width 235 236 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b 238 237 vpsrlw ymm4, ymm4, 15 239 238 vpackuswb ymm4, ymm4, ymm4 240 vpxor ymm5, ymm5, ymm5 239 vpxor ymm5, ymm5, ymm5 // constant 0 241 240 242 241 wloop: … … 244 243 vmovdqu ymm1, [eax + 32] 245 244 lea eax, [eax + 64] 246 vpmaddubsw ymm0, ymm0, ymm4 245 vpmaddubsw ymm0, ymm0, ymm4 // horizontal add 247 246 vpmaddubsw ymm1, ymm1, ymm4 248 vpavgw ymm0, ymm0, ymm5 247 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 249 248 vpavgw ymm1, ymm1, ymm5 250 249 vpackuswb ymm0, ymm0, ymm1 251 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb250 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb 252 251 vmovdqu [edx], ymm0 253 252 lea edx, [edx + 32] … … 263 262 // becomes average((sum >> 1), 0) 264 263 // Blends 64x2 rectangle to 32x1. 265 __declspec(naked) 266 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, 267 uint8* dst_ptr, int dst_width) { 264 __declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr, 265 ptrdiff_t src_stride, 266 uint8* dst_ptr, 267 int dst_width) { 268 268 __asm { 269 269 push esi 270 mov eax, [esp + 4 + 4] 271 mov esi, [esp + 4 + 8] 272 mov edx, [esp + 4 + 12] 273 mov ecx, [esp + 4 + 16] 274 275 vpcmpeqb ymm4, ymm4, ymm4 270 mov eax, [esp + 4 + 4] // src_ptr 271 mov esi, [esp + 4 + 8] // src_stride 272 mov edx, [esp + 4 + 12] // dst_ptr 273 mov ecx, [esp + 4 + 16] // dst_width 274 275 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b 276 276 vpsrlw ymm4, ymm4, 15 277 277 vpackuswb ymm4, ymm4, ymm4 278 vpxor ymm5, ymm5, ymm5 278 vpxor ymm5, ymm5, ymm5 // constant 0 279 279 280 280 wloop: … … 284 284 vmovdqu ymm3, [eax + esi + 32] 285 285 lea eax, [eax + 64] 286 vpmaddubsw ymm0, ymm0, ymm4 286 vpmaddubsw ymm0, ymm0, ymm4 // horizontal add 287 287 vpmaddubsw ymm1, ymm1, ymm4 288 288 vpmaddubsw ymm2, ymm2, ymm4 289 289 vpmaddubsw ymm3, ymm3, ymm4 290 vpaddw ymm0, ymm0, ymm2 290 vpaddw ymm0, ymm0, ymm2 // vertical add 291 291 vpaddw ymm1, ymm1, ymm3 292 vpsrlw ymm0, ymm0, 1 292 vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 293 293 vpsrlw ymm1, ymm1, 1 294 vpavgw ymm0, ymm0, ymm5 294 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 295 295 vpavgw ymm1, ymm1, ymm5 296 296 vpackuswb ymm0, ymm0, ymm1 297 vpermq ymm0, ymm0, 0xd8 297 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb 298 298 vmovdqu [edx], ymm0 299 299 lea edx, [edx + 32] … … 309 309 310 310 // Point samples 32 pixels to 8 pixels. 
311 __declspec(naked) 312 void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 313 uint8* dst_ptr, int dst_width) { 314 __asm { 315 mov eax, [esp + 4] // src_ptr 316 // src_stride ignored 317 mov edx, [esp + 12] // dst_ptr 318 mov ecx, [esp + 16] // dst_width 319 pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 311 __declspec(naked) void ScaleRowDown4_SSSE3(const uint8* src_ptr, 312 ptrdiff_t src_stride, 313 uint8* dst_ptr, 314 int dst_width) { 315 __asm { 316 mov eax, [esp + 4] // src_ptr 317 // src_stride ignored 318 mov edx, [esp + 12] // dst_ptr 319 mov ecx, [esp + 16] // dst_width 320 pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 320 321 psrld xmm5, 24 321 322 pslld xmm5, 16 … … 340 341 341 342 // Blends 32x4 rectangle to 8x1. 342 __declspec(naked) 343 void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 344 uint8* dst_ptr, int dst_width) { 343 __declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, 344 ptrdiff_t src_stride, 345 uint8* dst_ptr, 346 int dst_width) { 345 347 __asm { 346 348 push esi 347 349 push edi 348 mov eax, [esp + 8 + 4] 349 mov esi, [esp + 8 + 8] 350 mov edx, [esp + 8 + 12] 351 mov ecx, [esp + 8 + 16] 350 mov eax, [esp + 8 + 4] // src_ptr 351 mov esi, [esp + 8 + 8] // src_stride 352 mov edx, [esp + 8 + 12] // dst_ptr 353 mov ecx, [esp + 8 + 16] // dst_width 352 354 lea edi, [esi + esi * 2] // src_stride * 3 353 pcmpeqb xmm4, xmm4 355 pcmpeqb xmm4, xmm4 // constant 0x0101 354 356 psrlw xmm4, 15 355 357 movdqa xmm5, xmm4 356 358 packuswb xmm4, xmm4 357 psllw xmm5, 3 358 359 wloop: 360 movdqu xmm0, [eax] 359 psllw xmm5, 3 // constant 0x0008 360 361 wloop: 362 movdqu xmm0, [eax] // average rows 361 363 movdqu xmm1, [eax + 16] 362 364 movdqu xmm2, [eax + esi] 363 365 movdqu xmm3, [eax + esi + 16] 364 pmaddubsw xmm0, xmm4 366 pmaddubsw xmm0, xmm4 // horizontal add 365 367 pmaddubsw xmm1, xmm4 366 368 pmaddubsw xmm2, xmm4 367 369 pmaddubsw xmm3, xmm4 368 paddw xmm0, xmm2 370 paddw xmm0, xmm2 // vertical add rows 0, 1 369 371 paddw xmm1, xmm3 370 372 movdqu xmm2, [eax + esi * 2] … … 372 374 pmaddubsw xmm2, xmm4 373 375 pmaddubsw xmm3, xmm4 374 paddw xmm0, xmm2 376 paddw xmm0, xmm2 // add row 2 375 377 paddw xmm1, xmm3 376 378 movdqu xmm2, [eax + edi] … … 379 381 pmaddubsw xmm2, xmm4 380 382 pmaddubsw xmm3, xmm4 381 paddw xmm0, xmm2 383 paddw xmm0, xmm2 // add row 3 382 384 paddw xmm1, xmm3 383 385 phaddw xmm0, xmm1 384 paddw xmm0, xmm5 385 psrlw xmm0, 4 386 paddw xmm0, xmm5 // + 8 for round 387 psrlw xmm0, 4 // /16 for average of 4 * 4 386 388 packuswb xmm0, xmm0 387 389 movq qword ptr [edx], xmm0 … … 398 400 #ifdef HAS_SCALEROWDOWN4_AVX2 399 401 // Point samples 64 pixels to 16 pixels. 
400 __declspec(naked) 401 void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, 402 uint8* dst_ptr, int dst_width) { 403 __asm { 404 mov eax, [esp + 4] // src_ptr 405 // src_stride ignored 406 mov edx, [esp + 12] // dst_ptr 407 mov ecx, [esp + 16] // dst_width 408 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 402 __declspec(naked) void ScaleRowDown4_AVX2(const uint8* src_ptr, 403 ptrdiff_t src_stride, 404 uint8* dst_ptr, 405 int dst_width) { 406 __asm { 407 mov eax, [esp + 4] // src_ptr 408 // src_stride ignored 409 mov edx, [esp + 12] // dst_ptr 410 mov ecx, [esp + 16] // dst_width 411 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 409 412 vpsrld ymm5, ymm5, 24 410 413 vpslld ymm5, ymm5, 16 … … 417 420 vpand ymm1, ymm1, ymm5 418 421 vpackuswb ymm0, ymm0, ymm1 419 vpermq ymm0, ymm0, 0xd8 422 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb 420 423 vpsrlw ymm0, ymm0, 8 421 424 vpackuswb ymm0, ymm0, ymm0 422 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb425 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb 423 426 vmovdqu [edx], xmm0 424 427 lea edx, [edx + 16] … … 432 435 433 436 // Blends 64x4 rectangle to 16x1. 434 __declspec(naked) 435 void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, 436 uint8* dst_ptr, int dst_width) { 437 __declspec(naked) void ScaleRowDown4Box_AVX2(const uint8* src_ptr, 438 ptrdiff_t src_stride, 439 uint8* dst_ptr, 440 int dst_width) { 437 441 __asm { 438 442 push esi 439 443 push edi 440 mov eax, [esp + 8 + 4] 441 mov esi, [esp + 8 + 8] 442 mov edx, [esp + 8 + 12] 443 mov ecx, [esp + 8 + 16] 444 mov eax, [esp + 8 + 4] // src_ptr 445 mov esi, [esp + 8 + 8] // src_stride 446 mov edx, [esp + 8 + 12] // dst_ptr 447 mov ecx, [esp + 8 + 16] // dst_width 444 448 lea edi, [esi + esi * 2] // src_stride * 3 445 vpcmpeqb ymm4, ymm4, ymm4 449 vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 446 450 vpsrlw ymm4, ymm4, 15 447 vpsllw ymm5, ymm4, 3 451 vpsllw ymm5, ymm4, 3 // constant 0x0008 448 452 vpackuswb ymm4, ymm4, ymm4 449 453 450 454 wloop: 451 vmovdqu ymm0, [eax] 455 vmovdqu ymm0, [eax] // average rows 452 456 vmovdqu ymm1, [eax + 32] 453 457 vmovdqu ymm2, [eax + esi] 454 458 vmovdqu ymm3, [eax + esi + 32] 455 vpmaddubsw ymm0, ymm0, ymm4 459 vpmaddubsw ymm0, ymm0, ymm4 // horizontal add 456 460 vpmaddubsw ymm1, ymm1, ymm4 457 461 vpmaddubsw ymm2, ymm2, ymm4 458 462 vpmaddubsw ymm3, ymm3, ymm4 459 vpaddw ymm0, ymm0, ymm2 463 vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 460 464 vpaddw ymm1, ymm1, ymm3 461 465 vmovdqu ymm2, [eax + esi * 2] … … 463 467 vpmaddubsw ymm2, ymm2, ymm4 464 468 vpmaddubsw ymm3, ymm3, ymm4 465 vpaddw ymm0, ymm0, ymm2 469 vpaddw ymm0, ymm0, ymm2 // add row 2 466 470 vpaddw ymm1, ymm1, ymm3 467 471 vmovdqu ymm2, [eax + edi] … … 470 474 vpmaddubsw ymm2, ymm2, ymm4 471 475 vpmaddubsw ymm3, ymm3, ymm4 472 vpaddw ymm0, ymm0, ymm2 476 vpaddw ymm0, ymm0, ymm2 // add row 3 473 477 vpaddw ymm1, ymm1, ymm3 474 vphaddw ymm0, ymm0, ymm1 475 vpermq ymm0, ymm0, 0xd8 476 vpaddw ymm0, ymm0, ymm5 477 vpsrlw ymm0, ymm0, 4 478 vphaddw ymm0, ymm0, ymm1 // mutates 479 vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw 480 vpaddw ymm0, ymm0, ymm5 // + 8 for round 481 vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 478 482 vpackuswb ymm0, ymm0, ymm0 479 vpermq ymm0, ymm0, 0xd8 483 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb 480 484 vmovdqu [edx], xmm0 481 485 lea edx, [edx + 16] … … 495 499 // Then shuffled to do the scaling. 
496 500 497 __declspec(naked) 498 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 499 uint8* dst_ptr, int dst_width) { 500 __asm { 501 mov eax, [esp + 4] // src_ptr 502 // src_stride ignored 503 mov edx, [esp + 12] // dst_ptr 504 mov ecx, [esp + 16] // dst_width 501 __declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr, 502 ptrdiff_t src_stride, 503 uint8* dst_ptr, 504 int dst_width) { 505 __asm { 506 mov eax, [esp + 4] // src_ptr 507 // src_stride ignored 508 mov edx, [esp + 12] // dst_ptr 509 mov ecx, [esp + 16] // dst_width 505 510 movdqa xmm3, xmmword ptr kShuf0 506 511 movdqa xmm4, xmmword ptr kShuf1 … … 542 547 543 548 // Note that movdqa+palign may be better than movdqu. 544 __declspec(naked) 545 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,546 ptrdiff_t src_stride,547 uint8* dst_ptr,int dst_width) {549 __declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, 550 ptrdiff_t src_stride, 551 uint8* dst_ptr, 552 int dst_width) { 548 553 __asm { 549 554 push esi 550 mov eax, [esp + 4 + 4] 551 mov esi, [esp + 4 + 8] 552 mov edx, [esp + 4 + 12] 553 mov ecx, [esp + 4 + 16] 555 mov eax, [esp + 4 + 4] // src_ptr 556 mov esi, [esp + 4 + 8] // src_stride 557 mov edx, [esp + 4 + 12] // dst_ptr 558 mov ecx, [esp + 4 + 16] // dst_width 554 559 movdqa xmm2, xmmword ptr kShuf01 555 560 movdqa xmm3, xmmword ptr kShuf11 … … 560 565 561 566 wloop: 562 movdqu xmm0, [eax] 567 movdqu xmm0, [eax] // pixels 0..7 563 568 movdqu xmm1, [eax + esi] 564 569 pavgb xmm0, xmm1 … … 569 574 packuswb xmm0, xmm0 570 575 movq qword ptr [edx], xmm0 571 movdqu xmm0, [eax + 8] 576 movdqu xmm0, [eax + 8] // pixels 8..15 572 577 movdqu xmm1, [eax + esi + 8] 573 578 pavgb xmm0, xmm1 … … 578 583 packuswb xmm0, xmm0 579 584 movq qword ptr [edx + 8], xmm0 580 movdqu xmm0, [eax + 16] 585 movdqu xmm0, [eax + 16] // pixels 16..23 581 586 movdqu xmm1, [eax + esi + 16] 582 587 lea eax, [eax + 32] … … 599 604 600 605 // Note that movdqa+palign may be better than movdqu. 
601 __declspec(naked) 602 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,603 ptrdiff_t src_stride,604 uint8* dst_ptr,int dst_width) {606 __declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, 607 ptrdiff_t src_stride, 608 uint8* dst_ptr, 609 int dst_width) { 605 610 __asm { 606 611 push esi 607 mov eax, [esp + 4 + 4] 608 mov esi, [esp + 4 + 8] 609 mov edx, [esp + 4 + 12] 610 mov ecx, [esp + 4 + 16] 612 mov eax, [esp + 4 + 4] // src_ptr 613 mov esi, [esp + 4 + 8] // src_stride 614 mov edx, [esp + 4 + 12] // dst_ptr 615 mov ecx, [esp + 4 + 16] // dst_width 611 616 movdqa xmm2, xmmword ptr kShuf01 612 617 movdqa xmm3, xmmword ptr kShuf11 … … 617 622 618 623 wloop: 619 movdqu xmm0, [eax] 624 movdqu xmm0, [eax] // pixels 0..7 620 625 movdqu xmm1, [eax + esi] 621 626 pavgb xmm1, xmm0 … … 627 632 packuswb xmm0, xmm0 628 633 movq qword ptr [edx], xmm0 629 movdqu xmm0, [eax + 8] 634 movdqu xmm0, [eax + 8] // pixels 8..15 630 635 movdqu xmm1, [eax + esi + 8] 631 636 pavgb xmm1, xmm0 … … 637 642 packuswb xmm0, xmm0 638 643 movq qword ptr [edx + 8], xmm0 639 movdqu xmm0, [eax + 16] 644 movdqu xmm0, [eax + 16] // pixels 16..23 640 645 movdqu xmm1, [eax + esi + 16] 641 646 lea eax, [eax + 32] … … 661 666 662 667 // Scale 32 pixels to 12 663 __declspec(naked) 664 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 665 uint8* dst_ptr, int dst_width) { 666 __asm { 667 mov eax, [esp + 4] // src_ptr 668 // src_stride ignored 669 mov edx, [esp + 12] // dst_ptr 670 mov ecx, [esp + 16] // dst_width 668 __declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr, 669 ptrdiff_t src_stride, 670 uint8* dst_ptr, 671 int dst_width) { 672 __asm { 673 mov eax, [esp + 4] // src_ptr 674 // src_stride ignored 675 mov edx, [esp + 12] // dst_ptr 676 mov ecx, [esp + 16] // dst_width 671 677 movdqa xmm4, xmmword ptr kShuf38a 672 678 movdqa xmm5, xmmword ptr kShuf38b 673 679 674 680 xloop: 675 movdqu xmm0, [eax] 676 movdqu xmm1, [eax + 16] 681 movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 682 movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 677 683 lea eax, [eax + 32] 678 684 pshufb xmm0, xmm4 … … 680 686 paddusb xmm0, xmm1 681 687 682 movq qword ptr [edx], xmm0 // write 12 pixels688 movq qword ptr [edx], xmm0 // write 12 pixels 683 689 movhlps xmm1, xmm0 684 690 movd [edx + 8], xmm1 … … 692 698 693 699 // Scale 16x3 pixels to 6x1 with interpolation 694 __declspec(naked) 695 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,696 ptrdiff_t src_stride,697 uint8* dst_ptr,int dst_width) {700 __declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, 701 ptrdiff_t src_stride, 702 uint8* dst_ptr, 703 int dst_width) { 698 704 __asm { 699 705 push esi 700 mov eax, [esp + 4 + 4] 701 mov esi, [esp + 4 + 8] 702 mov edx, [esp + 4 + 12] 703 mov ecx, [esp + 4 + 16] 706 mov eax, [esp + 4 + 4] // src_ptr 707 mov esi, [esp + 4 + 8] // src_stride 708 mov edx, [esp + 4 + 12] // dst_ptr 709 mov ecx, [esp + 4 + 16] // dst_width 704 710 movdqa xmm2, xmmword ptr kShufAc 705 711 movdqa xmm3, xmmword ptr kShufAc3 … … 708 714 709 715 xloop: 710 movdqu xmm0, [eax] 716 movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 711 717 movdqu xmm6, [eax + esi] 712 718 movhlps xmm1, xmm0 … … 726 732 paddusw xmm1, xmm7 727 733 728 movdqa xmm6, xmm0 734 movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 729 735 psrldq xmm0, 2 730 736 paddusw xmm6, xmm0 … … 733 739 pshufb xmm6, xmm2 734 740 735 movdqa xmm7, xmm1 741 movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 736 742 psrldq xmm1, 2 737 743 paddusw xmm7, 
xmm1 … … 741 747 paddusw xmm6, xmm7 742 748 743 pmulhuw xmm6, xmm4 749 pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 744 750 packuswb xmm6, xmm6 745 751 746 movd [edx], xmm6 752 movd [edx], xmm6 // write 6 pixels 747 753 psrlq xmm6, 16 748 754 movd [edx + 2], xmm6 … … 757 763 758 764 // Scale 16x2 pixels to 6x1 with interpolation 759 __declspec(naked) 760 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,761 ptrdiff_t src_stride,762 uint8* dst_ptr,int dst_width) {765 __declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, 766 ptrdiff_t src_stride, 767 uint8* dst_ptr, 768 int dst_width) { 763 769 __asm { 764 770 push esi 765 mov eax, [esp + 4 + 4] 766 mov esi, [esp + 4 + 8] 767 mov edx, [esp + 4 + 12] 768 mov ecx, [esp + 4 + 16] 771 mov eax, [esp + 4 + 4] // src_ptr 772 mov esi, [esp + 4 + 8] // src_stride 773 mov edx, [esp + 4 + 12] // dst_ptr 774 mov ecx, [esp + 4 + 16] // dst_width 769 775 movdqa xmm2, xmmword ptr kShufAb0 770 776 movdqa xmm3, xmmword ptr kShufAb1 … … 773 779 774 780 xloop: 775 movdqu xmm0, [eax] 781 movdqu xmm0, [eax] // average 2 rows into xmm0 776 782 movdqu xmm1, [eax + esi] 777 783 lea eax, [eax + 16] 778 784 pavgb xmm0, xmm1 779 785 780 movdqa xmm1, xmm0 786 movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 781 787 pshufb xmm1, xmm2 782 788 movdqa xmm6, xmm0 … … 786 792 paddusw xmm1, xmm0 787 793 788 pmulhuw xmm1, xmm5 794 pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 789 795 packuswb xmm1, xmm1 790 796 791 movd [edx], xmm1 797 movd [edx], xmm1 // write 6 pixels 792 798 psrlq xmm1, 16 793 799 movd [edx + 2], xmm1 … … 802 808 803 809 // Reads 16 bytes and accumulates to 16 shorts at a time. 804 __declspec(naked) 805 void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { 806 __asm { 807 mov eax, [esp + 4] // src_ptr 808 mov edx, [esp + 8] // dst_ptr 810 __declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr, 811 uint16* dst_ptr, 812 int src_width) { 813 __asm { 814 mov eax, [esp + 4] // src_ptr 815 mov edx, [esp + 8] // dst_ptr 809 816 mov ecx, [esp + 12] // src_width 810 817 pxor xmm5, xmm5 811 818 812 // sum rows819 // sum rows 813 820 xloop: 814 movdqu xmm3, [eax] 821 movdqu xmm3, [eax] // read 16 bytes 815 822 lea eax, [eax + 16] 816 movdqu xmm0, [edx] 823 movdqu xmm0, [edx] // read 16 words from destination 817 824 movdqu xmm1, [edx + 16] 818 825 movdqa xmm2, xmm3 819 826 punpcklbw xmm2, xmm5 820 827 punpckhbw xmm3, xmm5 821 paddusw xmm0, xmm2 828 paddusw xmm0, xmm2 // sum 16 words 822 829 paddusw xmm1, xmm3 823 movdqu [edx], xmm0 830 movdqu [edx], xmm0 // write 16 words to destination 824 831 movdqu [edx + 16], xmm1 825 832 lea edx, [edx + 32] … … 832 839 #ifdef HAS_SCALEADDROW_AVX2 833 840 // Reads 32 bytes and accumulates to 32 shorts at a time. 
834 __declspec(naked) 835 void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { 836 __asm { 837 mov eax, [esp + 4] // src_ptr 838 mov edx, [esp + 8] // dst_ptr 841 __declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr, 842 uint16* dst_ptr, 843 int src_width) { 844 __asm { 845 mov eax, [esp + 4] // src_ptr 846 mov edx, [esp + 8] // dst_ptr 839 847 mov ecx, [esp + 12] // src_width 840 848 vpxor ymm5, ymm5, ymm5 841 849 842 // sum rows850 // sum rows 843 851 xloop: 844 vmovdqu ymm3, [eax] 852 vmovdqu ymm3, [eax] // read 32 bytes 845 853 lea eax, [eax + 32] 846 854 vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck 847 855 vpunpcklbw ymm2, ymm3, ymm5 848 856 vpunpckhbw ymm3, ymm3, ymm5 849 vpaddusw ymm0, ymm2, [edx] // sum 16 words857 vpaddusw ymm0, ymm2, [edx] // sum 16 words 850 858 vpaddusw ymm1, ymm3, [edx + 32] 851 vmovdqu [edx], ymm0 859 vmovdqu [edx], ymm0 // write 32 words to destination 852 860 vmovdqu [edx + 32], ymm1 853 861 lea edx, [edx + 64] … … 861 869 #endif // HAS_SCALEADDROW_AVX2 862 870 871 // Constant for making pixels signed to avoid pmaddubsw 872 // saturation. 873 static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 874 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; 875 876 // Constant for making pixels unsigned and adding .5 for rounding. 877 static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, 878 0x4040, 0x4040, 0x4040, 0x4040}; 879 863 880 // Bilinear column filtering. SSSE3 version. 864 __declspec(naked) 865 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 866 int dst_width, int x, int dx) { 881 __declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr, 882 const uint8* src_ptr, 883 int dst_width, 884 int x, 885 int dx) { 867 886 __asm { 868 887 push ebx 869 888 push esi 870 889 push edi 871 mov edi, [esp + 12 + 4] 872 mov esi, [esp + 12 + 8] 873 mov ecx, [esp + 12 + 12] 890 mov edi, [esp + 12 + 4] // dst_ptr 891 mov esi, [esp + 12 + 8] // src_ptr 892 mov ecx, [esp + 12 + 12] // dst_width 874 893 movd xmm2, [esp + 12 + 16] // x 875 894 movd xmm3, [esp + 12 + 20] // dx 876 mov eax, 0x04040000 895 mov eax, 0x04040000 // shuffle to line up fractions with pixel. 877 896 movd xmm5, eax 878 pcmpeqb xmm6, xmm6 897 pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. 879 898 psrlw xmm6, 9 880 pextrw eax, xmm2, 1 // get x0 integer. preroll 899 pcmpeqb xmm7, xmm7 // generate 0x0001 900 psrlw xmm7, 15 901 pextrw eax, xmm2, 1 // get x0 integer. preroll 881 902 sub ecx, 2 882 903 jl xloop29 883 904 884 movdqa xmm0, xmm2 905 movdqa xmm0, xmm2 // x1 = x0 + dx 885 906 paddd xmm0, xmm3 886 punpckldq xmm2, xmm0 887 punpckldq xmm3, xmm3 888 paddd xmm3, xmm3 889 pextrw edx, xmm2, 3 907 punpckldq xmm2, xmm0 // x0 x1 908 punpckldq xmm3, xmm3 // dx dx 909 paddd xmm3, xmm3 // dx * 2, dx * 2 910 pextrw edx, xmm2, 3 // get x1 integer. preroll 890 911 891 912 // 2 Pixel loop. 892 913 xloop2: 893 movdqa xmm1, xmm2 894 paddd xmm2, xmm3 914 movdqa xmm1, xmm2 // x0, x1 fractions. 915 paddd xmm2, xmm3 // x += dx 895 916 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels 896 917 movd xmm0, ebx 897 psrlw xmm1, 9 918 psrlw xmm1, 9 // 7 bit fractions. 898 919 movzx ebx, word ptr [esi + edx] // 2 source x1 pixels 899 920 movd xmm4, ebx 900 pshufb xmm1, xmm5 921 pshufb xmm1, xmm5 // 0011 901 922 punpcklwd xmm0, xmm4 902 pxor xmm1, xmm6 // 0..7f and 7f..0 903 pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. 904 pextrw eax, xmm2, 1 // get x0 integer. next iteration. 905 pextrw edx, xmm2, 3 // get x1 integer. next iteration. 
906 psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. 907 packuswb xmm0, xmm0 // 8 bits, 2 pixels. 908 movd ebx, xmm0 923 psubb xmm0, xmmword ptr kFsub80 // make pixels signed. 924 pxor xmm1, xmm6 // 0..7f and 7f..0 925 paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 926 pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. 927 pextrw eax, xmm2, 1 // get x0 integer. next iteration. 928 pextrw edx, xmm2, 3 // get x1 integer. next iteration. 929 paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. 930 psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. 931 packuswb xmm1, xmm1 // 8 bits, 2 pixels. 932 movd ebx, xmm1 909 933 mov [edi], bx 910 934 lea edi, [edi + 2] 911 sub ecx, 2 935 sub ecx, 2 // 2 pixels 912 936 jge xloop2 913 937 914 938 xloop29: 915 916 939 add ecx, 2 - 1 917 940 jl xloop99 918 941 919 // 1 pixel remainder942 // 1 pixel remainder 920 943 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels 921 944 movd xmm0, ebx 922 psrlw xmm2, 9 // 7 bit fractions. 923 pshufb xmm2, xmm5 // 0011 924 pxor xmm2, xmm6 // 0..7f and 7f..0 925 pmaddubsw xmm0, xmm2 // 16 bit 926 psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. 927 packuswb xmm0, xmm0 // 8 bits 928 movd ebx, xmm0 945 psrlw xmm2, 9 // 7 bit fractions. 946 pshufb xmm2, xmm5 // 0011 947 psubb xmm0, xmmword ptr kFsub80 // make pixels signed. 948 pxor xmm2, xmm6 // 0..7f and 7f..0 949 paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 950 pmaddubsw xmm2, xmm0 // 16 bit 951 paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. 952 psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. 953 packuswb xmm2, xmm2 // 8 bits 954 movd ebx, xmm2 929 955 mov [edi], bl 930 956 … … 939 965 940 966 // Reads 16 pixels, duplicates them and writes 32 pixels. 941 __declspec(naked) 942 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, 943 int dst_width, int x, int dx) { 944 __asm { 945 mov edx, [esp + 4] // dst_ptr 946 mov eax, [esp + 8] // src_ptr 947 mov ecx, [esp + 12] // dst_width 967 __declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr, 968 const uint8* src_ptr, 969 int dst_width, 970 int x, 971 int dx) { 972 __asm { 973 mov edx, [esp + 4] // dst_ptr 974 mov eax, [esp + 8] // src_ptr 975 mov ecx, [esp + 12] // dst_width 948 976 949 977 wloop: … … 964 992 965 993 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) 966 __declspec(naked) 967 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,968 ptrdiff_t src_stride,969 uint8* dst_argb,int dst_width) {970 __asm { 971 mov eax, [esp + 4] 972 973 mov edx, [esp + 12] 974 mov ecx, [esp + 16] 994 __declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb, 995 ptrdiff_t src_stride, 996 uint8* dst_argb, 997 int dst_width) { 998 __asm { 999 mov eax, [esp + 4] // src_argb 1000 // src_stride ignored 1001 mov edx, [esp + 12] // dst_argb 1002 mov ecx, [esp + 16] // dst_width 975 1003 976 1004 wloop: … … 989 1017 990 1018 // Blends 8x1 rectangle to 4x1. 
991 __declspec(naked) 992 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,993 ptrdiff_t src_stride,994 uint8* dst_argb,int dst_width) {995 __asm { 996 mov eax, [esp + 4] 997 998 mov edx, [esp + 12] 999 mov ecx, [esp + 16] 1019 __declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, 1020 ptrdiff_t src_stride, 1021 uint8* dst_argb, 1022 int dst_width) { 1023 __asm { 1024 mov eax, [esp + 4] // src_argb 1025 // src_stride ignored 1026 mov edx, [esp + 12] // dst_argb 1027 mov ecx, [esp + 16] // dst_width 1000 1028 1001 1029 wloop: … … 1004 1032 lea eax, [eax + 32] 1005 1033 movdqa xmm2, xmm0 1006 shufps xmm0, xmm1, 0x88 1007 shufps xmm2, xmm1, 0xdd // odd pixels1034 shufps xmm0, xmm1, 0x88 // even pixels 1035 shufps xmm2, xmm1, 0xdd // odd pixels 1008 1036 pavgb xmm0, xmm2 1009 1037 movdqu [edx], xmm0 … … 1017 1045 1018 1046 // Blends 8x2 rectangle to 4x1. 1019 __declspec(naked) 1020 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,1021 ptrdiff_t src_stride,1022 uint8* dst_argb,int dst_width) {1047 __declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, 1048 ptrdiff_t src_stride, 1049 uint8* dst_argb, 1050 int dst_width) { 1023 1051 __asm { 1024 1052 push esi 1025 mov eax, [esp + 4 + 4] 1026 mov esi, [esp + 4 + 8] 1027 mov edx, [esp + 4 + 12] 1028 mov ecx, [esp + 4 + 16] 1053 mov eax, [esp + 4 + 4] // src_argb 1054 mov esi, [esp + 4 + 8] // src_stride 1055 mov edx, [esp + 4 + 12] // dst_argb 1056 mov ecx, [esp + 4 + 16] // dst_width 1029 1057 1030 1058 wloop: … … 1034 1062 movdqu xmm3, [eax + esi + 16] 1035 1063 lea eax, [eax + 32] 1036 pavgb xmm0, xmm2 1064 pavgb xmm0, xmm2 // average rows 1037 1065 pavgb xmm1, xmm3 1038 movdqa xmm2, xmm0 1039 shufps xmm0, xmm1, 0x88 1040 shufps xmm2, xmm1, 0xdd 1066 movdqa xmm2, xmm0 // average columns (8 to 4 pixels) 1067 shufps xmm0, xmm1, 0x88 // even pixels 1068 shufps xmm2, xmm1, 0xdd // odd pixels 1041 1069 pavgb xmm0, xmm2 1042 1070 movdqu [edx], xmm0 … … 1051 1079 1052 1080 // Reads 4 pixels at a time. 1053 __declspec(naked) 1054 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, 1055 int src_stepx, 1056 uint8* dst_argb, int dst_width) { 1081 __declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, 1082 ptrdiff_t src_stride, 1083 int src_stepx, 1084 uint8* dst_argb, 1085 int dst_width) { 1057 1086 __asm { 1058 1087 push ebx 1059 1088 push edi 1060 mov eax, [esp + 8 + 4] 1061 1062 mov ebx, [esp + 8 + 12] 1063 mov edx, [esp + 8 + 16] 1064 mov ecx, [esp + 8 + 20] 1089 mov eax, [esp + 8 + 4] // src_argb 1090 // src_stride ignored 1091 mov ebx, [esp + 8 + 12] // src_stepx 1092 mov edx, [esp + 8 + 16] // dst_argb 1093 mov ecx, [esp + 8 + 20] // dst_width 1065 1094 lea ebx, [ebx * 4] 1066 1095 lea edi, [ebx + ebx * 2] … … 1087 1116 1088 1117 // Blends four 2x2 to 4x1. 
1089 __declspec(naked) 1090 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,1091 ptrdiff_t src_stride,1092 int src_stepx,1093 uint8* dst_argb,int dst_width) {1118 __declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, 1119 ptrdiff_t src_stride, 1120 int src_stepx, 1121 uint8* dst_argb, 1122 int dst_width) { 1094 1123 __asm { 1095 1124 push ebx 1096 1125 push esi 1097 1126 push edi 1098 mov eax, [esp + 12 + 4] 1099 mov esi, [esp + 12 + 8] 1100 mov ebx, [esp + 12 + 12] 1101 mov edx, [esp + 12 + 16] 1102 mov ecx, [esp + 12 + 20] 1103 lea esi, [eax + esi] 1127 mov eax, [esp + 12 + 4] // src_argb 1128 mov esi, [esp + 12 + 8] // src_stride 1129 mov ebx, [esp + 12 + 12] // src_stepx 1130 mov edx, [esp + 12 + 16] // dst_argb 1131 mov ecx, [esp + 12 + 20] // dst_width 1132 lea esi, [eax + esi] // row1 pointer 1104 1133 lea ebx, [ebx * 4] 1105 1134 lea edi, [ebx + ebx * 2] … … 1116 1145 movhps xmm3, qword ptr [esi + edi] 1117 1146 lea esi, [esi + ebx * 4] 1118 pavgb xmm0, xmm2 1147 pavgb xmm0, xmm2 // average rows 1119 1148 pavgb xmm1, xmm3 1120 movdqa xmm2, xmm0 1121 shufps xmm0, xmm1, 0x88 1122 shufps xmm2, xmm1, 0xdd 1149 movdqa xmm2, xmm0 // average columns (8 to 4 pixels) 1150 shufps xmm0, xmm1, 0x88 // even pixels 1151 shufps xmm2, xmm1, 0xdd // odd pixels 1123 1152 pavgb xmm0, xmm2 1124 1153 movdqu [edx], xmm0 … … 1135 1164 1136 1165 // Column scaling unfiltered. SSE2 version. 1137 __declspec(naked) 1138 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, 1139 int dst_width, int x, int dx) { 1166 __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb, 1167 const uint8* src_argb, 1168 int dst_width, 1169 int x, 1170 int dx) { 1140 1171 __asm { 1141 1172 push edi 1142 1173 push esi 1143 mov edi, [esp + 8 + 4] 1144 mov esi, [esp + 8 + 8] 1145 mov ecx, [esp + 8 + 12] 1174 mov edi, [esp + 8 + 4] // dst_argb 1175 mov esi, [esp + 8 + 8] // src_argb 1176 mov ecx, [esp + 8 + 12] // dst_width 1146 1177 movd xmm2, [esp + 8 + 16] // x 1147 1178 movd xmm3, [esp + 8 + 20] // dx 1148 1179 1149 pshufd xmm2, xmm2, 0 1150 pshufd xmm0, xmm3, 0x11 1180 pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 1181 pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 1151 1182 paddd xmm2, xmm0 1152 paddd xmm3, xmm3 1153 pshufd xmm0, xmm3, 0x05 1154 paddd xmm2, xmm0 1155 paddd xmm3, xmm3 1156 pshufd xmm3, xmm3, 0 1157 1158 pextrw eax, xmm2, 1 1159 pextrw edx, xmm2, 3 1183 paddd xmm3, xmm3 // 0, 0, 0, dx * 2 1184 pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 1185 paddd xmm2, xmm0 // x3 x2 x1 x0 1186 paddd xmm3, xmm3 // 0, 0, 0, dx * 4 1187 pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 1188 1189 pextrw eax, xmm2, 1 // get x0 integer. 1190 pextrw edx, xmm2, 3 // get x1 integer. 1160 1191 1161 1192 cmp ecx, 0 … … 1168 1199 movd xmm0, [esi + eax * 4] // 1 source x0 pixels 1169 1200 movd xmm1, [esi + edx * 4] // 1 source x1 pixels 1170 pextrw eax, xmm2, 5 1171 pextrw edx, xmm2, 7 1172 paddd xmm2, xmm3 1173 punpckldq xmm0, xmm1 1201 pextrw eax, xmm2, 5 // get x2 integer. 1202 pextrw edx, xmm2, 7 // get x3 integer. 1203 paddd xmm2, xmm3 // x += dx 1204 punpckldq xmm0, xmm1 // x0 x1 1174 1205 1175 1206 movd xmm1, [esi + eax * 4] // 1 source x2 pixels 1176 1207 movd xmm4, [esi + edx * 4] // 1 source x3 pixels 1177 pextrw eax, xmm2, 1 1178 pextrw edx, xmm2, 3 1179 punpckldq xmm1, xmm4 1180 punpcklqdq xmm0, xmm1 1208 pextrw eax, xmm2, 1 // get x0 integer. next iteration. 1209 pextrw edx, xmm2, 3 // get x1 integer. next iteration. 
1210 punpckldq xmm1, xmm4 // x2 x3 1211 punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 1181 1212 movdqu [edi], xmm0 1182 1213 lea edi, [edi + 16] 1183 sub ecx, 4 1214 sub ecx, 4 // 4 pixels 1184 1215 jge xloop4 1185 1216 … … 1191 1222 movd xmm0, [esi + eax * 4] // 1 source x0 pixels 1192 1223 movd xmm1, [esi + edx * 4] // 1 source x1 pixels 1193 pextrw eax, xmm2, 5 1194 punpckldq xmm0, xmm1 1224 pextrw eax, xmm2, 5 // get x2 integer. 1225 punpckldq xmm0, xmm1 // x0 x1 1195 1226 1196 1227 movq qword ptr [edi], xmm0 … … 1217 1248 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw 1218 1249 static uvec8 kShuffleColARGB = { 1219 0u, 4u, 1u, 5u, 2u, 6u, 3u,7u, // bbggrraa 1st pixel1220 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel1250 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel 1251 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel 1221 1252 }; 1222 1253 1223 1254 // Shuffle table for duplicating 2 fractions into 8 bytes each 1224 1255 static uvec8 kShuffleFractions = { 1225 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,1256 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 1226 1257 }; 1227 1258 1228 __declspec(naked) 1229 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, 1230 int dst_width, int x, int dx) { 1259 __declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, 1260 const uint8* src_argb, 1261 int dst_width, 1262 int x, 1263 int dx) { 1231 1264 __asm { 1232 1265 push esi 1233 1266 push edi 1234 mov edi, [esp + 8 + 4] 1235 mov esi, [esp + 8 + 8] 1236 mov ecx, [esp + 8 + 12] 1267 mov edi, [esp + 8 + 4] // dst_argb 1268 mov esi, [esp + 8 + 8] // src_argb 1269 mov ecx, [esp + 8 + 12] // dst_width 1237 1270 movd xmm2, [esp + 8 + 16] // x 1238 1271 movd xmm3, [esp + 8 + 20] // dx 1239 1272 movdqa xmm4, xmmword ptr kShuffleColARGB 1240 1273 movdqa xmm5, xmmword ptr kShuffleFractions 1241 pcmpeqb xmm6, xmm6 1274 pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. 1242 1275 psrlw xmm6, 9 1243 pextrw eax, xmm2, 1 1276 pextrw eax, xmm2, 1 // get x0 integer. preroll 1244 1277 sub ecx, 2 1245 1278 jl xloop29 1246 1279 1247 movdqa xmm0, xmm2 1280 movdqa xmm0, xmm2 // x1 = x0 + dx 1248 1281 paddd xmm0, xmm3 1249 punpckldq xmm2, xmm0 1250 punpckldq xmm3, xmm3 1251 paddd xmm3, xmm3 1252 pextrw edx, xmm2, 3 1282 punpckldq xmm2, xmm0 // x0 x1 1283 punpckldq xmm3, xmm3 // dx dx 1284 paddd xmm3, xmm3 // dx * 2, dx * 2 1285 pextrw edx, xmm2, 3 // get x1 integer. preroll 1253 1286 1254 1287 // 2 Pixel loop. 1255 1288 xloop2: 1256 movdqa xmm1, xmm2 1257 paddd xmm2, xmm3 1289 movdqa xmm1, xmm2 // x0, x1 fractions. 1290 paddd xmm2, xmm3 // x += dx 1258 1291 movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels 1259 psrlw xmm1, 9 1292 psrlw xmm1, 9 // 7 bit fractions. 1260 1293 movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels 1261 pshufb xmm1, xmm5 1262 pshufb xmm0, xmm4 1263 pxor xmm1, xmm6 1264 pmaddubsw xmm0, xmm1 1265 pextrw eax, xmm2, 1 1266 pextrw edx, xmm2, 3 1267 psrlw xmm0, 7 1268 packuswb xmm0, xmm0 1294 pshufb xmm1, xmm5 // 0000000011111111 1295 pshufb xmm0, xmm4 // arrange pixels into pairs 1296 pxor xmm1, xmm6 // 0..7f and 7f..0 1297 pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. 1298 pextrw eax, xmm2, 1 // get x0 integer. next iteration. 1299 pextrw edx, xmm2, 3 // get x1 integer. next iteration. 1300 psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. 1301 packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. 
1269 1302 movq qword ptr [edi], xmm0 1270 1303 lea edi, [edi + 8] 1271 sub ecx, 2 1304 sub ecx, 2 // 2 pixels 1272 1305 jge xloop2 1273 1306 … … 1277 1310 jl xloop99 1278 1311 1279 // 1 pixel remainder1280 psrlw xmm2, 9 1312 // 1 pixel remainder 1313 psrlw xmm2, 9 // 7 bit fractions. 1281 1314 movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels 1282 pshufb xmm2, xmm5 1283 pshufb xmm0, xmm4 1284 pxor xmm2, xmm6 1285 pmaddubsw xmm0, xmm2 1315 pshufb xmm2, xmm5 // 00000000 1316 pshufb xmm0, xmm4 // arrange pixels into pairs 1317 pxor xmm2, xmm6 // 0..7f and 7f..0 1318 pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. 1286 1319 psrlw xmm0, 7 1287 packuswb xmm0, xmm0 1320 packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. 1288 1321 movd [edi], xmm0 1289 1322 … … 1297 1330 1298 1331 // Reads 4 pixels, duplicates them and writes 8 pixels. 1299 __declspec(naked) 1300 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, 1301 int dst_width, int x, int dx) { 1302 __asm { 1303 mov edx, [esp + 4] // dst_argb 1304 mov eax, [esp + 8] // src_argb 1305 mov ecx, [esp + 12] // dst_width 1332 __declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb, 1333 const uint8* src_argb, 1334 int dst_width, 1335 int x, 1336 int dx) { 1337 __asm { 1338 mov edx, [esp + 4] // dst_argb 1339 mov eax, [esp + 8] // src_argb 1340 mov ecx, [esp + 12] // dst_width 1306 1341 1307 1342 wloop: … … 1322 1357 1323 1358 // Divide num by div and return as 16.16 fixed point result. 1324 __declspec(naked) 1325 int FixedDiv_X86(int num, int div) { 1326 __asm { 1327 mov eax, [esp + 4] // num 1328 cdq // extend num to 64 bits 1329 shld edx, eax, 16 // 32.16 1359 __declspec(naked) int FixedDiv_X86(int num, int div) { 1360 __asm { 1361 mov eax, [esp + 4] // num 1362 cdq // extend num to 64 bits 1363 shld edx, eax, 16 // 32.16 1330 1364 shl eax, 16 1331 1365 idiv dword ptr [esp + 8] … … 1335 1369 1336 1370 // Divide num by div and return as 16.16 fixed point result. 1337 __declspec(naked) 1338 int FixedDiv1_X86(int num, int div) { 1339 __asm { 1340 mov eax, [esp + 4] // num 1341 mov ecx, [esp + 8] // denom 1342 cdq // extend num to 64 bits 1343 shld edx, eax, 16 // 32.16 1371 __declspec(naked) int FixedDiv1_X86(int num, int div) { 1372 __asm { 1373 mov eax, [esp + 4] // num 1374 mov ecx, [esp + 8] // denom 1375 cdq // extend num to 64 bits 1376 shld edx, eax, 16 // 32.16 1344 1377 shl eax, 16 1345 1378 sub eax, 0x00010001
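For reference, a minimal scalar sketch (not part of this changeset) of the two pieces above whose behaviour is easiest to summarise: the updated ScaleFilterCols_SSSE3 path, where the new kFsub80/kFadd40 constants bias pixels into signed range before pmaddubsw and then add the bias plus a rounding term back, and FixedDiv_X86, which returns num divided by div as a 16.16 fixed-point value. The C function names and the edge handling here are illustrative only, assuming the 16.16 fixed-point x/dx convention used by the assembly.

#include <stdint.h>

// Rough scalar equivalent of one output pixel of ScaleFilterCols_SSSE3
// after this change: a 2-tap bilinear filter with 7-bit fractions and
// round-to-nearest, i.e. (p0 * (128 - f) + p1 * f + 64) >> 7.
// x and dx are 16.16 fixed point; keeping src[xi + 1] in bounds at the
// right edge is left to the caller, as in the assembly.
static void ScaleFilterColsC(uint8_t* dst, const uint8_t* src,
                             int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;            // integer source position
    int f = (x >> 9) & 0x7f;     // top 7 bits of the fraction (psrlw 9)
    dst[j] = (uint8_t)((src[xi] * (128 - f) + src[xi + 1] * f + 64) >> 7);
    x += dx;
  }
}

// Scalar equivalent of FixedDiv_X86: divide num by div and return the
// result as 16.16 fixed point, using a 64-bit dividend as the asm does.
static int FixedDivC(int num, int div) {
  return (int)(((int64_t)num * 65536) / div);
}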