Changeset 5699 for pjproject/trunk/third_party/yuv/source/compare_win.cc
- Timestamp: Nov 21, 2017 9:25:11 AM (7 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
pjproject/trunk/third_party/yuv/source/compare_win.cc
r5358 r5699 10 10 11 11 #include "libyuv/basic_types.h" 12 13 #include "libyuv/compare_row.h" 12 14 #include "libyuv/row.h" 15 16 #if defined(_MSC_VER) 17 #include <intrin.h> // For __popcnt 18 #endif 13 19 14 20 #ifdef __cplusplus … … 17 23 #endif 18 24 25 // This module is for 32 bit Visual C x86 and clangcl 19 26 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 20 #if (_MSC_VER >= 1900) 21 __declspec(naked) 22 #else 23 __declspec(naked) __declspec(align(16)) 24 #endif 25 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { 26 __asm { 27 mov eax, [esp + 4] // src_a 28 mov edx, [esp + 8] // src_b 29 mov ecx, [esp + 12] // count 27 28 uint32 HammingDistance_SSE42(const uint8* src_a, 29 const uint8* src_b, 30 int count) { 31 uint32 diff = 0u; 32 33 int i; 34 for (i = 0; i < count - 3; i += 4) { 35 uint32 x = *((uint32*)src_a) ^ *((uint32*)src_b); 36 src_a += 4; 37 src_b += 4; 38 diff += __popcnt(x); 39 } 40 return diff; 41 } 42 43 __declspec(naked) uint32 44 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { 45 __asm { 46 mov eax, [esp + 4] // src_a 47 mov edx, [esp + 8] // src_b 48 mov ecx, [esp + 12] // count 30 49 pxor xmm0, xmm0 31 50 pxor xmm5, xmm5 32 51 33 align 4 34 wloop: 35 movdqa xmm1, [eax] 52 wloop: 53 movdqu xmm1, [eax] 36 54 lea eax, [eax + 16] 37 movdq axmm2, [edx]55 movdqu xmm2, [edx] 38 56 lea edx, [edx + 16] 39 sub ecx, 1640 57 movdqa xmm3, xmm1 // abs trick 41 58 psubusb xmm1, xmm2 … … 49 66 paddd xmm0, xmm1 50 67 paddd xmm0, xmm2 68 sub ecx, 16 51 69 jg wloop 52 70 … … 63 81 #if _MSC_VER >= 1700 64 82 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. 
65 #pragma warning(disable: 4752) 66 #if (_MSC_VER >= 1900) 67 __declspec(naked) 68 #else 69 __declspec(naked) __declspec(align(16)) 70 #endif 71 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { 72 __asm { 73 mov eax, [esp + 4] // src_a 74 mov edx, [esp + 8] // src_b 75 mov ecx, [esp + 12] // count 83 #pragma warning(disable : 4752) 84 __declspec(naked) uint32 85 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { 86 __asm { 87 mov eax, [esp + 4] // src_a 88 mov edx, [esp + 8] // src_b 89 mov ecx, [esp + 12] // count 76 90 vpxor ymm0, ymm0, ymm0 // sum 77 91 vpxor ymm5, ymm5, ymm5 // constant 0 for unpck 78 92 sub edx, eax 79 93 80 align 481 94 wloop: 82 95 vmovdqu ymm1, [eax] 83 96 vmovdqu ymm2, [eax + edx] 84 97 lea eax, [eax + 32] 85 sub ecx, 3286 98 vpsubusb ymm3, ymm1, ymm2 // abs difference trick 87 99 vpsubusb ymm2, ymm2, ymm1 … … 93 105 vpaddd ymm0, ymm0, ymm1 94 106 vpaddd ymm0, ymm0, ymm2 107 sub ecx, 32 95 108 jg wloop 96 109 … … 108 121 #endif // _MSC_VER >= 1700 109 122 110 #define HAS_HASHDJB2_SSE41 111 static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 112 static uvec32 kHashMul0 = { 113 0x0c3525e1, // 33 ^ 15 114 0xa3476dc1, // 33 ^ 14 115 0x3b4039a1, // 33 ^ 13 116 0x4f5f0981, // 33 ^ 12 117 }; 118 static uvec32 kHashMul1 = { 119 0x30f35d61, // 33 ^ 11 120 0x855cb541, // 33 ^ 10 121 0x040a9121, // 33 ^ 9 122 0x747c7101, // 33 ^ 8 123 }; 124 static uvec32 kHashMul2 = { 125 0xec41d4e1, // 33 ^ 7 126 0x4cfa3cc1, // 33 ^ 6 127 0x025528a1, // 33 ^ 5 128 0x00121881, // 33 ^ 4 129 }; 130 static uvec32 kHashMul3 = { 131 0x00008c61, // 33 ^ 3 132 0x00000441, // 33 ^ 2 133 0x00000021, // 33 ^ 1 134 0x00000001, // 33 ^ 0 135 }; 136 137 // 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 138 // 44: 66 0F 38 40 DD pmulld xmm3,xmm5 139 // 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 140 // 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 141 // 83: 66 0F 38 40 CD pmulld xmm1,xmm5 142 #define pmulld(reg) _asm _emit 0x66 _asm _emit 
0x0F _asm _emit 0x38 \ 143 _asm _emit 0x40 _asm _emit reg 144 145 #if (_MSC_VER >= 1900) 146 __declspec(naked) 147 #else 148 __declspec(naked) __declspec(align(16)) 149 #endif 150 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { 151 __asm { 152 mov eax, [esp + 4] // src 153 mov ecx, [esp + 8] // count 123 uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 124 uvec32 kHashMul0 = { 125 0x0c3525e1, // 33 ^ 15 126 0xa3476dc1, // 33 ^ 14 127 0x3b4039a1, // 33 ^ 13 128 0x4f5f0981, // 33 ^ 12 129 }; 130 uvec32 kHashMul1 = { 131 0x30f35d61, // 33 ^ 11 132 0x855cb541, // 33 ^ 10 133 0x040a9121, // 33 ^ 9 134 0x747c7101, // 33 ^ 8 135 }; 136 uvec32 kHashMul2 = { 137 0xec41d4e1, // 33 ^ 7 138 0x4cfa3cc1, // 33 ^ 6 139 0x025528a1, // 33 ^ 5 140 0x00121881, // 33 ^ 4 141 }; 142 uvec32 kHashMul3 = { 143 0x00008c61, // 33 ^ 3 144 0x00000441, // 33 ^ 2 145 0x00000021, // 33 ^ 1 146 0x00000001, // 33 ^ 0 147 }; 148 149 __declspec(naked) uint32 150 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { 151 __asm { 152 mov eax, [esp + 4] // src 153 mov ecx, [esp + 8] // count 154 154 movd xmm0, [esp + 12] // seed 155 155 156 pxor xmm7, xmm7 // constant 0 for unpck 157 movdqa xmm6, kHash16x33 158 159 align 4 160 wloop: 161 movdqu xmm1, [eax] // src[0-15] 156 pxor xmm7, xmm7 // constant 0 for unpck 157 movdqa xmm6, xmmword ptr kHash16x33 158 159 wloop: 160 movdqu xmm1, [eax] // src[0-15] 162 161 lea eax, [eax + 16] 163 pmulld (0xc6) // pmulld xmm0,xmm6hash *= 33 ^ 16164 movdqa xmm5, kHashMul0162 pmulld xmm0, xmm6 // hash *= 33 ^ 16 163 movdqa xmm5, xmmword ptr kHashMul0 165 164 movdqa xmm2, xmm1 166 punpcklbw xmm2, xmm7 165 punpcklbw xmm2, xmm7 // src[0-7] 167 166 movdqa xmm3, xmm2 168 punpcklwd xmm3, xmm7 169 pmulld (0xdd) // pmulldxmm3, xmm5170 movdqa xmm5, kHashMul1167 punpcklwd xmm3, xmm7 // src[0-3] 168 pmulld xmm3, xmm5 169 movdqa xmm5, xmmword ptr kHashMul1 171 170 movdqa xmm4, xmm2 172 punpckhwd xmm4, xmm7 173 pmulld (0xe5) // pmulldxmm4, xmm5174 movdqa 
xmm5, kHashMul2175 punpckhbw xmm1, xmm7 171 punpckhwd xmm4, xmm7 // src[4-7] 172 pmulld xmm4, xmm5 173 movdqa xmm5, xmmword ptr kHashMul2 174 punpckhbw xmm1, xmm7 // src[8-15] 176 175 movdqa xmm2, xmm1 177 punpcklwd xmm2, xmm7 178 pmulld (0xd5) // pmulldxmm2, xmm5179 movdqa xmm5, kHashMul3180 punpckhwd xmm1, xmm7 181 pmulld (0xcd) // pmulldxmm1, xmm5182 paddd xmm3, xmm4 176 punpcklwd xmm2, xmm7 // src[8-11] 177 pmulld xmm2, xmm5 178 movdqa xmm5, xmmword ptr kHashMul3 179 punpckhwd xmm1, xmm7 // src[12-15] 180 pmulld xmm1, xmm5 181 paddd xmm3, xmm4 // add 16 results 183 182 paddd xmm1, xmm2 184 sub ecx, 16185 183 paddd xmm1, xmm3 186 184 … … 190 188 paddd xmm1, xmm2 191 189 paddd xmm0, xmm1 192 jg wloop 193 194 movd eax, xmm0 // return hash 190 sub ecx, 16 191 jg wloop 192 193 movd eax, xmm0 // return hash 195 194 ret 196 195 } … … 199 198 // Visual C 2012 required for AVX2. 200 199 #if _MSC_VER >= 1700 201 #if (_MSC_VER >= 1900) 202 __declspec(naked) 203 #else 204 __declspec(naked) __declspec(align(16)) 205 #endif 206 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { 207 __asm { 208 mov eax, [esp + 4] // src 209 mov ecx, [esp + 8] // count 210 movd xmm0, [esp + 12] // seed 211 movdqa xmm6, kHash16x33 212 213 align 4 214 wloop: 215 vpmovzxbd xmm3, dword ptr [eax] // src[0-3] 216 pmulld xmm0, xmm6 // hash *= 33 ^ 16 217 vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7] 218 pmulld xmm3, kHashMul0 219 vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11] 220 pmulld xmm4, kHashMul1 221 vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15] 222 pmulld xmm2, kHashMul2 200 __declspec(naked) uint32 201 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { 202 __asm { 203 mov eax, [esp + 4] // src 204 mov ecx, [esp + 8] // count 205 vmovd xmm0, [esp + 12] // seed 206 207 wloop: 208 vpmovzxbd xmm3, [eax] // src[0-3] 209 vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16 210 vpmovzxbd xmm4, [eax + 4] // src[4-7] 211 vpmulld xmm3, xmm3, xmmword ptr kHashMul0 
212 vpmovzxbd xmm2, [eax + 8] // src[8-11] 213 vpmulld xmm4, xmm4, xmmword ptr kHashMul1 214 vpmovzxbd xmm1, [eax + 12] // src[12-15] 215 vpmulld xmm2, xmm2, xmmword ptr kHashMul2 223 216 lea eax, [eax + 16] 224 pmulld xmm1, kHashMul3 225 paddd xmm3, xmm4 // add 16 results 226 paddd xmm1, xmm2 217 vpmulld xmm1, xmm1, xmmword ptr kHashMul3 218 vpaddd xmm3, xmm3, xmm4 // add 16 results 219 vpaddd xmm1, xmm1, xmm2 220 vpaddd xmm1, xmm1, xmm3 221 vpshufd xmm2, xmm1, 0x0e // upper 2 dwords 222 vpaddd xmm1, xmm1,xmm2 223 vpshufd xmm2, xmm1, 0x01 224 vpaddd xmm1, xmm1, xmm2 225 vpaddd xmm0, xmm0, xmm1 227 226 sub ecx, 16 228 paddd xmm1, xmm3 229 pshufd xmm2, xmm1, 0x0e // upper 2 dwords 230 paddd xmm1, xmm2 231 pshufd xmm2, xmm1, 0x01 232 paddd xmm1, xmm2 233 paddd xmm0, xmm1 234 jg wloop 235 236 movd eax, xmm0 // return hash 227 jg wloop 228 229 vmovd eax, xmm0 // return hash 230 vzeroupper 237 231 ret 238 232 } … … 240 234 #endif // _MSC_VER >= 1700 241 235 242 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)236 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) 243 237 244 238 #ifdef __cplusplus
Note: See TracChangeset for help on using the changeset viewer.