Changeset 5708 for pjproject/trunk/third_party/yuv/source/compare_win.cc
- Timestamp:
- Dec 4, 2017 7:23:36 AM (7 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
pjproject/trunk/third_party/yuv/source/compare_win.cc
r5699 r5708 10 10 11 11 #include "libyuv/basic_types.h" 12 13 #include "libyuv/compare_row.h"14 12 #include "libyuv/row.h" 15 13 … … 23 21 #endif 24 22 25 // This module is for 32 bit Visual C x86 and clangcl26 23 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 27 24 25 /* Visual Studio 2005 doesn't support __popcnt(). */ 26 #if (_MSC_VER > 1400) 28 27 uint32 HammingDistance_SSE42(const uint8* src_a, 29 28 const uint8* src_b, … … 40 39 return diff; 41 40 } 42 43 __declspec(naked) uint32 44 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { 45 __asm { 46 mov eax, [esp + 4] // src_a 47 mov edx, [esp + 8] // src_b 48 mov ecx, [esp + 12] // count 41 #endif 42 43 #if (_MSC_VER >= 1900) 44 __declspec(naked) 45 #else 46 __declspec(naked) __declspec(align(16)) 47 #endif 48 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { 49 __asm { 50 mov eax, [esp + 4] // src_a 51 mov edx, [esp + 8] // src_b 52 mov ecx, [esp + 12] // count 49 53 pxor xmm0, xmm0 50 54 pxor xmm5, xmm5 51 55 52 wloop: 53 movdqu xmm1, [eax] 56 align 4 57 wloop: 58 movdqa xmm1, [eax] 54 59 lea eax, [eax + 16] 55 movdq uxmm2, [edx]60 movdqa xmm2, [edx] 56 61 lea edx, [edx + 16] 62 sub ecx, 16 57 63 movdqa xmm3, xmm1 // abs trick 58 64 psubusb xmm1, xmm2 … … 66 72 paddd xmm0, xmm1 67 73 paddd xmm0, xmm2 68 sub ecx, 1669 74 jg wloop 70 75 … … 81 86 #if _MSC_VER >= 1700 82 87 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. 83 #pragma warning(disable : 4752) 84 __declspec(naked) uint32 85 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { 86 __asm { 87 mov eax, [esp + 4] // src_a 88 mov edx, [esp + 8] // src_b 89 mov ecx, [esp + 12] // count 88 #pragma warning(disable: 4752) 89 #if (_MSC_VER >= 1900) 90 __declspec(naked) 91 #else 92 __declspec(naked) __declspec(align(16)) 93 #endif 94 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { 95 __asm { 96 mov eax, [esp + 4] // src_a 97 mov edx, [esp + 8] // src_b 98 mov ecx, [esp + 12] // count 90 99 vpxor ymm0, ymm0, ymm0 // sum 91 100 vpxor ymm5, ymm5, ymm5 // constant 0 for unpck 92 101 sub edx, eax 93 102 103 align 4 94 104 wloop: 95 105 vmovdqu ymm1, [eax] 96 106 vmovdqu ymm2, [eax + edx] 97 107 lea eax, [eax + 32] 108 sub ecx, 32 98 109 vpsubusb ymm3, ymm1, ymm2 // abs difference trick 99 110 vpsubusb ymm2, ymm2, ymm1 … … 105 116 vpaddd ymm0, ymm0, ymm1 106 117 vpaddd ymm0, ymm0, ymm2 107 sub ecx, 32108 118 jg wloop 109 119 … … 121 131 #endif // _MSC_VER >= 1700 122 132 123 uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 124 uvec32 kHashMul0 = { 125 0x0c3525e1, // 33 ^ 15 126 0xa3476dc1, // 33 ^ 14 127 0x3b4039a1, // 33 ^ 13 128 0x4f5f0981, // 33 ^ 12 129 }; 130 uvec32 kHashMul1 = { 131 0x30f35d61, // 33 ^ 11 132 0x855cb541, // 33 ^ 10 133 0x040a9121, // 33 ^ 9 134 0x747c7101, // 33 ^ 8 135 }; 136 uvec32 kHashMul2 = { 137 0xec41d4e1, // 33 ^ 7 138 0x4cfa3cc1, // 33 ^ 6 139 0x025528a1, // 33 ^ 5 140 0x00121881, // 33 ^ 4 141 }; 142 uvec32 kHashMul3 = { 143 0x00008c61, // 33 ^ 3 144 0x00000441, // 33 ^ 2 145 0x00000021, // 33 ^ 1 146 0x00000001, // 33 ^ 0 147 }; 148 149 __declspec(naked) uint32 150 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { 151 __asm { 152 mov eax, [esp + 4] // src 153 mov ecx, [esp + 8] // count 133 #define HAS_HASHDJB2_SSE41 134 static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 135 static uvec32 kHashMul0 = { 136 0x0c3525e1, // 33 ^ 15 137 0xa3476dc1, // 33 ^ 14 138 0x3b4039a1, // 33 ^ 13 139 0x4f5f0981, // 33 ^ 12 140 }; 141 static uvec32 kHashMul1 = { 142 0x30f35d61, // 33 ^ 11 143 0x855cb541, // 33 ^ 10 144 0x040a9121, // 33 ^ 9 145 0x747c7101, // 33 ^ 8 146 }; 147 static uvec32 kHashMul2 = { 148 0xec41d4e1, // 33 ^ 7 149 0x4cfa3cc1, // 33 ^ 6 150 0x025528a1, // 33 ^ 5 151 0x00121881, // 33 ^ 4 152 }; 153 static uvec32 kHashMul3 = { 154 0x00008c61, // 33 ^ 3 155 0x00000441, // 33 ^ 2 156 0x00000021, // 33 ^ 1 157 0x00000001, // 33 ^ 0 158 }; 159 160 // 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 161 // 44: 66 0F 38 40 DD pmulld xmm3,xmm5 162 // 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 163 // 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 164 // 83: 66 0F 38 40 CD pmulld xmm1,xmm5 165 #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ 166 _asm _emit 0x40 _asm _emit reg 167 168 #if (_MSC_VER >= 1900) 169 __declspec(naked) 170 #else 171 __declspec(naked) __declspec(align(16)) 172 #endif 173 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { 174 __asm { 175 mov eax, [esp + 4] // src 176 mov ecx, [esp + 8] // count 154 177 movd xmm0, [esp + 12] // seed 155 178 156 pxor xmm7, xmm7 // constant 0 for unpck 157 movdqa xmm6, xmmword ptr kHash16x33 158 159 wloop: 160 movdqu xmm1, [eax] // src[0-15] 179 pxor xmm7, xmm7 // constant 0 for unpck 180 movdqa xmm6, kHash16x33 181 182 align 4 183 wloop: 184 movdqu xmm1, [eax] // src[0-15] 161 185 lea eax, [eax + 16] 162 pmulld xmm0, xmm6 //hash *= 33 ^ 16163 movdqa xmm5, xmmword ptrkHashMul0186 pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16 187 movdqa xmm5, kHashMul0 164 188 movdqa xmm2, xmm1 165 punpcklbw xmm2, xmm7 // src[0-7]189 punpcklbw xmm2, xmm7 // src[0-7] 166 190 movdqa xmm3, xmm2 167 punpcklwd xmm3, xmm7 // src[0-3]168 pmulld xmm3, xmm5169 movdqa xmm5, xmmword ptrkHashMul1191 punpcklwd xmm3, xmm7 // src[0-3] 192 pmulld(0xdd) // pmulld xmm3, xmm5 193 movdqa xmm5, kHashMul1 170 194 movdqa xmm4, xmm2 171 punpckhwd xmm4, xmm7 // src[4-7]172 pmulld xmm4, xmm5173 movdqa xmm5, xmmword ptrkHashMul2174 punpckhbw xmm1, xmm7 // src[8-15]195 punpckhwd xmm4, xmm7 // src[4-7] 196 pmulld(0xe5) // pmulld xmm4, xmm5 197 movdqa xmm5, kHashMul2 198 punpckhbw xmm1, xmm7 // src[8-15] 175 199 movdqa xmm2, xmm1 176 punpcklwd xmm2, xmm7 // src[8-11] 177 pmulld xmm2, xmm5 178 movdqa xmm5, xmmword ptr kHashMul3 179 punpckhwd xmm1, xmm7 // src[12-15] 180 pmulld xmm1, xmm5 181 paddd xmm3, xmm4 // add 16 results 182 paddd xmm1, xmm2 200 punpcklwd xmm2, xmm7 // src[8-11] 201 pmulld(0xd5) // pmulld xmm2, xmm5 202 movdqa xmm5, kHashMul3 203 punpckhwd xmm1, xmm7 // src[12-15] 204 pmulld(0xcd) // pmulld xmm1, xmm5 205 paddd xmm3, xmm4 // add 16 results 206 paddd xmm1, xmm2 207 sub ecx, 16 183 208 paddd xmm1, xmm3 184 209 … … 188 213 paddd xmm1, xmm2 189 214 paddd xmm0, xmm1 190 sub ecx, 16 191 jg wloop 192 193 movd eax, xmm0 // return hash 215 jg wloop 216 217 movd eax, xmm0 // return hash 194 218 ret 195 219 } … … 198 222 // Visual C 2012 required for AVX2. 199 223 #if _MSC_VER >= 1700 200 __declspec(naked) uint32 201 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { 202 __asm { 203 mov eax, [esp + 4] // src 204 mov ecx, [esp + 8] // count 205 vmovd xmm0, [esp + 12] // seed 206 207 wloop: 208 vpmovzxbd xmm3, [eax] // src[0-3] 209 vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16 210 vpmovzxbd xmm4, [eax + 4] // src[4-7] 211 vpmulld xmm3, xmm3, xmmword ptr kHashMul0 212 vpmovzxbd xmm2, [eax + 8] // src[8-11] 213 vpmulld xmm4, xmm4, xmmword ptr kHashMul1 214 vpmovzxbd xmm1, [eax + 12] // src[12-15] 215 vpmulld xmm2, xmm2, xmmword ptr kHashMul2 224 #if (_MSC_VER >= 1900) 225 __declspec(naked) 226 #else 227 __declspec(naked) __declspec(align(16)) 228 #endif 229 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { 230 __asm { 231 mov eax, [esp + 4] // src 232 mov ecx, [esp + 8] // count 233 movd xmm0, [esp + 12] // seed 234 movdqa xmm6, kHash16x33 235 236 align 4 237 wloop: 238 vpmovzxbd xmm3, dword ptr [eax] // src[0-3] 239 pmulld xmm0, xmm6 // hash *= 33 ^ 16 240 vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7] 241 pmulld xmm3, kHashMul0 242 vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11] 243 pmulld xmm4, kHashMul1 244 vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15] 245 pmulld xmm2, kHashMul2 216 246 lea eax, [eax + 16] 217 vpmulld xmm1, xmm1, xmmword ptr kHashMul3 218 vpaddd xmm3, xmm3, xmm4 // add 16 results 219 vpaddd xmm1, xmm1, xmm2 220 vpaddd xmm1, xmm1, xmm3 221 vpshufd xmm2, xmm1, 0x0e // upper 2 dwords 222 vpaddd xmm1, xmm1,xmm2 223 vpshufd xmm2, xmm1, 0x01 224 vpaddd xmm1, xmm1, xmm2 225 vpaddd xmm0, xmm0, xmm1 247 pmulld xmm1, kHashMul3 248 paddd xmm3, xmm4 // add 16 results 249 paddd xmm1, xmm2 226 250 sub ecx, 16 227 jg wloop 228 229 vmovd eax, xmm0 // return hash 230 vzeroupper 251 paddd xmm1, xmm3 252 pshufd xmm2, xmm1, 0x0e // upper 2 dwords 253 paddd xmm1, xmm2 254 pshufd xmm2, xmm1, 0x01 255 paddd xmm1, xmm2 256 paddd xmm0, xmm1 257 jg wloop 258 259 movd eax, xmm0 // return hash 231 260 ret 232 261 } … … 234 263 #endif // _MSC_VER >= 1700 235 264 236 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) 265 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 237 266 238 267 #ifdef __cplusplus
Note: See TracChangeset
for help on using the changeset viewer.