OpenCV 4.5.3 (Japanese machine translation)
intrin_sse.hpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
24 //
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
28 //
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44
45 #ifndef OPENCV_HAL_SSE_HPP
46 #define OPENCV_HAL_SSE_HPP
47
48 #include <algorithm>
49 #include "opencv2/core/utility.hpp"
50
51 #define CV_SIMD128 1
52 #define CV_SIMD128_64F 1
53 #define CV_SIMD128_FP16 0 // no native operations with FP16 type.
54
55 namespace cv
56{
57
59
60 //
61 // Compilation troubleshooting:
62 // - MSVC: error C2719: 'a': formal parameter with requested alignment of 16 won't be aligned
63 // Replace parameter declaration to const reference:
64 // -v_int32x4 a
65 // +const v_int32x4& a
66 //
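Usage note (illustrative, not part of intrin_sse.hpp): a minimal sketch of the workaround described in the comment above, assuming an SSE2-enabled OpenCV build; the helper name add_by_ref is hypothetical.

    #include <opencv2/core/hal/intrin.hpp>

    // Passing the 16-byte aligned vector by value can trigger MSVC error C2719
    // on 32-bit targets, so the parameter is taken by const reference instead.
    inline cv::v_int32x4 add_by_ref(const cv::v_int32x4& a, const cv::v_int32x4& b)
    {
        return a + b;   // operator+ for v_int32x4 is defined further down in this header
    }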
67
68CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
69
71
72 struct v_uint8x16
73{
74 typedef uchar lane_type;
75 typedef __m128i vector_type;
76 enum { nlanes = 16 };
77
78 /* coverity[uninit_ctor]: suppress warning */
79 v_uint8x16() {}
80 explicit v_uint8x16(__m128i v) : val(v) {}
81 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
82 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
83 {
84 val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
85 (char)v4, (char)v5, (char)v6, (char)v7,
86 (char)v8, (char)v9, (char)v10, (char)v11,
87 (char)v12, (char)v13, (char)v14, (char)v15);
88 }
89
90 uchar get0() const
91 {
92 return (uchar)_mm_cvtsi128_si32(val);
93 }
94
95 __m128i val;
96};
97
98 struct v_int8x16
99{
100 typedef schar lane_type;
101 typedef __m128i vector_type;
102 enum { nlanes = 16 };
103
104 /* coverity[uninit_ctor]: suppress warning */
105 v_int8x16() {}
106 explicit v_int8x16(__m128i v) : val(v) {}
107 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
108 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
109 {
110 val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
111 (char)v4, (char)v5, (char)v6, (char)v7,
112 (char)v8, (char)v9, (char)v10, (char)v11,
113 (char)v12, (char)v13, (char)v14, (char)v15);
114 }
115
116 schar get0() const
117 {
118 return (schar)_mm_cvtsi128_si32(val);
119 }
120
121 __m128i val;
122};
123
124 struct v_uint16x8
125{
126 typedef ushort lane_type;
127 typedef __m128i vector_type;
128 enum { nlanes = 8 };
129
130 /* coverity[uninit_ctor]: suppress warning */
131 v_uint16x8() {}
132 explicit v_uint16x8(__m128i v) : val(v) {}
133 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
134 {
135 val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
136 (short)v4, (short)v5, (short)v6, (short)v7);
137 }
138
139 ushort get0() const
140 {
141 return (ushort)_mm_cvtsi128_si32(val);
142 }
143
144 __m128i val;
145};
146
147 struct v_int16x8
148{
149 typedef short lane_type;
150 typedef __m128i vector_type;
151 enum { nlanes = 8 };
152
153 /* coverity[uninit_ctor]: suppress warning */
154 v_int16x8() {}
155 explicit v_int16x8(__m128i v) : val(v) {}
156 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
157 {
158 val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
159 (short)v4, (short)v5, (short)v6, (short)v7);
160 }
161
162 short get0() const
163 {
164 return (short)_mm_cvtsi128_si32(val);
165 }
166
167 __m128i val;
168};
169
170 struct v_uint32x4
171{
172 typedef unsigned lane_type;
173 typedef __m128i vector_type;
174 enum { nlanes = 4 };
175
176 /* coverity[uninit_ctor]: suppress warning */
177 v_uint32x4() {}
178 explicit v_uint32x4(__m128i v) : val(v) {}
179 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
180 {
181 val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
182 }
183
184 unsigned get0() const
185 {
186 return (unsigned)_mm_cvtsi128_si32(val);
187 }
188
189 __m128i val;
190};
191
192 struct v_int32x4
193{
194 typedef int lane_type;
195 typedef __m128i vector_type;
196 enum { nlanes = 4 };
197
198 /* coverity[uninit_ctor]: suppress warning */
199 v_int32x4() {}
200 explicit v_int32x4(__m128i v) : val(v) {}
201 v_int32x4(int v0, int v1, int v2, int v3)
202 {
203 val = _mm_setr_epi32(v0, v1, v2, v3);
204 }
205
206 int get0() const
207 {
208 return _mm_cvtsi128_si32(val);
209 }
210
211 __m128i val;
212};
213
214 struct v_float32x4
215{
216 typedef float lane_type;
217 typedef __m128 vector_type;
218 enum { nlanes = 4 };
219
220 /* coverity[uninit_ctor]: suppress warning */
221 v_float32x4() {}
222 explicit v_float32x4(__m128 v) : val(v) {}
223 v_float32x4(float v0, float v1, float v2, float v3)
224 {
225 val = _mm_setr_ps(v0, v1, v2, v3);
226 }
227
228 float get0() const
229 {
230 return _mm_cvtss_f32(val);
231 }
232
233 __m128 val;
234};
235
236 struct v_uint64x2
237{
238 typedef uint64 lane_type;
239 typedef __m128i vector_type;
240 enum { nlanes = 2 };
241
242 /* coverity[uninit_ctor]: suppress warning */
243 v_uint64x2() {}
244 explicit v_uint64x2(__m128i v) : val(v) {}
245 v_uint64x2(uint64 v0, uint64 v1)
246 {
247 val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
248 }
249
250 uint64 get0() const
251 {
252 #if !defined(__x86_64__) && !defined(_M_X64)
253 int a = _mm_cvtsi128_si32(val);
254 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
255 return (unsigned)a | ((uint64)(unsigned)b << 32);
256 #else
257 return (uint64)_mm_cvtsi128_si64(val);
258 #endif
259 }
260
261 __m128i val;
262};
263
264 struct v_int64x2
265{
266 typedef int64 lane_type;
267 typedef __m128i vector_type;
268 enum { nlanes = 2 };
269
270 /* coverity[uninit_ctor]: suppress warning */
271 v_int64x2() {}
272 explicit v_int64x2(__m128i v) : val(v) {}
273 v_int64x2(int64 v0, int64 v1)
274 {
275 val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
276 }
277
278 int64 get0() const
279 {
280 #if !defined(__x86_64__) && !defined(_M_X64)
281 int a = _mm_cvtsi128_si32(val);
282 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
283 return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
284 #else
285 return _mm_cvtsi128_si64(val);
286 #endif
287 }
288
289 __m128i val;
290};
291
292 struct v_float64x2
293{
294 typedef double lane_type;
295 typedef __m128d vector_type;
296 enum { nlanes = 2 };
297
298 /* coverity[uninit_ctor]: suppress warning */
299 v_float64x2() {}
300 explicit v_float64x2(__m128d v) : val(v) {}
301 v_float64x2(double v0, double v1)
302 {
303 val = _mm_setr_pd(v0, v1);
304 }
305
306 double get0() const
307 {
308 return _mm_cvtsd_f64(val);
309 }
310
311 __m128d val;
312};
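Usage note (illustrative, not part of intrin_sse.hpp): a minimal sketch of how these register types are used, assuming the umbrella header opencv2/core/hal/intrin.hpp and a build where CV_SIMD128 is 1. Each type wraps a raw __m128/__m128i/__m128d in its public val member, and get0() reads lane 0.

    #include <opencv2/core/hal/intrin.hpp>
    #include <cstdio>

    int main()
    {
        cv::v_float32x4 v(1.f, 2.f, 3.f, 4.f);      // four float lanes
        cv::v_int32x4   i = cv::v_setall_s32(7);    // broadcast (v_setall_* is defined below)
        std::printf("%f %d\n", v.get0(), i.get0()); // prints "1.000000 7"
        __m128 raw = v.val;                         // the underlying SSE register
        (void)raw;
        return 0;
    }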
313
314 namespace hal_sse_internal
315{
316 template <typename to_sse_type, typename from_sse_type>
317 to_sse_type v_sse_reinterpret_as(const from_sse_type& val);
318
319 #define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
320 template<> inline \
321 to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
322 { return sse_cast_intrin(a); }
323
324 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP)
325 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128)
326 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128)
327 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps)
328 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP)
329 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps)
330 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd)
331 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd)
332 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP)
333}
334
335 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
336 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
337 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
338 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
339 { return _Tpvec(cast(a.val)); }
340
341OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP)
342OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP)
343OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
344OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
345OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
346OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
347OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
348OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
349
350 inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
351 inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
352 inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
353 inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
354
355 template<typename _Tpvec> inline
356 v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
357 template<typename _Tpvec> inline
358 v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
359 inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
360{ return v_float32x4(_mm_castsi128_ps(a.val)); }
361 inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
362{ return v_float32x4(_mm_castsi128_ps(a.val)); }
363 inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
364{ return v_float64x2(_mm_castsi128_pd(a.val)); }
365 inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
366{ return v_float64x2(_mm_castsi128_pd(a.val)); }
367
368 #define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
369 inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
370 { return _Tpvec(_mm_castps_si128(a.val)); } \
371 inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
372 { return _Tpvec(_mm_castpd_si128(a.val)); }
373
374OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
375OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
376OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
377OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
378OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
379OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
380OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
381OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
382
383 inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; }
384 inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; }
385 inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); }
386 inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); }
387
389 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
390{
391 __m128i delta = _mm_set1_epi16(255);
392 return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
393 _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
394}
395
396 inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
397{
398 __m128i delta = _mm_set1_epi16(255);
399 __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
400 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
401}
402
403 inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
404{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }
405
406 inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
407{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
408
409 template<int n> inline
410 v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
411{
412 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
413 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
414 return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
415 _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
416}
417
418 template<int n> inline
419 void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
420{
421 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
422 __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
423 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
424}
425
426 template<int n> inline
427 v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
428{
429 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
430 return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
431 _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
432}
433
434 template<int n> inline
435 void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
436{
437 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
438 __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
439 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
440}
441
442 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
443{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
444
445 inline void v_pack_store(schar* ptr, const v_int16x8& a)
446{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
447
448 template<int n> inline
449 v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
450{
451 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
452 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
453 return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
454 _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
455}
456 template<int n> inline
457 void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
458{
459 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
460 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
461 __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
462 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
463}
464
465
466 // byte-wise "mask ? a : b"
467 inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
468{
469 #if CV_SSE4_1
470 return _mm_blendv_epi8(b, a, mask);
471 #else
472 return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
473 #endif
474}
475
476 inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
477{ return v_uint16x8(_v128_packs_epu32(a.val, b.val)); }
478
479 inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
480{
481 __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
482 __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
483 __m128i r = _mm_packs_epi32(a1, a1);
484 _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
485}
486
487 template<int n> inline
488 v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
489{
490 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
491 __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
492 __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
493 return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
494}
495
496 template<int n> inline
497 void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
498{
499 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
500 __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
501 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
502 _mm_storel_epi64((__m128i*)ptr, a2);
503}
504
505 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
506{
507 #if CV_SSE4_1
508 return v_uint16x8(_mm_packus_epi32(a.val, b.val));
509 #else
510 __m128i delta32 = _mm_set1_epi32(32768);
511
512 // preliminary saturate negative values to zero
513 __m128i a1 = _mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, _mm_set1_epi32(0)));
514 __m128i b1 = _mm_and_si128(b.val, _mm_cmpgt_epi32(b.val, _mm_set1_epi32(0)));
515
516 __m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32));
517 return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
518 #endif
519}
520
521 inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
522{
523 #if CV_SSE4_1
524 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a.val, a.val));
525 #else
526 __m128i delta32 = _mm_set1_epi32(32768);
527 __m128i a1 = _mm_sub_epi32(a.val, delta32);
528 __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
529 _mm_storel_epi64((__m128i*)ptr, r);
530 #endif
531}
532
533 template<int n> inline
534 v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
535{
536 #if CV_SSE4_1
537 __m128i delta = _mm_set1_epi32(1 << (n - 1));
538 return v_uint16x8(_mm_packus_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
539 _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
540 #else
541 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
542 __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
543 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
544 __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
545 __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
546 return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
547 #endif
548}
549
550 template<int n> inline
551 void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
552{
553 #if CV_SSE4_1
554 __m128i delta = _mm_set1_epi32(1 << (n - 1));
555 __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
556 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a1, a1));
557 #else
558 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
559 __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
560 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
561 _mm_storel_epi64((__m128i*)ptr, a2);
562 #endif
563}
564
565 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
566{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); }
567
568 inline void v_pack_store(short* ptr, const v_int32x4& a)
569{
570 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
571}
572
573 template<int n> inline
574 v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
575{
576 __m128i delta = _mm_set1_epi32(1 << (n-1));
577 return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
578 _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
579}
580
581 template<int n> inline
582 void v_rshr_pack_store(short* ptr, const v_int32x4& a)
583{
584 __m128i delta = _mm_set1_epi32(1 << (n-1));
585 __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
586 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
587}
588
589
590 // [a0 0 | b0 0] [a1 0 | b1 0]
591 inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
592{
593 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
594 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
595 return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
596}
597
598 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
599{
600 __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
601 _mm_storel_epi64((__m128i*)ptr, a1);
602}
603
604 // [a0 0 | b0 0] [a1 0 | b1 0]
605 inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
606{
607 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
608 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
609 return v_int32x4(_mm_unpacklo_epi32(v0, v1));
610}
611
612 inline void v_pack_store(int* ptr, const v_int64x2& a)
613{
614 __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
615 _mm_storel_epi64((__m128i*)ptr, a1);
616}
617
618 template<int n> inline
619 v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
620{
621 uint64 delta = (uint64)1 << (n-1);
622 v_uint64x2 delta2(delta, delta);
623 __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
624 __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
625 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
626 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
627 return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
628}
629
630 template<int n> inline
631 void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
632{
633 uint64 delta = (uint64)1 << (n-1);
634 v_uint64x2 delta2(delta, delta);
635 __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
636 __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
637 _mm_storel_epi64((__m128i*)ptr, a2);
638}
639
640 inline __m128i v_sign_epi64(__m128i a)
641{
642 return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
643}
644
645 inline __m128i v_srai_epi64(__m128i a, int imm)
646{
647 __m128i smask = v_sign_epi64(a);
648 return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
649}
650
651 template<int n> inline
652 v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
653{
654 int64 delta = (int64)1 << (n-1);
655 v_int64x2 delta2(delta, delta);
656 __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
657 __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
658 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
659 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
660 return v_int32x4(_mm_unpacklo_epi32(v0, v1));
661}
662
663 template<int n> inline
664 void v_rshr_pack_store(int* ptr, const v_int64x2& a)
665{
666 int64 delta = (int64)1 << (n-1);
667 v_int64x2 delta2(delta, delta);
668 __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
669 __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
670 _mm_storel_epi64((__m128i*)ptr, a2);
671}
672
673 // pack boolean
674 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
675{
676 __m128i ab = _mm_packs_epi16(a.val, b.val);
677 return v_uint8x16(ab);
678}
679
680 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
681 const v_uint32x4& c, const v_uint32x4& d)
682{
683 __m128i ab = _mm_packs_epi32(a.val, b.val);
684 __m128i cd = _mm_packs_epi32(c.val, d.val);
685 return v_uint8x16(_mm_packs_epi16(ab, cd));
686}
687
688 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
689 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
690 const v_uint64x2& g, const v_uint64x2& h)
691{
692 __m128i ab = _mm_packs_epi32(a.val, b.val);
693 __m128i cd = _mm_packs_epi32(c.val, d.val);
694 __m128i ef = _mm_packs_epi32(e.val, f.val);
695 __m128i gh = _mm_packs_epi32(g.val, h.val);
696
697 __m128i abcd = _mm_packs_epi32(ab, cd);
698 __m128i efgh = _mm_packs_epi32(ef, gh);
699 return v_uint8x16(_mm_packs_epi16(abcd, efgh));
700}
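Usage note (illustrative, not part of intrin_sse.hpp): a short sketch of the saturating narrowing provided by the pack family above; the function name narrow_example is hypothetical.

    #include <opencv2/core/hal/intrin.hpp>

    inline cv::v_uint8x16 narrow_example()
    {
        cv::v_int16x8 lo(-5, 0, 100, 300, 1, 2, 3, 4);
        cv::v_int16x8 hi(5, 6, 7, 8, 9, 10, 11, 12);
        // unsigned saturating pack: -5 -> 0, 300 -> 255, in-range values are copied
        return cv::v_pack_u(lo, hi);
    }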
701
702 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
703 const v_float32x4& m1, const v_float32x4& m2,
704 const v_float32x4& m3)
705{
706 __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
707 __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
708 __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
709 __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);
710
711 return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
712}
713
714 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
715 const v_float32x4& m1, const v_float32x4& m2,
716 const v_float32x4& a)
717{
718 __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
719 __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
720 __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
721
722 return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
723}
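Usage note (illustrative, not part of intrin_sse.hpp): v_matmul computes v0*m0 + v1*m1 + v2*m2 + v3*m3, so passing the columns of a 4x4 matrix yields M*v; a minimal sketch with a hypothetical transform_point helper.

    #include <opencv2/core/hal/intrin.hpp>

    inline cv::v_float32x4 transform_point()
    {
        cv::v_float32x4 col0(1.f, 0.f, 0.f, 0.f);
        cv::v_float32x4 col1(0.f, 1.f, 0.f, 0.f);
        cv::v_float32x4 col2(0.f, 0.f, 1.f, 0.f);
        cv::v_float32x4 col3(10.f, 20.f, 30.f, 1.f);    // translation column
        cv::v_float32x4 p(2.f, 3.f, 4.f, 1.f);
        return cv::v_matmul(p, col0, col1, col2, col3); // (12, 23, 34, 1)
    }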
724
725 #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
726 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
727 { \
728 return _Tpvec(intrin(a.val, b.val)); \
729 } \
730 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
731 { \
732 a.val = intrin(a.val, b.val); \
733 return a; \
734 }
735
736OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
737OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
738OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
739OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
740OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
741OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
742OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
743OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
744OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
745OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
746OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32)
747OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
748OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
749OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
750OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
751OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
752OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
753OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
754OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
755OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
756OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
757OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
758OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
759OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
760OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
761OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
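Usage note (illustrative, not part of intrin_sse.hpp): the operators above saturate for 8- and 16-bit lanes (_mm_adds_*/_mm_subs_*) and wrap around for 32- and 64-bit lanes; a small sketch.

    #include <opencv2/core/hal/intrin.hpp>

    inline void saturation_example()
    {
        cv::v_uint8x16 c = cv::v_setall_u8(200) + cv::v_setall_u8(100);
        // 8-bit addition saturates: every lane of c is 255, not 44
        cv::v_uint32x4 d = cv::v_setall_u32(0xffffffffu) + cv::v_setall_u32(1);
        // 32-bit addition wraps around: every lane of d is 0
        (void)c; (void)d;
    }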
762
763 // saturating multiply 8-bit, 16-bit
764 #define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \
765 inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
766 { \
767 _Tpwvec c, d; \
768 v_mul_expand(a, b, c, d); \
769 return v_pack(c, d); \
770 } \
771 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
772 { a = a * b; return a; }
773
774OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8)
775OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8)
776OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4)
777OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8, v_int32x4)
778
779 // Multiply and expand
780 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
781 v_uint16x8& c, v_uint16x8& d)
782{
783 v_uint16x8 a0, a1, b0, b1;
784 v_expand(a, a0, a1);
785 v_expand(b, b0, b1);
786 c = v_mul_wrap(a0, b0);
787 d = v_mul_wrap(a1, b1);
788}
789
790 inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
791 v_int16x8& c, v_int16x8& d)
792{
793 v_int16x8 a0, a1, b0, b1;
794 v_expand(a, a0, a1);
795 v_expand(b, b0, b1);
796 c = v_mul_wrap(a0, b0);
797 d = v_mul_wrap(a1, b1);
798}
799
800 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
801 v_int32x4& c, v_int32x4& d)
802{
803 __m128i v0 = _mm_mullo_epi16(a.val, b.val);
804 __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
805 c.val = _mm_unpacklo_epi16(v0, v1);
806 d.val = _mm_unpackhi_epi16(v0, v1);
807}
808
809 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
810 v_uint32x4& c, v_uint32x4& d)
811{
812 __m128i v0 = _mm_mullo_epi16(a.val, b.val);
813 __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
814 c.val = _mm_unpacklo_epi16(v0, v1);
815 d.val = _mm_unpackhi_epi16(v0, v1);
816}
817
818 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
819 v_uint64x2& c, v_uint64x2& d)
820{
821 __m128i c0 = _mm_mul_epu32(a.val, b.val);
822 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
823 c.val = _mm_unpacklo_epi64(c0, c1);
824 d.val = _mm_unpackhi_epi64(c0, c1);
825}
826
827 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); }
828 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); }
829
831
832 // 16 >> 32
833 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
834{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); }
835 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
836{ return v_dotprod(a, b) + c; }
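Usage note (illustrative, not part of intrin_sse.hpp): v_dotprod wraps _mm_madd_epi16, multiplying adjacent 16-bit pairs and summing each pair into a 32-bit lane; a minimal sketch.

    #include <opencv2/core/hal/intrin.hpp>

    inline cv::v_int32x4 dotprod_example()
    {
        cv::v_int16x8 a(1, 2, 3, 4, 5, 6, 7, 8);
        cv::v_int16x8 b = cv::v_setall_s16(2);
        // result lanes: (1*2 + 2*2, 3*2 + 4*2, 5*2 + 6*2, 7*2 + 8*2) = (6, 14, 22, 30)
        return cv::v_dotprod(a, b);
    }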
837
838 // 32 >> 64
839 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
840{
841 #if CV_SSE4_1
842 __m128i even = _mm_mul_epi32(a.val, b.val);
843 __m128i odd = _mm_mul_epi32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
844 return v_int64x2(_mm_add_epi64(even, odd));
845 #else
846 __m128i even_u = _mm_mul_epu32(a.val, b.val);
847 __m128i odd_u = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
848 // convert unsigned to signed high multiplication (from: Agner Fog(veclib) and H S Warren: Hacker's delight, 2003, p. 132)
849 __m128i a_sign = _mm_srai_epi32(a.val, 31);
850 __m128i b_sign = _mm_srai_epi32(b.val, 31);
851 // |x * sign of x
852 __m128i axb = _mm_and_si128(a.val, b_sign);
853 __m128i bxa = _mm_and_si128(b.val, a_sign);
854 // sum of sign corrections
855 __m128i ssum = _mm_add_epi32(bxa, axb);
856 __m128i even_ssum = _mm_slli_epi64(ssum, 32);
857 __m128i odd_ssum = _mm_and_si128(ssum, _mm_set_epi32(-1, 0, -1, 0));
858 // convert to signed and prod
859 return v_int64x2(_mm_add_epi64(_mm_sub_epi64(even_u, even_ssum), _mm_sub_epi64(odd_u, odd_ssum)));
860 #endif
861}
862 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
863{ return v_dotprod(a, b) + c; }
864
865 // 8 >> 32
866 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
867{
868 __m128i a0 = _mm_srli_epi16(_mm_slli_si128(a.val, 1), 8); // even
869 __m128i a1 = _mm_srli_epi16(a.val, 8); // odd
870 __m128i b0 = _mm_srli_epi16(_mm_slli_si128(b.val, 1), 8);
871 __m128i b1 = _mm_srli_epi16(b.val, 8);
872 __m128i p0 = _mm_madd_epi16(a0, b0);
873 __m128i p1 = _mm_madd_epi16(a1, b1);
874 return v_uint32x4(_mm_add_epi32(p0, p1));
875}
876 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
877{ return v_dotprod_expand(a, b) + c; }
878
879 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
880{
881 __m128i a0 = _mm_srai_epi16(_mm_slli_si128(a.val, 1), 8); // even
882 __m128i a1 = _mm_srai_epi16(a.val, 8); // odd
883 __m128i b0 = _mm_srai_epi16(_mm_slli_si128(b.val, 1), 8);
884 __m128i b1 = _mm_srai_epi16(b.val, 8);
885 __m128i p0 = _mm_madd_epi16(a0, b0);
886 __m128i p1 = _mm_madd_epi16(a1, b1);
887 return v_int32x4(_mm_add_epi32(p0, p1));
888}
889 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
890{ return v_dotprod_expand(a, b) + c; }
891
892 // 16 >> 64
893 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
894{
895 v_uint32x4 c, d;
896 v_mul_expand(a, b, c, d);
897
898 v_uint64x2 c0, c1, d0, d1;
899 v_expand(c, c0, c1);
900 v_expand(d, d0, d1);
901
902 c0 += c1; d0 += d1;
903 return v_uint64x2(_mm_add_epi64(
904 _mm_unpacklo_epi64(c0.val, d0.val),
905 _mm_unpackhi_epi64(c0.val, d0.val)
906 ));
907}
908 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
909{ return v_dotprod_expand(a, b) + c; }
910
911 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
912{
913 v_int32x4 prod = v_dotprod(a, b);
914 v_int64x2 c, d;
915 v_expand(prod, c, d);
916 return v_int64x2(_mm_add_epi64(
917 _mm_unpacklo_epi64(c.val, d.val),
918 _mm_unpackhi_epi64(c.val, d.val)
919 ));
920}
921 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
922{ return v_dotprod_expand(a, b) + c; }
923
924 // 32 >> 64f
925 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
926{
927 #if CV_SSE4_1
928 return v_cvt_f64(v_dotprod(a, b));
929 #else
930 v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b);
931 v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b);
932
933 return v_float64x2(_mm_add_pd(
934 _mm_unpacklo_pd(c.val, d.val),
935 _mm_unpackhi_pd(c.val, d.val)
936 ));
937 #endif
938}
939 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
940{ return v_dotprod_expand(a, b) + c; }
941
943
944 // 16 >> 32
945 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
946{ return v_dotprod(a, b); }
947 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
948{ return v_dotprod(a, b) + c; }
949
950 // 32 >> 64
951 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
952{ return v_dotprod(a, b); }
953 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
954{ return v_dotprod_fast(a, b) + c; }
955
956 // 8 >> 32
957 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
958{
959 __m128i a0 = v_expand_low(a).val;
960 __m128i a1 = v_expand_high(a).val;
961 __m128i b0 = v_expand_low(b).val;
962 __m128i b1 = v_expand_high(b).val;
963 __m128i p0 = _mm_madd_epi16(a0, b0);
964 __m128i p1 = _mm_madd_epi16(a1, b1);
965 return v_uint32x4(_mm_add_epi32(p0, p1));
966}
967 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
968{ return v_dotprod_expand_fast(a, b) + c; }
969
970 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
971{
972 #if CV_SSE4_1
973 __m128i a0 = _mm_cvtepi8_epi16(a.val);
974 __m128i a1 = v_expand_high(a).val;
975 __m128i b0 = _mm_cvtepi8_epi16(b.val);
976 __m128i b1 = v_expand_high(b).val;
977 __m128i p0 = _mm_madd_epi16(a0, b0);
978 __m128i p1 = _mm_madd_epi16(a1, b1);
979 return v_int32x4(_mm_add_epi32(p0, p1));
980 #else
981 return v_dotprod_expand(a, b);
982 #endif
983}
984 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
985{ return v_dotprod_expand_fast(a, b) + c; }
986
987 // 16 >> 64
988 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
989{
990 v_uint32x4 c, d;
991 v_mul_expand(a, b, c, d);
992
993 v_uint64x2 c0, c1, d0, d1;
994 v_expand(c, c0, c1);
995 v_expand(d, d0, d1);
996
997 c0 += c1; d0 += d1;
998 return c0 + d0;
999}
1000 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1001{ return v_dotprod_expand_fast(a, b) + c; }
1002
1003 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
1004{
1005 v_int32x4 prod = v_dotprod(a, b);
1006 v_int64x2 c, d;
1007 v_expand(prod, c, d);
1008 return c + d;
1009}
1010 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1011{ return v_dotprod_expand_fast(a, b) + c; }
1012
1013 // 32 >> 64f
1014 v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c);
1015 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
1016{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
1017 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1018{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
1019
1020 #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
1021 OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
1022 OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
1023 OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
1024 inline _Tpvec operator ~ (const _Tpvec& a) \
1025 { \
1026 return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
1027 }
1028
1029OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
1030OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
1031OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
1032OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
1033OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
1034OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
1035OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
1036OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
1037OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
1038OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
1039
1040inline v_float32x4 v_sqrt(const v_float32x4& x)
1041{ return v_float32x4(_mm_sqrt_ps(x.val)); }
1042
1043 inline v_float32x4 v_invsqrt(const v_float32x4& x)
1044{
1045 const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
1046 __m128 t = x.val;
1047 __m128 h = _mm_mul_ps(t, _0_5);
1048 t = _mm_rsqrt_ps(t);
1049 t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
1050 return v_float32x4(t);
1051}
1052
1053 inline v_float64x2 v_sqrt(const v_float64x2& x)
1054{ return v_float64x2(_mm_sqrt_pd(x.val)); }
1055
1056 inline v_float64x2 v_invsqrt(const v_float64x2& x)
1057{
1058 const __m128d v_1 = _mm_set1_pd(1.);
1059 return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
1060}
1061
1062 #define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
1063 inline _Tpuvec v_abs(const _Tpsvec& x) \
1064 { return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }
1065
1066OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)
1067OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)
1068 inline v_uint32x4 v_abs(const v_int32x4& x)
1069{
1070 __m128i s = _mm_srli_epi32(x.val, 31);
1071 __m128i f = _mm_srai_epi32(x.val, 31);
1072 return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
1073}
1074 inline v_float32x4 v_abs(const v_float32x4& x)
1075{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
1076 inline v_float64x2 v_abs(const v_float64x2& x)
1077{
1078 return v_float64x2(_mm_and_pd(x.val,
1079 _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
1080}
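Usage note (illustrative, not part of intrin_sse.hpp): v_abs of a signed integer vector returns the corresponding unsigned type, and the floating-point versions simply clear the sign bit; a small sketch.

    #include <opencv2/core/hal/intrin.hpp>

    inline void abs_example()
    {
        cv::v_int32x4   s(-3, 4, -5, 6);
        cv::v_uint32x4  u = cv::v_abs(s);                        // (3, 4, 5, 6)
        cv::v_float32x4 f = cv::v_abs(cv::v_setall_f32(-2.5f));  // 2.5f in every lane
        (void)u; (void)f;
    }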
1081
1082 // TODO: exp, log, sin, cos
1083
1084 #define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
1085 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
1086 { \
1087 return _Tpvec(intrin(a.val, b.val)); \
1088 }
1089
1090OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
1091OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
1092OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
1093OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
1094OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
1095OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
1096OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
1097OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
1098
1099 inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
1100{
1101 #if CV_SSE4_1
1102 return v_int8x16(_mm_min_epi8(a.val, b.val));
1103 #else
1104 __m128i delta = _mm_set1_epi8((char)-128);
1105 return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
1106 _mm_xor_si128(b.val, delta))));
1107 #endif
1108}
1109 inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
1110{
1111 #if CV_SSE4_1
1112 return v_int8x16(_mm_max_epi8(a.val, b.val));
1113 #else
1114 __m128i delta = _mm_set1_epi8((char)-128);
1115 return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
1116 _mm_xor_si128(b.val, delta))));
1117 #endif
1118}
1119 inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
1120{
1121 #if CV_SSE4_1
1122 return v_uint16x8(_mm_min_epu16(a.val, b.val));
1123 #else
1124 return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
1125 #endif
1126}
1127 inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
1128{
1129 #if CV_SSE4_1
1130 return v_uint16x8(_mm_max_epu16(a.val, b.val));
1131 #else
1132 return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
1133 #endif
1134}
1135 inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
1136{
1137 #if CV_SSE4_1
1138 return v_uint32x4(_mm_min_epu32(a.val, b.val));
1139 #else
1140 __m128i delta = _mm_set1_epi32((int)0x80000000);
1141 __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
1142 return v_uint32x4(v_select_si128(mask, b.val, a.val));
1143 #endif
1144}
1145 inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
1146{
1147 #if CV_SSE4_1
1148 return v_uint32x4(_mm_max_epu32(a.val, b.val));
1149 #else
1150 __m128i delta = _mm_set1_epi32((int)0x80000000);
1151 __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
1152 return v_uint32x4(v_select_si128(mask, a.val, b.val));
1153 #endif
1154}
1155 inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
1156{
1157 #if CV_SSE4_1
1158 return v_int32x4(_mm_min_epi32(a.val, b.val));
1159 #else
1160 return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
1161 #endif
1162}
1163 inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
1164{
1165 #if CV_SSE4_1
1166 return v_int32x4(_mm_max_epi32(a.val, b.val));
1167 #else
1168 return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
1169 #endif
1170}
1171
1172 #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
1173 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
1174 { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
1175 inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
1176 { \
1177 __m128i not_mask = _mm_set1_epi32(-1); \
1178 return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
1179 } \
1180 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
1181 { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
1182 inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
1183 { \
1184 __m128i not_mask = _mm_set1_epi32(-1); \
1185 return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
1186 } \
1187 inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
1188 { \
1189 __m128i smask = _mm_set1_##suffix(sbit); \
1190 return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
1191 } \
1192 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
1193 { \
1194 __m128i smask = _mm_set1_##suffix(sbit); \
1195 return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
1196 } \
1197 inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
1198 { \
1199 __m128i smask = _mm_set1_##suffix(sbit); \
1200 __m128i not_mask = _mm_set1_epi32(-1); \
1201 __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
1202 return _Tpuvec(_mm_xor_si128(res, not_mask)); \
1203 } \
1204 inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
1205 { \
1206 __m128i smask = _mm_set1_##suffix(sbit); \
1207 __m128i not_mask = _mm_set1_epi32(-1); \
1208 __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
1209 return _Tpuvec(_mm_xor_si128(res, not_mask)); \
1210 } \
1211 inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
1212 { \
1213 return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
1214 } \
1215 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
1216 { \
1217 return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
1218 } \
1219 inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
1220 { \
1221 __m128i not_mask = _mm_set1_epi32(-1); \
1222 return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
1223 } \
1224 inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
1225 { \
1226 __m128i not_mask = _mm_set1_epi32(-1); \
1227 return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
1228 }
1229
1230OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
1231OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
1232OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
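Usage note (illustrative, not part of intrin_sse.hpp): the comparison operators return per-lane masks, 0 where the comparison is false and all bits set where it is true, which is the form the selection helpers expect; a minimal sketch.

    #include <opencv2/core/hal/intrin.hpp>

    inline void compare_example()
    {
        cv::v_int32x4 a(1, 5, 3, 7);
        cv::v_int32x4 b = cv::v_setall_s32(4);
        cv::v_int32x4 m = a > b;   // m = (0, -1, 0, -1): all bits set in the lanes that compare true
        (void)m;
    }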
1233
1234 #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
1235 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1236 { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
1237 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1238 { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
1239 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
1240 { return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
1241 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
1242 { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
1243 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
1244 { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
1245 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
1246 { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
1247
1248OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
1249OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
1250
1251 #if CV_SSE4_1
1252 #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
1253 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1254 { return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \
1255 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1256 { return ~(a == b); }
1257 #else
1258 #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
1259 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1260 { __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \
1261 return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \
1262 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1263 { return ~(a == b); }
1264 #endif
1265
1266OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2)
1267OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2)
1268
1269 inline v_float32x4 v_not_nan(const v_float32x4& a)
1270{ return v_float32x4(_mm_cmpord_ps(a.val, a.val)); }
1271 inline v_float64x2 v_not_nan(const v_float64x2& a)
1272{ return v_float64x2(_mm_cmpord_pd(a.val, a.val)); }
1273
1274OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
1275OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
1276OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
1277OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
1278OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
1279OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
1280OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
1281OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
1282OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_mul_wrap, _mm_mullo_epi16)
1283OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_mul_wrap, _mm_mullo_epi16)
1284
1285 inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
1286{
1287 __m128i ad = _mm_srai_epi16(a.val, 8);
1288 __m128i bd = _mm_srai_epi16(b.val, 8);
1289 __m128i p0 = _mm_mullo_epi16(a.val, b.val); // even
1290 __m128i p1 = _mm_slli_epi16(_mm_mullo_epi16(ad, bd), 8); // odd
1291 const __m128i b01 = _mm_set1_epi32(0xFF00FF00);
1292 return v_uint8x16(_v128_blendv_epi8(p0, p1, b01));
1293}
1294 inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
1295{
1296 return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
1297}
1298
1301 inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
1302{ return v_add_wrap(a - b, b - a); }
1303 inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
1304{ return v_add_wrap(a - b, b - a); }
1305 inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
1306{ return v_max(a, b) - v_min(a, b); }
1307
1308 inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
1309{
1310 v_int8x16 d = v_sub_wrap(a, b);
1311 v_int8x16 m = a < b;
1312 return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1313}
1314 inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
1315{
1316 return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
1317}
1318 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
1319{
1320 v_int32x4 d = a - b;
1321 v_int32x4 m = a < b;
1322 return v_reinterpret_as_u32((d ^ m) - m);
1323}
1324
1326 inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1327{
1328 v_int8x16 d = a - b;
1329 v_int8x16 m = a < b;
1330 return (d ^ m) - m;
1331 }
1332 inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1333{ return v_max(a, b) - v_min(a, b); }
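Usage note (illustrative, not part of intrin_sse.hpp): v_absdiff of signed inputs returns the exact difference in the unsigned type, while v_absdiffs keeps the signed type and saturates; a small sketch.

    #include <opencv2/core/hal/intrin.hpp>

    inline void absdiff_example()
    {
        cv::v_int16x8 a = cv::v_setall_s16(-30000);
        cv::v_int16x8 b = cv::v_setall_s16(30000);
        cv::v_uint16x8 d  = cv::v_absdiff(a, b);   // exact |a - b| = 60000 per lane
        cv::v_int16x8  ds = cv::v_absdiffs(a, b);  // saturated to 32767 per lane
        (void)d; (void)ds;
    }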
1334
1335
1336 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1337{
1338 return a * b + c;
1339}
1340
1341 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1342{
1343 return v_fma(a, b, c);
1344}
1345
1346 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1347{
1348 #if CV_FMA3
1349 return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));
1350 #else
1351 return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val));
1352 #endif
1353}
1354
1355 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1356{
1357 #if CV_FMA3
1358 return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));
1359 #else
1360 return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val));
1361 #endif
1362}
1363
1364 #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
1365 inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
1366 { \
1367 _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
1368 return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
1369 } \
1370 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1371 { \
1372 _Tpvec res = v_fma(a, a, b*b); \
1373 return _Tpvec(_mm_sqrt_##suffix(res.val)); \
1374 } \
1375 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1376 { \
1377 return v_fma(a, a, b*b); \
1378 } \
1379 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1380 { \
1381 return v_fma(a, b, c); \
1382 }
1383
1384OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
1385OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
1386
1387 #define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
1388 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
1389 { \
1390 return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
1391 } \
1392 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
1393 { \
1394 return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
1395 } \
1396 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
1397 { \
1398 return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
1399 } \
1400 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
1401 { \
1402 return _Tpsvec(srai(a.val, imm)); \
1403 } \
1404 template<int imm> \
1405 inline _Tpuvec v_shl(const _Tpuvec& a) \
1406 { \
1407 return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
1408 } \
1409 template<int imm> \
1410 inline _Tpsvec v_shl(const _Tpsvec& a) \
1411 { \
1412 return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
1413 } \
1414 template<int imm> \
1415 inline _Tpuvec v_shr(const _Tpuvec& a) \
1416 { \
1417 return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
1418 } \
1419 template<int imm> \
1420 inline _Tpsvec v_shr(const _Tpsvec& a) \
1421 { \
1422 return _Tpsvec(srai(a.val, imm)); \
1423 }
1424
1425OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
1426OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
1427OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
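Usage note (illustrative, not part of intrin_sse.hpp): unsigned lanes shift logically and signed lanes shift arithmetically; v_shl/v_shr are the compile-time immediate forms of the operators. A small sketch.

    #include <opencv2/core/hal/intrin.hpp>

    inline void shift_example()
    {
        cv::v_uint16x8 u = cv::v_setall_u16(0x0100);
        cv::v_int16x8  s = cv::v_setall_s16(-4);
        cv::v_uint16x8 a = u << 4;           // 0x1000 in every lane
        cv::v_uint16x8 b = u >> 4;           // logical shift: 0x0010
        cv::v_int16x8  c = s >> 1;           // arithmetic shift keeps the sign: -2
        cv::v_int16x8  d = cv::v_shr<1>(s);  // same result with a compile-time immediate
        (void)a; (void)b; (void)c; (void)d;
    }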
1428
1429 namespace hal_sse_internal
1430{
1431 template <int imm,
1432 bool is_invalid = ((imm < 0) || (imm > 16)),
1433 bool is_first = (imm == 0),
1434 bool is_half = (imm == 8),
1435 bool is_second = (imm == 16),
1436 bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
1437 class v_sse_palignr_u8_class;
1438
1439 template <int imm>
1440 class v_sse_palignr_u8_class<imm, true, false, false, false, false>;
1441
1442 template <int imm>
1443 class v_sse_palignr_u8_class<imm, false, true, false, false, false>
1444 {
1445 public:
1446 inline __m128i operator()(const __m128i& a, const __m128i&) const
1447 {
1448 return a;
1449 }
1450 };
1451
1452 template <int imm>
1453 class v_sse_palignr_u8_class<imm, false, false, true, false, false>
1454 {
1455 public:
1456 inline __m128i operator()(const __m128i& a, const __m128i& b) const
1457 {
1458 return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);
1459 }
1460 };
1461
1462 template <int imm>
1463 class v_sse_palignr_u8_class<imm, false, false, false, true, false>
1464 {
1465 public:
1466 inline __m128i operator()(const __m128i&, const __m128i& b) const
1467 {
1468 return b;
1469 }
1470 };
1471
1472 template <int imm>
1473 class v_sse_palignr_u8_class<imm, false, false, false, false, true>
1474 {
1475 #if CV_SSSE3
1476 public:
1477 inline __m128i operator()(const __m128i& a, const __m128i& b) const
1478 {
1479 return _mm_alignr_epi8(b, a, imm);
1480 }
1481 #else
1482 public:
1483 inline __m128i operator()(const __m128i& a, const __m128i& b) const
1484 {
1485 enum { imm2 = (sizeof(__m128i) - imm) };
1486 return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));
1487 }
1488 #endif
1489 };
1490
1491 template <int imm>
1492 inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
1493 {
1494 CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
1495 return v_sse_palignr_u8_class<imm>()(a, b);
1496 }
1497}
1498
1499 template<int imm, typename _Tpvec>
1500 inline _Tpvec v_rotate_right(const _Tpvec &a)
1501{
1502 using namespace hal_sse_internal;
1503 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1504 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1505 _mm_srli_si128(
1506 v_sse_reinterpret_as<__m128i>(a.val), imm2)));
1507}
1508
1509 template<int imm, typename _Tpvec>
1510 inline _Tpvec v_rotate_left(const _Tpvec &a)
1511{
1512 using namespace hal_sse_internal;
1513 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1514 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1515 _mm_slli_si128(
1516 v_sse_reinterpret_as<__m128i>(a.val), imm2)));
1517}
1518
1519 template<int imm, typename _Tpvec>
1520 inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
1521{
1522 using namespace hal_sse_internal;
1523 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1524 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1525 v_sse_palignr_u8<imm2>(
1526 v_sse_reinterpret_as<__m128i>(a.val),
1527 v_sse_reinterpret_as<__m128i>(b.val))));
1528}
1529
1530 template<int imm, typename _Tpvec>
1531 inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
1532{
1533 using namespace hal_sse_internal;
1534 enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1535 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1536 v_sse_palignr_u8<imm2>(
1537 v_sse_reinterpret_as<__m128i>(b.val),
1538 v_sse_reinterpret_as<__m128i>(a.val))));
1539}
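
// Illustrative usage sketch (hypothetical helper, same assumptions as above):
// whole-register lane rotation. v_rotate_right<n>(a) moves lanes toward index 0
// and fills with zeros; the two-argument form pulls the missing lanes from b.
static inline void example_v_rotate()
{
    cv::v_int32x4 a(0, 1, 2, 3);
    cv::v_int32x4 b(4, 5, 6, 7);
    cv::v_int32x4 r0 = cv::v_rotate_right<1>(a);     // {1, 2, 3, 0}
    cv::v_int32x4 r1 = cv::v_rotate_right<1>(a, b);  // {1, 2, 3, 4}
    (void)r0; (void)r1;
}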
1540
1541 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
1542 inline _Tpvec v_load(const _Tp* ptr) \
1543 { return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
1544 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1545 { return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
1546 inline _Tpvec v_load_low(const _Tp* ptr) \
1547 { return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
1548 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1549 { \
1550 return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
1551 _mm_loadl_epi64((const __m128i*)ptr1))); \
1552 } \
1553 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1554 { _mm_storeu_si128((__m128i*)ptr, a.val); } \
1555 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1556 { _mm_store_si128((__m128i*)ptr, a.val); } \
1557 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1558 { _mm_stream_si128((__m128i*)ptr, a.val); } \
1559 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
1560 { \
1561 if( mode == hal::STORE_UNALIGNED ) \
1562 _mm_storeu_si128((__m128i*)ptr, a.val); \
1563 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
1564 _mm_stream_si128((__m128i*)ptr, a.val); \
1565 else \
1566 _mm_store_si128((__m128i*)ptr, a.val); \
1567 } \
1568 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1569 { _mm_storel_epi64((__m128i*)ptr, a.val); } \
1570 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1571 { _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }
1572
1573OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
1574OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
1575OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
1576OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
1577OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
1578OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
1579OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
1580OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
1581
1582 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
1583 inline _Tpvec v_load(const _Tp* ptr) \
1584 { return _Tpvec(_mm_loadu_##suffix(ptr)); } \
1585 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1586 { return _Tpvec(_mm_load_##suffix(ptr)); } \
1587 inline _Tpvec v_load_low(const _Tp* ptr) \
1588 { return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
1589 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1590 { \
1591 return _Tpvec(_mm_castsi128_##suffix( \
1592 _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
1593 _mm_loadl_epi64((const __m128i*)ptr1)))); \
1594 } \
1595 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1596 { _mm_storeu_##suffix(ptr, a.val); } \
1597 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1598 { _mm_store_##suffix(ptr, a.val); } \
1599 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1600 { _mm_stream_##suffix(ptr, a.val); } \
1601 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
1602 { \
1603 if( mode == hal::STORE_UNALIGNED ) \
1604 _mm_storeu_##suffix(ptr, a.val); \
1605 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
1606 _mm_stream_##suffix(ptr, a.val); \
1607 else \
1608 _mm_store_##suffix(ptr, a.val); \
1609 } \
1610 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1611 { _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
1612 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1613 { \
1614 __m128i a1 = _mm_cast##suffix##_si128(a.val); \
1615 _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
1616 }
1617
1618OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
1619OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
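
// Illustrative usage sketch (hypothetical helper, same assumptions as above):
// the load/store entry points generated by the two macros above, including the
// explicit StoreMode overload.
static inline void example_v_load_store()
{
    float CV_DECL_ALIGNED(16) src[4] = { 1.f, 2.f, 3.f, 4.f };
    float CV_DECL_ALIGNED(16) dst[4];
    cv::v_float32x4 v = cv::v_load_aligned(src);    // aligned 128-bit load
    cv::v_store_aligned(dst, v);                    // aligned store
    cv::v_store(dst, v, cv::hal::STORE_UNALIGNED);  // store with an explicit mode
}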
1620
1621 inline unsigned v_reduce_sum(const v_uint8x16& a)
1622{
1623 __m128i half = _mm_sad_epu8(a.val, _mm_setzero_si128());
1624 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
1625}
1626 inline int v_reduce_sum(const v_int8x16& a)
1627{
1628 __m128i half = _mm_set1_epi8((schar)-128);
1629 half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_setzero_si128());
1630 return _mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))) - 2048;
1631}
1632 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
1633 inline schar v_reduce_##func(const v_int8x16& a) \
1634 { \
1635 __m128i val = a.val; \
1636 __m128i smask = _mm_set1_epi8((schar)-128); \
1637 val = _mm_xor_si128(val, smask); \
1638 val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
1639 val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
1640 val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
1641 val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
1642 return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
1643 } \
1644 inline uchar v_reduce_##func(const v_uint8x16& a) \
1645 { \
1646 __m128i val = a.val; \
1647 val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
1648 val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
1649 val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
1650 val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
1651 return (uchar)_mm_cvtsi128_si32(val); \
1652 }
1653OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max)
1654OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min)
1655
1656 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
1657 inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
1658 { \
1659 __m128i val = a.val; \
1660 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
1661 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
1662 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
1663 return (scalartype)_mm_cvtsi128_si32(val); \
1664 } \
1665 inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
1666 { \
1667 __m128i val = a.val; \
1668 __m128i smask = _mm_set1_epi16(sbit); \
1669 val = _mm_xor_si128(val, smask); \
1670 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
1671 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
1672 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
1673 return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
1674 }
1675OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
1676OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
1677
1678 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
1679 inline scalartype v_reduce_sum(const _Tpvec& a) \
1680 { \
1681 regtype val = a.val; \
1682 val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
1683 val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
1684 return (scalartype)_mm_cvt##extract(val); \
1685 }
1686
1687 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
1688 inline scalartype v_reduce_##func(const _Tpvec& a) \
1689 { \
1690 scalartype CV_DECL_ALIGNED(16) buf[4]; \
1691 v_store_aligned(buf, a); \
1692 scalartype s0 = scalar_func(buf[0], buf[1]); \
1693 scalartype s1 = scalar_func(buf[2], buf[3]); \
1694 return scalar_func(s0, s1); \
1695 }
1696
1697OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
1698OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
1699OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
1700
1701 inline int v_reduce_sum(const v_int16x8& a)
1702{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1703 inline unsigned v_reduce_sum(const v_uint16x8& a)
1704{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1705
1706 inline uint64 v_reduce_sum(const v_uint64x2& a)
1707{
1708 uint64 CV_DECL_ALIGNED(32) idx[2];
1709 v_store_aligned(idx, a);
1710 return idx[0] + idx[1];
1711}
1712inline int64 v_reduce_sum(const v_int64x2& a)
1713{
1714 int64 CV_DECL_ALIGNED(32) idx[2];
1715 v_store_aligned(idx, a);
1716 return idx[0] + idx[1];
1717}
1718inline double v_reduce_sum(const v_float64x2& a)
1719{
1720 double CV_DECL_ALIGNED(32) idx[2];
1721 v_store_aligned(idx, a);
1722 return idx[0] + idx[1];
1723}
1724
1725inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1726 const v_float32x4& c, const v_float32x4& d)
1727{
1728 #if CV_SSE3
1729 __m128 ab = _mm_hadd_ps(a.val, b.val);
1730 __m128 cd = _mm_hadd_ps(c.val, d.val);
1731 return v_float32x4(_mm_hadd_ps(ab, cd));
1732 #else
1733 __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
1734 __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
1735 return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
1736 #endif
1737}
1738
1739OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
1740OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
1741OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
1742OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
1743OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
1744OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
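
// Illustrative usage sketch (hypothetical helper, same assumptions as above):
// horizontal reductions over all lanes of one register.
static inline void example_v_reduce()
{
    cv::v_int32x4 a(1, -2, 3, -4);
    int s  = cv::v_reduce_sum(a);  // 1 + (-2) + 3 + (-4) == -2
    int mx = cv::v_reduce_max(a);  // 3
    int mn = cv::v_reduce_min(a);  // -4
    (void)s; (void)mx; (void)mn;
}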
1745
1746 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1747{
1748 __m128i half = _mm_sad_epu8(a.val, b.val);
1749 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
1750}
1751 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1752{
1753 __m128i half = _mm_set1_epi8(0x7f);
1754 half = _mm_sad_epu8(_mm_add_epi8(a.val, half), _mm_add_epi8(b.val, half));
1755 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
1756}
1757 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1758{
1759 v_uint32x4 l, h;
1760 v_expand(v_absdiff(a, b), l, h);
1761 return v_reduce_sum(l + h);
1762}
1763 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1764{
1765 v_uint32x4 l, h;
1766 v_expand(v_absdiff(a, b), l, h);
1767 return v_reduce_sum(l + h);
1768}
1769 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1770{
1771 return v_reduce_sum(v_absdiff(a, b));
1772}
1773 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1774{
1775 return v_reduce_sum(v_absdiff(a, b));
1776}
1777 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1778{
1779 return v_reduce_sum(v_absdiff(a, b));
1780}
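
// Illustrative usage sketch (hypothetical helper, same assumptions as above):
// sum of absolute differences across all lanes, which the 8-bit overloads above
// map onto _mm_sad_epu8.
static inline void example_v_reduce_sad()
{
    cv::v_uint8x16 a = cv::v_setall_u8(10);
    cv::v_uint8x16 b = cv::v_setall_u8(13);
    unsigned sad = cv::v_reduce_sad(a, b);  // 16 lanes * |10 - 13| == 48
    (void)sad;
}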
1781
1782 inline v_uint8x16 v_popcount(const v_uint8x16& a)
1783{
1784 __m128i m1 = _mm_set1_epi32(0x55555555);
1785 __m128i m2 = _mm_set1_epi32(0x33333333);
1786 __m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
1787 __m128i p = a.val;
1788 p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
1789 p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
1790 p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
1791 return v_uint8x16(p);
1792}
1793 inline v_uint16x8 v_popcount(const v_uint16x8& a)
1794{
1795 v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1796 p += v_rotate_right<1>(p);
1797 return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
1798}
1799 inline v_uint32x4 v_popcount(const v_uint32x4& a)
1800{
1801 v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1802 p += v_rotate_right<1>(p);
1803 p += v_rotate_right<2>(p);
1804 return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
1805}
1806 inline v_uint64x2 v_popcount(const v_uint64x2& a)
1807{
1808 return v_uint64x2(_mm_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm_setzero_si128()));
1809}
1810 inline v_uint8x16 v_popcount(const v_int8x16& a)
1811{ return v_popcount(v_reinterpret_as_u8(a)); }
1812 inline v_uint16x8 v_popcount(const v_int16x8& a)
1813{ return v_popcount(v_reinterpret_as_u16(a)); }
1814 inline v_uint32x4 v_popcount(const v_int32x4& a)
1815{ return v_popcount(v_reinterpret_as_u32(a)); }
1816 inline v_uint64x2 v_popcount(const v_int64x2& a)
1817{ return v_popcount(v_reinterpret_as_u64(a)); }
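
// Illustrative usage sketch (hypothetical helper, same assumptions as above):
// per-lane population count.
static inline void example_v_popcount()
{
    cv::v_uint32x4 a(0x0F, 0xFF, 0x01, 0x00);
    cv::v_uint32x4 c = cv::v_popcount(a);  // lanes: 4, 8, 1, 0
    (void)c;
}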
1818
1819 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, cast_op, allmask) \
1820 inline int v_signmask(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)); } \
1821 inline bool v_check_all(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) == allmask; } \
1822 inline bool v_check_any(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) != 0; }
1823OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, 65535)
1824OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, 65535)
1825OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, ps, _mm_castsi128_ps, 15)
1826OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, ps, _mm_castsi128_ps, 15)
1827OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint64x2, pd, _mm_castsi128_pd, 3)
1828OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int64x2, pd, _mm_castsi128_pd, 3)
1829OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, 15)
1830OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, 3)
1831
1832 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(_Tpvec) \
1833 inline int v_signmask(const _Tpvec& a) { return _mm_movemask_epi8(_mm_packs_epi16(a.val, a.val)) & 255; } \
1834 inline bool v_check_all(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) == 0xaaaa; } \
1835 inline bool v_check_any(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) != 0; }
1836OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_uint16x8)
1837OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_int16x8)
1838
1839 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1840 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1841 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1842 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1843 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1844 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1845 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1846 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1847 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1848 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
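
// Illustrative usage sketch (hypothetical helper, same assumptions as above):
// querying a comparison mask with v_signmask / v_check_* / v_scan_forward.
static inline void example_v_mask_queries()
{
    cv::v_int32x4 a(1, 5, 3, 7);
    cv::v_int32x4 m = a > cv::v_setall_s32(4);  // lane mask: 0, ~0, 0, ~0
    int  bits  = cv::v_signmask(m);             // 0b1010
    bool any   = cv::v_check_any(m);            // true
    bool all   = cv::v_check_all(m);            // false
    int  first = cv::v_scan_forward(m);         // index of first set lane: 1
    (void)bits; (void)any; (void)all; (void)first;
}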
1849
1850 #if CV_SSE4_1
1851 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
1852 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1853 { \
1854 return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); \
1855 }
1856
1857OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1858OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1859OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1860OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1861OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
1862OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
1863 // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, TBD, TBD, pd)
1864 // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, TBD, TBD, ps)
1865OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps)
1866OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd)
1867
1868 #else // CV_SSE4_1
1869
1870 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
1871 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1872 { \
1873 return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
1874 }
1875
1876OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
1877OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
1878OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
1879OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
1880OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
1881OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
1882 // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
1883 // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
1884OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
1885OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
1886 #endif
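
// Illustrative usage sketch (hypothetical helper, same assumptions as above):
// per-lane blend with v_select (SSE4.1 blendv when available, bitwise select
// otherwise). The mask is expected to be all-ones or all-zeros per lane, e.g.
// the result of a comparison.
static inline void example_v_select()
{
    cv::v_float32x4 a(1.f, 2.f, 3.f, 4.f);
    cv::v_float32x4 b(10.f, 20.f, 30.f, 40.f);
    cv::v_float32x4 mask = a > cv::v_setall_f32(2.5f);  // 0, 0, ~0, ~0
    cv::v_float32x4 r = cv::v_select(mask, a, b);       // {10, 20, 3, 4}
    (void)r;
}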
1887
1888 /* Expand */
1889 #define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
1890 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1891 { \
1892 b0.val = intrin(a.val); \
1893 b1.val = __CV_CAT(intrin, _high)(a.val); \
1894 } \
1895 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1896 { return _Tpwvec(intrin(a.val)); } \
1897 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1898 { return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \
1899 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1900 { \
1901 __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
1902 return _Tpwvec(intrin(a)); \
1903 }
1904
1905OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, _v128_cvtepu8_epi16)
1906OPENCV_HAL_IMPL_SSE_EXPAND(v_int8x16, v_int16x8, schar, _v128_cvtepi8_epi16)
1907OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, _v128_cvtepu16_epi32)
1908OPENCV_HAL_IMPL_SSE_EXPAND(v_int16x8, v_int32x4, short, _v128_cvtepi16_epi32)
1909OPENCV_HAL_IMPL_SSE_EXPAND(v_uint32x4, v_uint64x2, unsigned, _v128_cvtepu32_epi64)
1910OPENCV_HAL_IMPL_SSE_EXPAND(v_int32x4, v_int64x2, int, _v128_cvtepi32_epi64)
1911
1912 #define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin) \
1913 inline _Tpvec v_load_expand_q(const _Tp* ptr) \
1914 { \
1915 __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); \
1916 return _Tpvec(intrin(a)); \
1917 }
1918
1919OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_uint32x4, uchar, _v128_cvtepu8_epi32)
1920OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_int32x4, schar, _v128_cvtepi8_epi32)
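
// Illustrative usage sketch (hypothetical helper, same assumptions as above):
// widening conversions through the v_expand family.
static inline void example_v_expand()
{
    uchar buf[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
    cv::v_uint8x16 a = cv::v_load(buf);
    cv::v_uint16x8 lo, hi;
    cv::v_expand(a, lo, hi);                      // lo = 0..7, hi = 8..15 as 16-bit lanes
    cv::v_uint16x8 lo2 = cv::v_load_expand(buf);  // load 8 bytes, widen to 16-bit
    (void)lo; (void)hi; (void)lo2;
}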
1921
1922 #define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
1923 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
1924 { \
1925 b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
1926 b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
1927 } \
1928 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1929 { \
1930 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1931 return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
1932 } \
1933 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1934 { \
1935 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1936 return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
1937 } \
1938 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1939 { \
1940 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1941 c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
1942 d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
1943 }
1944
1945OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1946OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1947OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1948OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1949OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1950OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1951OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
1952OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
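
// Illustrative usage sketch (hypothetical helper, same assumptions as above):
// lane interleaving and half-register combination with v_zip / v_combine_*.
static inline void example_v_zip_combine()
{
    cv::v_uint32x4 a(0, 1, 2, 3);
    cv::v_uint32x4 b(4, 5, 6, 7);
    cv::v_uint32x4 lo, hi;
    cv::v_zip(a, b, lo, hi);                     // lo = {0,4,1,5}, hi = {2,6,3,7}
    cv::v_uint32x4 c = cv::v_combine_low(a, b);  // {0, 1, 4, 5}
    cv::v_uint32x4 d = cv::v_combine_high(a, b); // {2, 3, 6, 7}
    (void)lo; (void)hi; (void)c; (void)d;
}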
1953
1954 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
1955{
1956 #if CV_SSSE3
1957 static const __m128i perm = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1958 return v_uint8x16(_mm_shuffle_epi8(a.val, perm));
1959 #else
1960 uchar CV_DECL_ALIGNED(32) d[16];
1961 v_store_aligned(d, a);
1962 return v_uint8x16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8], d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]);
1963 #endif
1964}
1965
1966 inline v_int8x16 v_reverse(const v_int8x16 &a)
1967{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1968
1969 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
1970{
1971 #if CV_SSSE3
1972 static const __m128i perm = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
1973 return v_uint16x8(_mm_shuffle_epi8(a.val, perm));
1974 #else
1975 __m128i r = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3));
1976 r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
1977 r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
1978 return v_uint16x8(r);
1979 #endif
1980}
1981
1982 inline v_int16x8 v_reverse(const v_int16x8 &a)
1983{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1984
1985 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
1986{
1987 return v_uint32x4(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
1988}
1989
1990 inline v_int32x4 v_reverse(const v_int32x4 &a)
1991{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1992
1993 inline v_float32x4 v_reverse(const v_float32x4 &a)
1994{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1995
1996 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
1997{
1998 return v_uint64x2(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(1, 0, 3, 2)));
1999}
2000
2001 inline v_int64x2 v_reverse(const v_int64x2 &a)
2002{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
2003
2004 inline v_float64x2 v_reverse(const v_float64x2 &a)
2005{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
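
// Illustrative usage sketch (hypothetical helper, same assumptions as above):
// lane order reversal.
static inline void example_v_reverse()
{
    cv::v_uint32x4 a(0, 1, 2, 3);
    cv::v_uint32x4 r = cv::v_reverse(a);  // {3, 2, 1, 0}
    (void)r;
}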
2006
2007 template<int s, typename _Tpvec>
2008 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
2009{
2010 return v_rotate_right<s>(a, b);
2011}
2012
2013 inline v_int32x4 v_round(const v_float32x4& a)
2014{ return v_int32x4(_mm_cvtps_epi32(a.val)); }
2015
2016 inline v_int32x4 v_floor(const v_float32x4& a)
2017{
2018 __m128i a1 = _mm_cvtps_epi32(a.val);
2019 __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
2020 return v_int32x4(_mm_add_epi32(a1, mask));
2021}
2022
2023 inline v_int32x4 v_ceil(const v_float32x4& a)
2024{
2025 __m128i a1 = _mm_cvtps_epi32(a.val);
2026 __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
2027 return v_int32x4(_mm_sub_epi32(a1, mask));
2028}
2029
2030 inline v_int32x4 v_trunc(const v_float32x4& a)
2031{ return v_int32x4(_mm_cvttps_epi32(a.val)); }
2032
2033 inline v_int32x4 v_round(const v_float64x2& a)
2034{ return v_int32x4(_mm_cvtpd_epi32(a.val)); }
2035
2036 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2037{
2038 __m128i ai = _mm_cvtpd_epi32(a.val), bi = _mm_cvtpd_epi32(b.val);
2039 return v_int32x4(_mm_unpacklo_epi64(ai, bi));
2040}
2041
2042 inline v_int32x4 v_floor(const v_float64x2& a)
2043{
2044 __m128i a1 = _mm_cvtpd_epi32(a.val);
2045 __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
2046 mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
2047 return v_int32x4(_mm_add_epi32(a1, mask));
2048}
2049
2050 inline v_int32x4 v_ceil(const v_float64x2& a)
2051{
2052 __m128i a1 = _mm_cvtpd_epi32(a.val);
2053 __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
2054 mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
2055 return v_int32x4(_mm_sub_epi32(a1, mask));
2056}
2057
2058 inline v_int32x4 v_trunc(const v_float64x2& a)
2059{ return v_int32x4(_mm_cvttpd_epi32(a.val)); }
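
// Illustrative usage sketch (hypothetical helper, same assumptions as above):
// float-to-int conversions. v_round uses the current rounding mode (round to
// nearest even by default), v_floor/v_ceil round toward -inf/+inf, and v_trunc
// rounds toward zero.
static inline void example_v_rounding()
{
    cv::v_float32x4 a(1.3f, -1.3f, 2.5f, -2.5f);
    cv::v_int32x4 r = cv::v_round(a);  // {1, -1, 2, -2}
    cv::v_int32x4 f = cv::v_floor(a);  // {1, -2, 2, -3}
    cv::v_int32x4 c = cv::v_ceil(a);   // {2, -1, 3, -2}
    cv::v_int32x4 t = cv::v_trunc(a);  // {1, -1, 2, -2}
    (void)r; (void)f; (void)c; (void)t;
}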
2060
2061 #define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
2062 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
2063 const _Tpvec& a2, const _Tpvec& a3, \
2064 _Tpvec& b0, _Tpvec& b1, \
2065 _Tpvec& b2, _Tpvec& b3) \
2066 { \
2067 __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
2068 __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
2069 __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
2070 __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
2071 \
2072 b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
2073 b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
2074 b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
2075 b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
2076 }
2077
2078OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2079OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2080OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
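
// Illustrative usage sketch (hypothetical helper, same assumptions as above):
// 4x4 transpose of 32-bit lanes.
static inline void example_v_transpose4x4()
{
    cv::v_int32x4 r0(0, 1, 2, 3), r1(4, 5, 6, 7), r2(8, 9, 10, 11), r3(12, 13, 14, 15);
    cv::v_int32x4 c0, c1, c2, c3;
    cv::v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);  // c0 = {0,4,8,12}, c1 = {1,5,9,13}, ...
    (void)c0; (void)c1; (void)c2; (void)c3;
}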
2081
2082 // load deinterleave
2083 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
2084{
2085 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2086 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2087
2088 __m128i t10 = _mm_unpacklo_epi8(t00, t01);
2089 __m128i t11 = _mm_unpackhi_epi8(t00, t01);
2090
2091 __m128i t20 = _mm_unpacklo_epi8(t10, t11);
2092 __m128i t21 = _mm_unpackhi_epi8(t10, t11);
2093
2094 __m128i t30 = _mm_unpacklo_epi8(t20, t21);
2095 __m128i t31 = _mm_unpackhi_epi8(t20, t21);
2096
2097 a.val = _mm_unpacklo_epi8(t30, t31);
2098 b.val = _mm_unpackhi_epi8(t30, t31);
2099}
2100
2101 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
2102{
2103 #if CV_SSE4_1
2104 const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2105 const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2106 __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
2107 __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2108 __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
2109 __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
2110 __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
2111 __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
2112 const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
2113 const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
2114 const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
2115 a0 = _mm_shuffle_epi8(a0, sh_b);
2116 b0 = _mm_shuffle_epi8(b0, sh_g);
2117 c0 = _mm_shuffle_epi8(c0, sh_r);
2118 a.val = a0;
2119 b.val = b0;
2120 c.val = c0;
2121 #elif CV_SSSE3
2122 const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
2123 const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
2124 const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
2125
2126 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
2127 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2128 __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
2129
2130 __m128i s0 = _mm_shuffle_epi8(t0, m0);
2131 __m128i s1 = _mm_shuffle_epi8(t1, m1);
2132 __m128i s2 = _mm_shuffle_epi8(t2, m2);
2133
2134 t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5);
2135 a.val = _mm_alignr_epi8(s2, t0, 5);
2136
2137 t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
2138 b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);
2139
2140 t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
2141 c.val = _mm_alignr_epi8(t2, s0, 11);
2142 #else
2143 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2144 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2145 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));
2146
2147 __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
2148 __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
2149 __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));
2150
2151 __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
2152 __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
2153 __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));
2154
2155 __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
2156 __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
2157 __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));
2158
2159 a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
2160 b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
2161 c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
2162 #endif
2163}
2164
2165 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
2166{
2167 __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
2168 __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
2169 __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
2170 __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...
2171
2172 __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
2173 __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
2174 __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
2175 __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
2176
2177 u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
2178 u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
2179 u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
2180 u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
2181
2182 v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
2183 v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
2184 v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
2185 v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
2186
2187 a.val = _mm_unpacklo_epi8(v0, v1);
2188 b.val = _mm_unpackhi_epi8(v0, v1);
2189 c.val = _mm_unpacklo_epi8(v2, v3);
2190 d.val = _mm_unpackhi_epi8(v2, v3);
2191}
2192
2193 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
2194{
2195 __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
2196 __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
2197
2198 __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
2199 __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
2200 __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
2201 __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
2202
2203 a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
 2204 b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
2205}
2206
2207 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
2208{
2209 #if CV_SSE4_1
2210 __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
2211 __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));
2212 __m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16));
2213 __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24);
2214 __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
2215 __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
2216
2217 const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2218 const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
2219 const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2220 a0 = _mm_shuffle_epi8(a0, sh_a);
2221 b0 = _mm_shuffle_epi8(b0, sh_b);
2222 c0 = _mm_shuffle_epi8(c0, sh_c);
2223
2224 a.val = a0;
2225 b.val = b0;
2226 c.val = c0;
2227 #else
2228 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2229 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
2230 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2231
2232 __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
2233 __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
2234 __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));
2235
2236 __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
2237 __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
2238 __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));
2239
2240 a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
2241 b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
2242 c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
2243 #endif
2244}
2245
2246 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
2247{
2248 __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
2249 __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
2250 __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
2251 __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...
2252
2253 __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
2254 __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
2255 __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
2256 __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...
2257
2258 u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
2259 u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
2260 u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
2261 u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...
2262
2263 a.val = _mm_unpacklo_epi16(u0, u1);
2264 b.val = _mm_unpackhi_epi16(u0, u1);
2265 c.val = _mm_unpacklo_epi16(u2, u3);
2266 d.val = _mm_unpackhi_epi16(u2, u3);
2267}
2268
2269 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
2270{
2271 __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1
2272 __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4)); // a2 b2 a3 b3
2273
2274 __m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2
2275 __m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3
2276
2277 a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3
 2278 b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 b2 b3
2279}
2280
2281 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
2282{
2283 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2284 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
2285 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));
2286
2287 __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
2288 __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
2289 __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));
2290
2291 a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
2292 b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
2293 c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
2294}
2295
2296 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
2297{
2298 v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0
2299 v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
2300 v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
2301 v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
2302
2303 v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2304}
2305
2306 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
2307{
2308 __m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
2309 __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
2310
2311 a.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a1 a2 a3
 2312 b.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(3, 1, 3, 1)); // b0 b1 b2 b3
2313}
2314
2315 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
2316{
2317 __m128 t0 = _mm_loadu_ps(ptr + 0);
2318 __m128 t1 = _mm_loadu_ps(ptr + 4);
2319 __m128 t2 = _mm_loadu_ps(ptr + 8);
2320
2321 __m128 at12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 1, 0, 2));
2322 a.val = _mm_shuffle_ps(t0, at12, _MM_SHUFFLE(2, 0, 3, 0));
2323
2324 __m128 bt01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 0, 0, 1));
2325 __m128 bt12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 2, 0, 3));
2326 b.val = _mm_shuffle_ps(bt01, bt12, _MM_SHUFFLE(2, 0, 2, 0));
2327
2328 __m128 ct01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 1, 0, 2));
2329 c.val = _mm_shuffle_ps(ct01, t2, _MM_SHUFFLE(3, 0, 2, 0));
2330}
2331
2332 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
2333{
2334 __m128 t0 = _mm_loadu_ps(ptr + 0);
2335 __m128 t1 = _mm_loadu_ps(ptr + 4);
2336 __m128 t2 = _mm_loadu_ps(ptr + 8);
2337 __m128 t3 = _mm_loadu_ps(ptr + 12);
2338 __m128 t02lo = _mm_unpacklo_ps(t0, t2);
2339 __m128 t13lo = _mm_unpacklo_ps(t1, t3);
2340 __m128 t02hi = _mm_unpackhi_ps(t0, t2);
2341 __m128 t13hi = _mm_unpackhi_ps(t1, t3);
2342 a.val = _mm_unpacklo_ps(t02lo, t13lo);
2343 b.val = _mm_unpackhi_ps(t02lo, t13lo);
2344 c.val = _mm_unpacklo_ps(t02hi, t13hi);
2345 d.val = _mm_unpackhi_ps(t02hi, t13hi);
2346}
2347
2348 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
2349{
2350 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
2351 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
2352
2353 a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
2354 b = v_uint64x2(_mm_unpackhi_epi64(t0, t1));
2355}
2356
2357 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
2358{
2359 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0, b0
2360 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1
2361 __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1
2362
2363 t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0
2364
2365 a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
2366 b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
2367 c = v_uint64x2(_mm_unpackhi_epi64(t1, t2));
2368}
2369
2370 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
2371 v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
2372{
2373 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0
2374 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0
2375 __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1
2376 __m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1
2377
2378 a = v_uint64x2(_mm_unpacklo_epi64(t0, t2));
2379 b = v_uint64x2(_mm_unpackhi_epi64(t0, t2));
2380 c = v_uint64x2(_mm_unpacklo_epi64(t1, t3));
2381 d = v_uint64x2(_mm_unpackhi_epi64(t1, t3));
2382}
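
// Illustrative usage sketch (hypothetical helper, same assumptions as above):
// splitting packed BGR bytes into planes with the 3-channel v_load_deinterleave
// overload above. The pointer is assumed to reference at least 48 readable bytes.
static inline void example_v_load_deinterleave(const uchar* bgr)
{
    cv::v_uint8x16 b, g, r;
    cv::v_load_deinterleave(bgr, b, g, r);  // b = B0..B15, g = G0..G15, r = R0..R15
    (void)b; (void)g; (void)r;
}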
2383
2384 // store interleave
2385
2386 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2387 hal::StoreMode mode = hal::STORE_UNALIGNED)
2388{
2389 __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
2390 __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);
2391
2392 if( mode == hal::STORE_ALIGNED_NOCACHE )
2393 {
2394 _mm_stream_si128((__m128i*)(ptr), v0);
2395 _mm_stream_si128((__m128i*)(ptr + 16), v1);
2396 }
2397 else if( mode == hal::STORE_ALIGNED )
2398 {
2399 _mm_store_si128((__m128i*)(ptr), v0);
2400 _mm_store_si128((__m128i*)(ptr + 16), v1);
2401 }
2402 else
2403 {
2404 _mm_storeu_si128((__m128i*)(ptr), v0);
2405 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2406 }
2407}
2408
2409 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2410 const v_uint8x16& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2411{
2412 #if CV_SSE4_1
2413 const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2414 const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2415 const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2416 __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
2417 __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
2418 __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
2419
2420 const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2421 const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2422 __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
2423 __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
2424 __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
2425 #elif CV_SSSE3
2426 const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
2427 const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
2428 const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
2429
2430 __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
2431 t0 = _mm_alignr_epi8(c.val, t0, 5);
2432 __m128i v0 = _mm_shuffle_epi8(t0, m0);
2433
2434 __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
2435 t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
2436 __m128i v1 = _mm_shuffle_epi8(t1, m1);
2437
2438 __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
2439 t2 = _mm_alignr_epi8(t2, a.val, 11);
2440 __m128i v2 = _mm_shuffle_epi8(t2, m2);
2441 #else
2442 __m128i z = _mm_setzero_si128();
2443 __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
2444 __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
2445 __m128i c0 = _mm_unpacklo_epi8(c.val, z);
2446 __m128i c1 = _mm_unpackhi_epi8(c.val, z);
2447
2448 __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
2449 __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
2450 __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
2451 __m128i p03 = _mm_unpackhi_epi16(ab1, c1);
2452
2453 __m128i p10 = _mm_unpacklo_epi32(p00, p01);
2454 __m128i p11 = _mm_unpackhi_epi32(p00, p01);
2455 __m128i p12 = _mm_unpacklo_epi32(p02, p03);
2456 __m128i p13 = _mm_unpackhi_epi32(p02, p03);
2457
2458 __m128i p20 = _mm_unpacklo_epi64(p10, p11);
2459 __m128i p21 = _mm_unpackhi_epi64(p10, p11);
2460 __m128i p22 = _mm_unpacklo_epi64(p12, p13);
2461 __m128i p23 = _mm_unpackhi_epi64(p12, p13);
2462
2463 p20 = _mm_slli_si128(p20, 1);
2464 p22 = _mm_slli_si128(p22, 1);
2465
2466 __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
2467 __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
2468 __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
2469 __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);
2470
2471 __m128i p40 = _mm_unpacklo_epi64(p30, p31);
2472 __m128i p41 = _mm_unpackhi_epi64(p30, p31);
2473 __m128i p42 = _mm_unpacklo_epi64(p32, p33);
2474 __m128i p43 = _mm_unpackhi_epi64(p32, p33);
2475
2476 __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
2477 __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
2478 __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
2479 #endif
2480
2481 if( mode == hal::STORE_ALIGNED_NOCACHE )
2482 {
2483 _mm_stream_si128((__m128i*)(ptr), v0);
2484 _mm_stream_si128((__m128i*)(ptr + 16), v1);
2485 _mm_stream_si128((__m128i*)(ptr + 32), v2);
2486 }
2487 else if( mode == hal::STORE_ALIGNED )
2488 {
2489 _mm_store_si128((__m128i*)(ptr), v0);
2490 _mm_store_si128((__m128i*)(ptr + 16), v1);
2491 _mm_store_si128((__m128i*)(ptr + 32), v2);
2492 }
2493 else
2494 {
2495 _mm_storeu_si128((__m128i*)(ptr), v0);
2496 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2497 _mm_storeu_si128((__m128i*)(ptr + 32), v2);
2498 }
2499}
2500
2501 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2502 const v_uint8x16& c, const v_uint8x16& d,
2503 hal::StoreMode mode = hal::STORE_UNALIGNED)
2504{
2505 // a0 a1 a2 a3 ....
2506 // b0 b1 b2 b3 ....
2507 // c0 c1 c2 c3 ....
2508 // d0 d1 d2 d3 ....
2509 __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
2510 __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
2511 __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
2512 __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
2513
2514 __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
2515 __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
2516 __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
2517 __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...
2518
2519 if( mode == hal::STORE_ALIGNED_NOCACHE )
2520 {
2521 _mm_stream_si128((__m128i*)(ptr), v0);
2522 _mm_stream_si128((__m128i*)(ptr + 16), v1);
2523 _mm_stream_si128((__m128i*)(ptr + 32), v2);
2524 _mm_stream_si128((__m128i*)(ptr + 48), v3);
2525 }
2526 else if( mode == hal::STORE_ALIGNED )
2527 {
2528 _mm_store_si128((__m128i*)(ptr), v0);
2529 _mm_store_si128((__m128i*)(ptr + 16), v1);
2530 _mm_store_si128((__m128i*)(ptr + 32), v2);
2531 _mm_store_si128((__m128i*)(ptr + 48), v3);
2532 }
2533 else
2534 {
2535 _mm_storeu_si128((__m128i*)(ptr), v0);
2536 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2537 _mm_storeu_si128((__m128i*)(ptr + 32), v2);
2538 _mm_storeu_si128((__m128i*)(ptr + 48), v3);
2539 }
2540}
2541
2542 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2543 hal::StoreMode mode = hal::STORE_UNALIGNED)
2544{
2545 __m128i v0 = _mm_unpacklo_epi16(a.val, b.val);
2546 __m128i v1 = _mm_unpackhi_epi16(a.val, b.val);
2547
2548 if( mode == hal::STORE_ALIGNED_NOCACHE )
2549 {
2550 _mm_stream_si128((__m128i*)(ptr), v0);
2551 _mm_stream_si128((__m128i*)(ptr + 8), v1);
2552 }
2553 else if( mode == hal::STORE_ALIGNED )
2554 {
2555 _mm_store_si128((__m128i*)(ptr), v0);
2556 _mm_store_si128((__m128i*)(ptr + 8), v1);
2557 }
2558 else
2559 {
2560 _mm_storeu_si128((__m128i*)(ptr), v0);
2561 _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2562 }
2563}
2564
2565 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
2566 const v_uint16x8& b, const v_uint16x8& c,
2567 hal::StoreMode mode = hal::STORE_UNALIGNED)
2568{
2569 #if CV_SSE4_1
2570 const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2571 const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2572 const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2573 __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
2574 __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
2575 __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
2576
2577 __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
2578 __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
2579 __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
2580 #else
2581 __m128i z = _mm_setzero_si128();
2582 __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
2583 __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
2584 __m128i c0 = _mm_unpacklo_epi16(c.val, z);
2585 __m128i c1 = _mm_unpackhi_epi16(c.val, z);
2586
2587 __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
2588 __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
2589 __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
2590 __m128i p13 = _mm_unpackhi_epi32(ab1, c1);
2591
2592 __m128i p20 = _mm_unpacklo_epi64(p10, p11);
2593 __m128i p21 = _mm_unpackhi_epi64(p10, p11);
2594 __m128i p22 = _mm_unpacklo_epi64(p12, p13);
2595 __m128i p23 = _mm_unpackhi_epi64(p12, p13);
2596
2597 p20 = _mm_slli_si128(p20, 2);
2598 p22 = _mm_slli_si128(p22, 2);
2599
2600 __m128i p30 = _mm_unpacklo_epi64(p20, p21);
2601 __m128i p31 = _mm_unpackhi_epi64(p20, p21);
2602 __m128i p32 = _mm_unpacklo_epi64(p22, p23);
2603 __m128i p33 = _mm_unpackhi_epi64(p22, p23);
2604
2605 __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
2606 __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
2607 __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
2608 #endif
2609 if( mode == hal::STORE_ALIGNED_NOCACHE )
2610 {
2611 _mm_stream_si128((__m128i*)(ptr), v0);
2612 _mm_stream_si128((__m128i*)(ptr + 8), v1);
2613 _mm_stream_si128((__m128i*)(ptr + 16), v2);
2614 }
2615 else if( mode == hal::STORE_ALIGNED )
2616 {
2617 _mm_store_si128((__m128i*)(ptr), v0);
2618 _mm_store_si128((__m128i*)(ptr + 8), v1);
2619 _mm_store_si128((__m128i*)(ptr + 16), v2);
2620 }
2621 else
2622 {
2623 _mm_storeu_si128((__m128i*)(ptr), v0);
2624 _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2625 _mm_storeu_si128((__m128i*)(ptr + 16), v2);
2626 }
2627}
2628
2629 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2630 const v_uint16x8& c, const v_uint16x8& d,
2631 hal::StoreMode mode = hal::STORE_UNALIGNED)
2632{
2633 // a0 a1 a2 a3 ....
2634 // b0 b1 b2 b3 ....
2635 // c0 c1 c2 c3 ....
2636 // d0 d1 d2 d3 ....
2637 __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
2638 __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
2639 __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
2640 __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
2641
2642 __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
2643 __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
2644 __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
2645 __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...
2646
2647 if( mode == hal::STORE_ALIGNED_NOCACHE )
2648 {
2649 _mm_stream_si128((__m128i*)(ptr), v0);
2650 _mm_stream_si128((__m128i*)(ptr + 8), v1);
2651 _mm_stream_si128((__m128i*)(ptr + 16), v2);
2652 _mm_stream_si128((__m128i*)(ptr + 24), v3);
2653 }
2654 else if( mode == hal::STORE_ALIGNED )
2655 {
2656 _mm_store_si128((__m128i*)(ptr), v0);
2657 _mm_store_si128((__m128i*)(ptr + 8), v1);
2658 _mm_store_si128((__m128i*)(ptr + 16), v2);
2659 _mm_store_si128((__m128i*)(ptr + 24), v3);
2660 }
2661 else
2662 {
2663 _mm_storeu_si128((__m128i*)(ptr), v0);
2664 _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2665 _mm_storeu_si128((__m128i*)(ptr + 16), v2);
2666 _mm_storeu_si128((__m128i*)(ptr + 24), v3);
2667 }
2668}
2669
2670 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2671 hal::StoreMode mode = hal::STORE_UNALIGNED)
2672{
2673 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
2674 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val);
2675
2676 if( mode == hal::STORE_ALIGNED_NOCACHE )
2677 {
2678 _mm_stream_si128((__m128i*)(ptr), v0);
2679 _mm_stream_si128((__m128i*)(ptr + 4), v1);
2680 }
2681 else if( mode == hal::STORE_ALIGNED )
2682 {
2683 _mm_store_si128((__m128i*)(ptr), v0);
2684 _mm_store_si128((__m128i*)(ptr + 4), v1);
2685 }
2686 else
2687 {
2688 _mm_storeu_si128((__m128i*)(ptr), v0);
2689 _mm_storeu_si128((__m128i*)(ptr + 4), v1);
2690 }
2691}
2692
2693 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2694 const v_uint32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2695{
2696 v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
2697 v_transpose4x4(a, b, c, z, u0, u1, u2, u3);
2698
2699 __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
2700 __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
2701 __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));
2702
2703 if( mode == hal::STORE_ALIGNED_NOCACHE )
2704 {
2705 _mm_stream_si128((__m128i*)(ptr), v0);
2706 _mm_stream_si128((__m128i*)(ptr + 4), v1);
2707 _mm_stream_si128((__m128i*)(ptr + 8), v2);
2708 }
2709 else if( mode == hal::STORE_ALIGNED )
2710 {
2711 _mm_store_si128((__m128i*)(ptr), v0);
2712 _mm_store_si128((__m128i*)(ptr + 4), v1);
2713 _mm_store_si128((__m128i*)(ptr + 8), v2);
2714 }
2715 else
2716 {
2717 _mm_storeu_si128((__m128i*)(ptr), v0);
2718 _mm_storeu_si128((__m128i*)(ptr + 4), v1);
2719 _mm_storeu_si128((__m128i*)(ptr + 8), v2);
2720 }
2721}
2722
2723 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2724 const v_uint32x4& c, const v_uint32x4& d,
2725 hal::StoreMode mode = hal::STORE_UNALIGNED)
2726{
2727 v_uint32x4 v0, v1, v2, v3;
2728 v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2729
2730 if( mode == hal::STORE_ALIGNED_NOCACHE )
2731 {
2732 _mm_stream_si128((__m128i*)(ptr), v0.val);
2733 _mm_stream_si128((__m128i*)(ptr + 4), v1.val);
2734 _mm_stream_si128((__m128i*)(ptr + 8), v2.val);
2735 _mm_stream_si128((__m128i*)(ptr + 12), v3.val);
2736 }
2737 else if( mode == hal::STORE_ALIGNED )
2738 {
2739 _mm_store_si128((__m128i*)(ptr), v0.val);
2740 _mm_store_si128((__m128i*)(ptr + 4), v1.val);
2741 _mm_store_si128((__m128i*)(ptr + 8), v2.val);
2742 _mm_store_si128((__m128i*)(ptr + 12), v3.val);
2743 }
2744 else
2745 {
2746 _mm_storeu_si128((__m128i*)(ptr), v0.val);
2747 _mm_storeu_si128((__m128i*)(ptr + 4), v1.val);
2748 _mm_storeu_si128((__m128i*)(ptr + 8), v2.val);
2749 _mm_storeu_si128((__m128i*)(ptr + 12), v3.val);
2750 }
2751}
2752
2753 // 2-channel, float only
2754 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2755 hal::StoreMode mode = hal::STORE_UNALIGNED)
2756{
2757 __m128 v0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
2758 __m128 v1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
2759
2760 if( mode == hal::STORE_ALIGNED_NOCACHE )
2761 {
2762 _mm_stream_ps(ptr, v0);
2763 _mm_stream_ps(ptr + 4, v1);
2764 }
2765 else if( mode == hal::STORE_ALIGNED )
2766 {
2767 _mm_store_ps(ptr, v0);
2768 _mm_store_ps(ptr + 4, v1);
2769 }
2770 else
2771 {
2772 _mm_storeu_ps(ptr, v0);
2773 _mm_storeu_ps(ptr + 4, v1);
2774 }
2775}
2776
2777 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2778 const v_float32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2779{
2780 __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
2781 __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
2782 __m128 v0 = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0));
2783 __m128 u2 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1));
2784 __m128 u3 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2));
2785 __m128 v1 = _mm_shuffle_ps(u2, u3, _MM_SHUFFLE(2, 0, 2, 0));
2786 __m128 u4 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2));
2787 __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
2788 __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));
2789
2790 if( mode == hal::STORE_ALIGNED_NOCACHE )
2791 {
2792 _mm_stream_ps(ptr, v0);
2793 _mm_stream_ps(ptr + 4, v1);
2794 _mm_stream_ps(ptr + 8, v2);
2795 }
2796 else if( mode == hal::STORE_ALIGNED )
2797 {
2798 _mm_store_ps(ptr, v0);
2799 _mm_store_ps(ptr + 4, v1);
2800 _mm_store_ps(ptr + 8, v2);
2801 }
2802 else
2803 {
2804 _mm_storeu_ps(ptr, v0);
2805 _mm_storeu_ps(ptr + 4, v1);
2806 _mm_storeu_ps(ptr + 8, v2);
2807 }
2808}
2809
2810 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2811 const v_float32x4& c, const v_float32x4& d,
2812 hal::StoreMode mode = hal::STORE_UNALIGNED)
2813{
2814 __m128 u0 = _mm_unpacklo_ps(a.val, c.val);
2815 __m128 u1 = _mm_unpacklo_ps(b.val, d.val);
2816 __m128 u2 = _mm_unpackhi_ps(a.val, c.val);
2817 __m128 u3 = _mm_unpackhi_ps(b.val, d.val);
2818 __m128 v0 = _mm_unpacklo_ps(u0, u1);
2819 __m128 v2 = _mm_unpacklo_ps(u2, u3);
2820 __m128 v1 = _mm_unpackhi_ps(u0, u1);
2821 __m128 v3 = _mm_unpackhi_ps(u2, u3);
2822
2823 if( mode == hal::STORE_ALIGNED_NOCACHE )
2824 {
2825 _mm_stream_ps(ptr, v0);
2826 _mm_stream_ps(ptr + 4, v1);
2827 _mm_stream_ps(ptr + 8, v2);
2828 _mm_stream_ps(ptr + 12, v3);
2829 }
2830 else if( mode == hal::STORE_ALIGNED )
2831 {
2832 _mm_store_ps(ptr, v0);
2833 _mm_store_ps(ptr + 4, v1);
2834 _mm_store_ps(ptr + 8, v2);
2835 _mm_store_ps(ptr + 12, v3);
2836 }
2837 else
2838 {
2839 _mm_storeu_ps(ptr, v0);
2840 _mm_storeu_ps(ptr + 4, v1);
2841 _mm_storeu_ps(ptr + 8, v2);
2842 _mm_storeu_ps(ptr + 12, v3);
2843 }
2844}
2845
2846 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2847 hal::StoreMode mode = hal::STORE_UNALIGNED)
2848{
2849 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2850 __m128i v1 = _mm_unpackhi_epi64(a.val, b.val);
2851
2852 if( mode == hal::STORE_ALIGNED_NOCACHE )
2853 {
2854 _mm_stream_si128((__m128i*)(ptr), v0);
2855 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2856 }
2857 else if( mode == hal::STORE_ALIGNED )
2858 {
2859 _mm_store_si128((__m128i*)(ptr), v0);
2860 _mm_store_si128((__m128i*)(ptr + 2), v1);
2861 }
2862 else
2863 {
2864 _mm_storeu_si128((__m128i*)(ptr), v0);
2865 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2866 }
2867}
2868
2869 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2870 const v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2871{
2872 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2873 __m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
2874 __m128i v2 = _mm_unpackhi_epi64(b.val, c.val);
2875
2876 if( mode == hal::STORE_ALIGNED_NOCACHE )
2877 {
2878 _mm_stream_si128((__m128i*)(ptr), v0);
2879 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2880 _mm_stream_si128((__m128i*)(ptr + 4), v2);
2881 }
2882 else if( mode == hal::STORE_ALIGNED )
2883 {
2884 _mm_store_si128((__m128i*)(ptr), v0);
2885 _mm_store_si128((__m128i*)(ptr + 2), v1);
2886 _mm_store_si128((__m128i*)(ptr + 4), v2);
2887 }
2888 else
2889 {
2890 _mm_storeu_si128((__m128i*)(ptr), v0);
2891 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2892 _mm_storeu_si128((__m128i*)(ptr + 4), v2);
2893 }
2894}
2895
2896 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2897 const v_uint64x2& c, const v_uint64x2& d,
2898 hal::StoreMode mode = hal::STORE_UNALIGNED)
2899{
2900 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2901 __m128i v1 = _mm_unpacklo_epi64(c.val, d.val);
2902 __m128i v2 = _mm_unpackhi_epi64(a.val, b.val);
2903 __m128i v3 = _mm_unpackhi_epi64(c.val, d.val);
2904
2905 if( mode == hal::STORE_ALIGNED_NOCACHE )
2906 {
2907 _mm_stream_si128((__m128i*)(ptr), v0);
2908 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2909 _mm_stream_si128((__m128i*)(ptr + 4), v2);
2910 _mm_stream_si128((__m128i*)(ptr + 6), v3);
2911 }
2912 else if( mode == hal::STORE_ALIGNED )
2913 {
2914 _mm_store_si128((__m128i*)(ptr), v0);
2915 _mm_store_si128((__m128i*)(ptr + 2), v1);
2916 _mm_store_si128((__m128i*)(ptr + 4), v2);
2917 _mm_store_si128((__m128i*)(ptr + 6), v3);
2918 }
2919 else
2920 {
2921 _mm_storeu_si128((__m128i*)(ptr), v0);
2922 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2923 _mm_storeu_si128((__m128i*)(ptr + 4), v2);
2924 _mm_storeu_si128((__m128i*)(ptr + 6), v3);
2925 }
2926}
2927
2928 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2929 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2930 { \
2931 _Tpvec1 a1, b1; \
2932 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2933 a0 = v_reinterpret_as_##suffix0(a1); \
2934 b0 = v_reinterpret_as_##suffix0(b1); \
2935 } \
2936 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2937 { \
2938 _Tpvec1 a1, b1, c1; \
2939 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2940 a0 = v_reinterpret_as_##suffix0(a1); \
2941 b0 = v_reinterpret_as_##suffix0(b1); \
2942 c0 = v_reinterpret_as_##suffix0(c1); \
2943 } \
2944 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2945 { \
2946 _Tpvec1 a1, b1, c1, d1; \
2947 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2948 a0 = v_reinterpret_as_##suffix0(a1); \
2949 b0 = v_reinterpret_as_##suffix0(b1); \
2950 c0 = v_reinterpret_as_##suffix0(c1); \
2951 d0 = v_reinterpret_as_##suffix0(d1); \
2952 } \
2953 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2954 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2955 { \
2956 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2957 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2958 v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
2959 } \
2960 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2961 const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2962 { \
2963 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2964 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2965 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2966 v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
2967 } \
2968 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2969 const _Tpvec0& c0, const _Tpvec0& d0, \
2970 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2971 { \
2972 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2973 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2974 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2975 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2976 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
2977 }
2978
2979OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
2980OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
2981OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
2982OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
2983OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
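A minimal usage sketch (not part of the header) for the interleaved store helpers above, assuming a caller-defined planar_to_packed_rgb, a length that is a multiple of 4, and the v_load / v_store_interleave declarations from this header family:

inline void planar_to_packed_rgb(const float* r, const float* g, const float* b,
                                 float* rgb, int len)
{
    // Process 4 pixels per iteration; each 3-channel v_store_interleave
    // writes 12 floats laid out as r0 g0 b0 r1 g1 b1 ... r3 g3 b3.
    for (int i = 0; i < len; i += 4)
    {
        v_float32x4 vr = v_load(r + i);
        v_float32x4 vg = v_load(g + i);
        v_float32x4 vb = v_load(b + i);
        v_store_interleave(rgb + i * 3, vr, vg, vb);
    }
}

Passing hal::STORE_ALIGNED_NOCACHE as the last argument would select the streaming-store (_mm_stream_ps) path shown above, which can help when the output is large and will not be read back soon.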
2984
2985 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2986{
2987 return v_float32x4(_mm_cvtepi32_ps(a.val));
2988}
2989
2990 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2991{
2992 return v_float32x4(_mm_cvtpd_ps(a.val));
2993}
2994
2995 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2996{
2997 return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val)));
2998}
2999
3000 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
3001{
3002 return v_float64x2(_mm_cvtepi32_pd(a.val));
3003}
3004
3005 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
3006{
3007 return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val,8)));
3008}
3009
3010 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
3011{
3012 return v_float64x2(_mm_cvtps_pd(a.val));
3013}
3014
3015 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
3016{
3017 return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
3018}
3019
3020 // from (Mysticial and wim) https://stackoverflow.com/q/41144668
3021 inline v_float64x2 v_cvt_f64(const v_int64x2& v)
3022{
3023 // constants encoded as floating-point
3024 __m128i magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
3025 __m128i magic_i_all = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
3026 __m128d magic_d_all = _mm_castsi128_pd(magic_i_all);
 3027 // Blend the 32 least significant bits of v with magic_i_lo
3028 #if CV_SSE4_1
3029 __m128i magic_i_lo = _mm_set1_epi64x(0x4330000000000000); // 2^52
3030 __m128i v_lo = _mm_blend_epi16(v.val, magic_i_lo, 0xcc);
3031 #else
3032 __m128i magic_i_lo = _mm_set1_epi32(0x43300000); // 2^52
3033 __m128i v_lo = _mm_unpacklo_epi32(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(0, 0, 2, 0)), magic_i_lo);
3034 #endif
3035 // Extract the 32 most significant bits of v
3036 __m128i v_hi = _mm_srli_epi64(v.val, 32);
3037 // Flip the msb of v_hi and blend with 0x45300000
3038 v_hi = _mm_xor_si128(v_hi, magic_i_hi32);
3039 // Compute in double precision
3040 __m128d v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all);
 3041 // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
3042 __m128d result = _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo));
3043 return v_float64x2(result);
3044}
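The magic constants above are easier to follow on a single lane. A hypothetical scalar sketch of the same trick (the helper name and the std::memcpy bit-casts are ours, not OpenCV's):

#include <cstdint>
#include <cstring>

static double int64_to_double_magic(int64_t v)
{
    // Low 32 bits placed in the mantissa under exponent 2^52:
    // bit pattern 0x43300000'xxxxxxxx encodes 2^52 + (uint32)v.
    uint64_t lo_bits = 0x4330000000000000ull | (uint64_t)(uint32_t)v;

    // High 32 bits with the msb flipped, placed under exponent 2^84:
    // 0x45300000'(hi ^ 0x80000000) encodes 2^84 + 2^32 * (hi ^ 0x80000000).
    uint64_t hi_bits = 0x4530000080000000ull ^ ((uint64_t)v >> 32);

    double lo, hi, magic;
    std::memcpy(&lo, &lo_bits, sizeof lo);
    std::memcpy(&hi, &hi_bits, sizeof hi);
    uint64_t magic_bits = 0x4530000080100000ull;   // 2^84 + 2^63 + 2^52
    std::memcpy(&magic, &magic_bits, sizeof magic);

    // Subtract the combined offsets first, as the vector code does;
    // floating-point addition is not associative, so the order matters.
    return (hi - magic) + lo;
}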
3045
3047
3048 inline v_int8x16 v_lut(const schar* tab, const int* idx)
3049{
3050 #if defined(_MSC_VER)
3051 return v_int8x16(_mm_setr_epi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
3052 tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
3053 #else
3054 return v_int8x16(_mm_setr_epi64(
3055 _mm_setr_pi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]]),
3056 _mm_setr_pi8(tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]])
3057 ));
3058 #endif
3059}
3060 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
3061{
3062 #if defined(_MSC_VER)
3063 return v_int8x16(_mm_setr_epi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]),
3064 *(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
3065 #else
3066 return v_int8x16(_mm_setr_epi64(
3067 _mm_setr_pi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3])),
3068 _mm_setr_pi16(*(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7]))
3069 ));
3070 #endif
3071}
3072 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
3073{
3074 #if defined(_MSC_VER)
3075 return v_int8x16(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
3076 *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
3077 #else
3078 return v_int8x16(_mm_setr_epi64(
3079 _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
3080 _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
3081 ));
3082 #endif
3083}
3084 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); }
3085 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
3086 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }
3087
3088 inline v_int16x8 v_lut(const short* tab, const int* idx)
3089{
3090 #if defined(_MSC_VER)
3091 return v_int16x8(_mm_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
3092 tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
3093 #else
3094 return v_int16x8(_mm_setr_epi64(
3095 _mm_setr_pi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]),
3096 _mm_setr_pi16(tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])
3097 ));
3098 #endif
3099}
3100 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
3101{
3102 #if defined(_MSC_VER)
3103 return v_int16x8(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
3104 *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
3105 #else
3106 return v_int16x8(_mm_setr_epi64(
3107 _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
3108 _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
3109 ));
3110 #endif
3111}
3112 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
3113{
3114 return v_int16x8(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
3115}
3116 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
3117 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
3118 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
3119
3120 inline v_int32x4 v_lut(const int* tab, const int* idx)
3121{
3122 #if defined(_MSC_VER)
3123 return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]],
3124 tab[idx[2]], tab[idx[3]]));
3125 #else
3126 return v_int32x4(_mm_setr_epi64(
3127 _mm_setr_pi32(tab[idx[0]], tab[idx[1]]),
3128 _mm_setr_pi32(tab[idx[2]], tab[idx[3]])
3129 ));
3130 #endif
3131}
3132 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
3133{
3134 return v_int32x4(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
3135}
3136 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
3137{
3138 return v_int32x4(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
3139}
3140 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
3141 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
3142 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
3143
3144 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
3145{
3146 return v_int64x2(_mm_set_epi64x(tab[idx[1]], tab[idx[0]]));
3147}
3148 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
3149{
3150 return v_int64x2(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
3151}
3152 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
3153 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
3154
3155 inline v_float32x4 v_lut(const float* tab, const int* idx)
3156{
3157 return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
3158}
3159 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); }
3160 inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }
3161
3162 inline v_float64x2 v_lut(const double* tab, const int* idx)
3163{
3164 return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
3165}
3166 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_loadu_si128((const __m128i*)(tab + idx[0])))); }
3167
3168 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
3169{
3170 int CV_DECL_ALIGNED(32) idx[4];
3171 v_store_aligned(idx, idxvec);
3172 return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
3173}
3174
3175inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
3176{
3177 return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
3178}
3179
3180 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
3181{
3182 int CV_DECL_ALIGNED(32) idx[4];
3183 v_store_aligned(idx, idxvec);
3184 return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
3185}
3186
3187inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
3188{
3189 int idx[2];
3190 v_store_low(idx, idxvec);
3191 return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
3192}
3193
 3194 // loads pairs from the table and deinterleaves them, e.g. returns:
 3195 // x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
 3196 // y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
 3197 // note that the indices are indices of individual floats, not of float pairs.
 3198 // in theory, this function can be used to implement bilinear interpolation,
 3199 // when idxvec contains the offsets within the image.
3200 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
3201{
3202 int CV_DECL_ALIGNED(32) idx[4];
3203 v_store_aligned(idx, idxvec);
3204 __m128 z = _mm_setzero_ps();
3205 __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0]));
3206 __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2]));
3207 xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1]));
3208 xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3]));
3209 __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23);
3210 __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23);
3211 x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13));
3212 y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13));
3213}
3214
3215inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
3216{
3217 int idx[2];
3218 v_store_low(idx, idxvec);
3219 __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
3220 __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
3221 x = v_float64x2(_mm_unpacklo_pd(xy0, xy1));
3222 y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
3223}
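As the comment before the float overload notes, these deinterleaving lookups combine naturally with a per-lane linear blend. A hypothetical helper (our own name; it relies on v_fma and operator- defined earlier in this header):

// x[i] = tab[ofs[i]], y[i] = tab[ofs[i] + 1]; blend with fractional weight w[i].
inline v_float32x4 v_lerp_pairs(const float* tab, const v_int32x4& ofs, const v_float32x4& w)
{
    v_float32x4 x, y;
    v_lut_deinterleave(tab, ofs, x, y);
    return v_fma(y - x, w, x);        // x + (y - x) * w in every lane
}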
3224
3225 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
3226{
3227 #if CV_SSSE3
3228 return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200)));
3229 #else
3230 __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3231 a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0));
3232 a = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
3233 return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
3234 #endif
3235}
3236 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
3237 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
3238{
3239 #if CV_SSSE3
3240 return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400)));
3241 #else
3242 __m128i a = _mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3243 return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
3244 #endif
3245}
3246 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
3247
3248 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
3249{
3250 #if CV_SSSE3
3251 return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100)));
3252 #else
3253 __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3254 return v_int16x8(_mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0)));
3255 #endif
3256}
3257 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
3258 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
3259{
3260 #if CV_SSSE3
3261 return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100)));
3262 #else
3263 return v_int16x8(_mm_unpacklo_epi16(vec.val, _mm_unpackhi_epi64(vec.val, vec.val)));
3264 #endif
3265}
3266 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
3267
3268 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
3269{
3270 return v_int32x4(_mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
3271}
3272 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
3273 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
3274
3275 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
3276{
3277 #if CV_SSSE3
3278 return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100)));
3279 #else
3280 __m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF);
3281 __m128i a = _mm_srli_si128(_mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))), 1);
3282 return v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2));
3283 #endif
3284}
3285 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
3286
3287 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
3288{
3289 #if CV_SSSE3
3290 return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100)));
3291 #else
3292 return v_int16x8(_mm_srli_si128(_mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(2, 1, 0, 3)), 2));
3293 #endif
3294}
3295 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
3296
3297 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
3298 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
3299 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
3300
3301 template<int i>
3302 inline uchar v_extract_n(const v_uint8x16& v)
3303{
3304 #if CV_SSE4_1
3305 return (uchar)_mm_extract_epi8(v.val, i);
3306 #else
3307 return v_rotate_right<i>(v).get0();
3308 #endif
3309}
3310
3311 template<int i>
3312 inline schar v_extract_n(const v_int8x16& v)
3313{
3314 return (schar)v_extract_n<i>(v_reinterpret_as_u8(v));
3315}
3316
3317 template<int i>
3318 inline ushort v_extract_n(const v_uint16x8& v)
3319{
3320 return (ushort)_mm_extract_epi16(v.val, i);
3321}
3322
3323 template<int i>
3324 inline short v_extract_n(const v_int16x8& v)
3325{
3326 return (short)v_extract_n<i>(v_reinterpret_as_u16(v));
3327}
3328
3329 template<int i>
3330 inline uint v_extract_n(const v_uint32x4& v)
3331{
3332 #if CV_SSE4_1
3333 return (uint)_mm_extract_epi32(v.val, i);
3334 #else
3335 return v_rotate_right<i>(v).get0();
3336 #endif
3337}
3338
3339 template<int i>
3340 inline int v_extract_n(const v_int32x4& v)
3341{
3342 return (int)v_extract_n<i>(v_reinterpret_as_u32(v));
3343}
3344
3345 template<int i>
3346 inline uint64 v_extract_n(const v_uint64x2& v)
3347{
3348 #ifdef CV__SIMD_NATIVE_mm_extract_epi64
3349 return (uint64)_v128_extract_epi64<i>(v.val);
3350 #else
3351 return v_rotate_right<i>(v).get0();
3352 #endif
3353}
3354
3355 template<int i>
3356 inline int64 v_extract_n(const v_int64x2& v)
3357{
3358 return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
3359}
3360
3361 template<int i>
3362 inline float v_extract_n(const v_float32x4& v)
3363{
3364 union { uint iv; float fv; } d;
3365 d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
3366 return d.fv;
3367}
3368
3369 template<int i>
3370 inline double v_extract_n(const v_float64x2& v)
3371{
3372 union { uint64 iv; double dv; } d;
3373 d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
3374 return d.dv;
3375}
3376
3377 template<int i>
3378 inline v_int32x4 v_broadcast_element(const v_int32x4& v)
3379{
3380 return v_int32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
3381}
3382
3383 template<int i>
 3384 inline v_uint32x4 v_broadcast_element(const v_uint32x4& v)
 3385{
3386 return v_uint32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
3387}
3388
3389 template<int i>
 3390 inline v_float32x4 v_broadcast_element(const v_float32x4& v)
 3391{
3392 return v_float32x4(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE((char)i,(char)i,(char)i,(char)i)));
3393}
3394
3396
3397 inline v_float32x4 v_load_expand(const float16_t* ptr)
3398{
3399 #if CV_FP16
3400 return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
3401 #else
3402 const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000);
3403 const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
3404 const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000));
3405 __m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr)); // h << 16
3406 __m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask);
3407 __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta); // ((h & 0x7fff) << 13) + delta
3408 __m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf));
3409
3410 t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e)));
3411 __m128i zmask = _mm_cmpeq_epi32(e, z);
3412 __m128i ft = v_select_si128(zmask, zt, t);
3413 return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign)));
3414 #endif
3415}
3416
3417 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
3418{
3419 #if CV_FP16
3420 __m128i fp16_value = _mm_cvtps_ph(v.val, 0);
3421 _mm_storel_epi64((__m128i*)ptr, fp16_value);
3422 #else
3423 const __m128i signmask = _mm_set1_epi32(0x80000000);
3424 const __m128i rval = _mm_set1_epi32(0x3f000000);
3425
3426 __m128i t = _mm_castps_si128(v.val);
3427 __m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16);
3428 t = _mm_andnot_si128(signmask, t);
3429
3430 __m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t);
3431 __m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000));
3432 __m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00));
3433 __m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t);
3434 __m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval)));
3435 tt = _mm_sub_epi32(tt, rval);
3436 __m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1));
3437 __m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff));
3438 nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13);
3439 t = v_select_si128(tinymask, tt, nt);
3440 t = v_select_si128(finitemask, t, naninf);
3441 t = _mm_or_si128(t, sign);
3442 t = _mm_packs_epi32(t, t);
3443 _mm_storel_epi64((__m128i*)ptr, t);
3444 #endif
3445}
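A minimal round-trip sketch for the FP16 helpers above (hypothetical helper name; assumes n is a multiple of 4 and that float16_t is cv::float16_t):

inline void fp16_roundtrip(const float16_t* src, float16_t* dst, int n)
{
    for (int i = 0; i < n; i += 4)
    {
        v_float32x4 v = v_load_expand(src + i);   // widen 4 half-precision values
        v_pack_store(dst + i, v);                 // narrow back with rounding
    }
}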
3446
3447 inline void v_cleanup() {}
3448
3449CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3450
3452
3453}
3454
3455 #endif