OpenCV 4.5.3 (Japanese machine translation)
intrin_avx.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
4
5 #ifndef OPENCV_HAL_INTRIN_AVX_HPP
6 #define OPENCV_HAL_INTRIN_AVX_HPP
7
8 #define CV_SIMD256 1
9 #define CV_SIMD256_64F 1
10 #define CV_SIMD256_FP16 0 // no native operations with FP16 type. Only load/store from float32x8 are available (if CV_FP16 == 1)
11
12 namespace cv
13{
14
16
17CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
18
20
21 inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
22{ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); }
23
24 inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
25{ return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); }
26
27 inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
28{ return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); }
29
30 inline int _v_cvtsi256_si32(const __m256i& a)
31{ return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); }
32
33 inline __m256i _v256_shuffle_odd_64(const __m256i& v)
34{ return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); }
35
36 inline __m256d _v256_shuffle_odd_64(const __m256d& v)
37{ return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); }
38
39 template<int imm>
40 inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
41{ return _mm256_permute2x128_si256(a, b, imm); }
42
43 template<int imm>
44 inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
45{ return _mm256_permute2f128_ps(a, b, imm); }
46
47 template<int imm>
48 inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
49{ return _mm256_permute2f128_pd(a, b, imm); }
50
51 template<int imm, typename _Tpvec>
52 inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
53{ return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
54
55 template<int imm>
56 inline __m256i _v256_permute4x64(const __m256i& a)
57{ return _mm256_permute4x64_epi64(a, imm); }
58
59 template<int imm>
60 inline __m256d _v256_permute4x64(const __m256d& a)
61{ return _mm256_permute4x64_pd(a, imm); }
62
63 template<int imm, typename _Tpvec>
64 inline _Tpvec v256_permute4x64(const _Tpvec& a)
65{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
66
67 inline __m128i _v256_extract_high(const __m256i& v)
68{ return _mm256_extracti128_si256(v, 1); }
69
70 inline __m128 _v256_extract_high(const __m256& v)
71{ return _mm256_extractf128_ps(v, 1); }
72
73 inline __m128d _v256_extract_high(const __m256d& v)
74{ return _mm256_extractf128_pd(v, 1); }
75
76 inline __m128i _v256_extract_low(const __m256i& v)
77{ return _mm256_castsi256_si128(v); }
78
79 inline __m128 _v256_extract_low(const __m256& v)
80{ return _mm256_castps256_ps128(v); }
81
82 inline __m128d _v256_extract_low(const __m256d& v)
83{ return _mm256_castpd256_pd128(v); }
84
85 inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
86{
87 const __m256i m = _mm256_set1_epi32(65535);
88 __m256i am = _mm256_min_epu32(a, m);
89 __m256i bm = _mm256_min_epu32(b, m);
90 return _mm256_packus_epi32(am, bm);
91}
92
93 template<int i>
94 inline int _v256_extract_epi8(const __m256i& a)
95{
96 #if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910 /*MSVS 2017*/ ))
97 return _mm256_extract_epi8(a, i);
98 #else
99 __m128i b = _mm256_extractf128_si256(a, ((i) >> 4));
100 return _mm_extract_epi8(b, i & 15); // SSE4.1
101 #endif
102}
103
104 template<int i>
105 inline int _v256_extract_epi16(const __m256i& a)
106{
107 #if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910 /*MSVS 2017*/ ))
108 return _mm256_extract_epi16(a, i);
109 #else
110 __m128i b = _mm256_extractf128_si256(a, ((i) >> 3));
111 return _mm_extract_epi16(b, i & 7); // SSE2
112 #endif
113}
114
115 template<int i>
116 inline int _v256_extract_epi32(const __m256i& a)
117{
118 #if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910 /*MSVS 2017*/ ))
119 return _mm256_extract_epi32(a, i);
120 #else
121 __m128i b = _mm256_extractf128_si256(a, ((i) >> 2));
122 return _mm_extract_epi32(b, i & 3); // SSE4.1
123 #endif
124}
125
126 template<int i>
127 inline int64 _v256_extract_epi64(const __m256i& a)
128{
129 #if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910 /*MSVS 2017*/ ))
130 return _mm256_extract_epi64(a, i);
131 #else
132 __m128i b = _mm256_extractf128_si256(a, ((i) >> 1));
133 return _mm_extract_epi64(b, i & 1); // SSE4.1
134 #endif
135}
136
138
139 struct v_uint8x32
140{
141 typedef uchar lane_type;
142 enum { nlanes = 32 };
143 __m256i val;
144
145 explicit v_uint8x32(__m256i v) : val(v) {}
146 v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3,
147 uchar v4, uchar v5, uchar v6, uchar v7,
148 uchar v8, uchar v9, uchar v10, uchar v11,
149 uchar v12, uchar v13, uchar v14, uchar v15,
150 uchar v16, uchar v17, uchar v18, uchar v19,
151 uchar v20, uchar v21, uchar v22, uchar v23,
152 uchar v24, uchar v25, uchar v26, uchar v27,
153 uchar v28, uchar v29, uchar v30, uchar v31)
154 {
155 val = _mm256_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
156 (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9,
157 (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
158 (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
159 (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
160 (char)v28, (char)v29, (char)v30, (char)v31);
161 }
162 /* coverity[uninit_ctor]: suppress warning */
163 v_uint8x32() {}
164
165 uchar get0() const { return (uchar)_v_cvtsi256_si32(val); }
166};
167
168 struct v_int8x32
169{
170 typedef schar lane_type;
171 enum { nlanes = 32 };
172 __m256i val;
173
174 explicit v_int8x32(__m256i v) : val(v) {}
175 v_int8x32(schar v0, schar v1, schar v2, schar v3,
176 schar v4, schar v5, schar v6, schar v7,
177 schar v8, schar v9, schar v10, schar v11,
178 schar v12, schar v13, schar v14, schar v15,
179 schar v16, schar v17, schar v18, schar v19,
180 schar v20, schar v21, schar v22, schar v23,
181 schar v24, schar v25, schar v26, schar v27,
182 schar v28, schar v29, schar v30, schar v31)
183 {
184 val = _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
185 v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
186 v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
187 }
188 /* coverity[uninit_ctor]: suppress warning */
189 v_int8x32() {}
190
191 schar get0() const { return (schar)_v_cvtsi256_si32(val); }
192};
193
194 struct v_uint16x16
195{
196 typedef ushort lane_type;
197 enum { nlanes = 16 };
198 __m256i val;
199
200 explicit v_uint16x16(__m256i v) : val(v) {}
201 v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3,
202 ushort v4, ushort v5, ushort v6, ushort v7,
203 ushort v8, ushort v9, ushort v10, ushort v11,
204 ushort v12, ushort v13, ushort v14, ushort v15)
205 {
206 val = _mm256_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
207 (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9,
208 (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
209 }
210 /* coverity[uninit_ctor]: suppress warning */
211 v_uint16x16() {}
212
213 ushort get0() const { return (ushort)_v_cvtsi256_si32(val); }
214};
215
216 struct v_int16x16
217{
218 typedef short lane_type;
219 enum { nlanes = 16 };
220 __m256i val;
221
222 explicit v_int16x16(__m256i v) : val(v) {}
223 v_int16x16(short v0, short v1, short v2, short v3,
224 short v4, short v5, short v6, short v7,
225 short v8, short v9, short v10, short v11,
226 short v12, short v13, short v14, short v15)
227 {
228 val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7,
229 v8, v9, v10, v11, v12, v13, v14, v15);
230 }
231 /* coverity[uninit_ctor]: suppress warning */
232 v_int16x16() {}
233
234 short get0() const { return (short)_v_cvtsi256_si32(val); }
235};
236
237 struct v_uint32x8
238{
239 typedef unsigned lane_type;
240 enum { nlanes = 8 };
241 __m256i val;
242
243 explicit v_uint32x8(__m256i v) : val(v) {}
244 v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
245 unsigned v4, unsigned v5, unsigned v6, unsigned v7)
246 {
247 val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2,
248 (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
249 }
250 /* coverity[uninit_ctor]: suppress warning */
251 v_uint32x8() {}
252
253 unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); }
254};
255
256 struct v_int32x8
257{
258 typedef int lane_type;
259 enum { nlanes = 8 };
260 __m256i val;
261
262 explicit v_int32x8(__m256i v) : val(v) {}
263 v_int32x8(int v0, int v1, int v2, int v3,
264 int v4, int v5, int v6, int v7)
265 {
266 val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
267 }
268 /* coverity[uninit_ctor]: suppress warning */
269 v_int32x8() {}
270
271 int get0() const { return _v_cvtsi256_si32(val); }
272};
273
274 struct v_float32x8
275{
276 typedef float lane_type;
277 enum { nlanes = 8 };
278 __m256 val;
279
280 explicit v_float32x8(__m256 v) : val(v) {}
281 v_float32x8(float v0, float v1, float v2, float v3,
282 float v4, float v5, float v6, float v7)
283 {
284 val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
285 }
286 /* coverity[uninit_ctor]: suppress warning */
287 v_float32x8() {}
288
289 float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); }
290};
291
292 struct v_uint64x4
293{
294 typedef uint64 lane_type;
295 enum { nlanes = 4 };
296 __m256i val;
297
298 explicit v_uint64x4(__m256i v) : val(v) {}
299 v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
300 { val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
301 /* coverity[uninit_ctor]: suppress warning */
302 v_uint64x4() {}
303
304 uint64 get0() const
305 {
306 #if defined __x86_64__ || defined _M_X64
307 return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
308 #else
309 int a = _mm_cvtsi128_si32(_mm256_castsi256_si128(val));
310 int b = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
311 return (unsigned)a | ((uint64)(unsigned)b << 32);
312 #endif
313 }
314};
315
316 struct v_int64x4
317{
318 typedef int64 lane_type;
319 enum { nlanes = 4 };
320 __m256i val;
321
322 explicit v_int64x4(__m256i v) : val(v) {}
323 v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
324 { val = _mm256_setr_epi64x(v0, v1, v2, v3); }
325 /* coverity[uninit_ctor]: suppress warning */
326 v_int64x4() {}
327
328 int64 get0() const
329 {
330 #if defined __x86_64__ || defined _M_X64
331 return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
332 #else
333 int a = _mm_cvtsi128_si32(_mm256_castsi256_si128(val));
334 int b = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
335 return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
336 #endif
337 }
338};
339
340 struct v_float64x4
341{
342 typedef double lane_type;
343 enum { nlanes = 4 };
344 __m256d val;
345
346 explicit v_float64x4(__m256d v) : val(v) {}
347 v_float64x4(double v0, double v1, double v2, double v3)
348 { val = _mm256_setr_pd(v0, v1, v2, v3); }
349 /* coverity[uninit_ctor]: suppress warning */
350 v_float64x4() {}
351
352 double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); }
353};
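// Usage sketch (illustrative, not part of the original header): each wrapper
// struct above is a thin value type around one AVX register. Constructor
// arguments follow memory order (v0 is the lowest lane), and get0() reads
// that lowest lane back.
inline float example_first_lane()
{
    v_float32x8 v(1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f);
    return v.get0();   // 1.f: lane 0 is the first constructor argument
}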
354
356
357 #define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp) \
358 inline _Tpvec v256_load(const _Tp* ptr) \
359 { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); } \
360 inline _Tpvec v256_load_aligned(const _Tp* ptr) \
361 { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); } \
362 inline _Tpvec v256_load_low(const _Tp* ptr) \
363 { \
364 __m128i v128 = _mm_loadu_si128((const __m128i*)ptr); \
365 return _Tpvec(_mm256_castsi128_si256(v128)); \
366 } \
367 inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
368 { \
369 __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0); \
370 __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1); \
371 return _Tpvec(_v256_combine(vlo, vhi)); \
372 } \
373 inline void v_store(_Tp* ptr, const _Tpvec& a) \
374 { _mm256_storeu_si256((__m256i*)ptr, a.val); } \
375 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
376 { _mm256_store_si256((__m256i*)ptr, a.val); } \
377 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
378 { _mm256_stream_si256((__m256i*)ptr, a.val); } \
379 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
380 { \
381 if( mode == hal::STORE_UNALIGNED ) \
382 _mm256_storeu_si256((__m256i*)ptr, a.val); \
383 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
384 _mm256_stream_si256((__m256i*)ptr, a.val); \
385 else \
386 _mm256_store_si256((__m256i*)ptr, a.val); \
387 } \
388 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
389 { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \
390 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
391 { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); }
392
393OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32, uchar)
394OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32, schar)
395OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort)
396OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16, short)
397OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8, unsigned)
398OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8, int)
399OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4, uint64)
400OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4, int64)
401
402 #define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
403 inline _Tpvec v256_load(const _Tp* ptr) \
404 { return _Tpvec(_mm256_loadu_##suffix(ptr)); } \
405 inline _Tpvec v256_load_aligned(const _Tp* ptr) \
406 { return _Tpvec(_mm256_load_##suffix(ptr)); } \
407 inline _Tpvec v256_load_low(const _Tp* ptr) \
408 { \
409 return _Tpvec(_mm256_cast##suffix##128_##suffix##256 \
410 (_mm_loadu_##suffix(ptr))); \
411 } \
412 inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
413 { \
414 halfreg vlo = _mm_loadu_##suffix(ptr0); \
415 halfreg vhi = _mm_loadu_##suffix(ptr1); \
416 return _Tpvec(_v256_combine(vlo, vhi)); \
417 } \
418 inline void v_store(_Tp* ptr, const _Tpvec& a) \
419 { _mm256_storeu_##suffix(ptr, a.val); } \
420 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
421 { _mm256_store_##suffix(ptr, a.val); } \
422 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
423 { _mm256_stream_##suffix(ptr, a.val); } \
424 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
425 { \
426 if( mode == hal::STORE_UNALIGNED ) \
427 _mm256_storeu_##suffix(ptr, a.val); \
428 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
429 _mm256_stream_##suffix(ptr, a.val); \
430 else \
431 _mm256_store_##suffix(ptr, a.val); \
432 } \
433 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
434 { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \
435 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
436 { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); }
437
438OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float, ps, __m128)
439OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
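// Usage sketch (illustrative, not part of the original header): the macros
// above expand to v256_load / v256_load_aligned / v_store / v_store_aligned
// overloads for every lane type; the unaligned variants are always safe,
// while the *_aligned ones assume 32-byte aligned pointers.
inline void example_copy8f(const float* src, float* dst)
{
    v_float32x8 v = v256_load(src);   // unaligned 256-bit load of 8 floats
    v_store(dst, v);                  // unaligned 256-bit store
}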
440
441 #define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
442 inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
443 { return _Tpvec(cast(a.val)); }
444
445 #define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
446 inline _Tpvec v256_setzero_##suffix() \
447 { return _Tpvec(_mm256_setzero_si256()); } \
448 inline _Tpvec v256_setall_##suffix(_Tp v) \
449 { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \
450 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
451 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
452 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
453 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \
454 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \
455 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \
456 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \
457 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \
458 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256) \
459 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256)
460
461OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32, uchar, u8, epi8, char)
462OPENCV_HAL_IMPL_AVX_INIT(v_int8x32, schar, s8, epi8, char)
463OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort, u16, epi16, short)
464OPENCV_HAL_IMPL_AVX_INIT(v_int16x16, short, s16, epi16, short)
465OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8, unsigned, u32, epi32, int)
466OPENCV_HAL_IMPL_AVX_INIT(v_int32x8, int, s32, epi32, int)
467OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4, uint64, u64, epi64x, int64)
468OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64)
469
470 #define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
471 inline _Tpvec v256_setzero_##suffix() \
472 { return _Tpvec(_mm256_setzero_##zsuffix()); } \
473 inline _Tpvec v256_setall_##suffix(_Tp v) \
474 { return _Tpvec(_mm256_set1_##zsuffix(v)); } \
475 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
476 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \
477 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
478 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, cast) \
479 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, cast) \
480 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, cast) \
481 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, cast) \
482 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, cast)
483
484OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float, f32, ps, _mm256_castsi256_ps)
485OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd)
486
487 inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
488{ return a; }
489 inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
490{ return v_float32x8(_mm256_castpd_ps(a.val)); }
491
492 inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
493{ return a; }
494 inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
495{ return v_float64x4(_mm256_castps_pd(a.val)); }
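// Usage sketch (illustrative, not part of the original header): v256_setall_*
// broadcasts one scalar to every lane, and v_reinterpret_as_* performs a pure
// bit-pattern cast without changing the underlying register.
inline v_int32x8 example_float_bits()
{
    v_float32x8 ones = v256_setall_f32(1.0f);
    return v_reinterpret_as_s32(ones);   // every lane holds 0x3f800000
}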
496
497 /* Recombine */
498 /*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm) \
499 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
500 { return _Tpvec(perm(a.val, b.val, 0x20)); } \
501 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
502 { return _Tpvec(perm(a.val, b.val, 0x31)); } \
503 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
504 _Tpvec& c, _Tpvec& d) \
505 { c = v_combine_low(a, b); d = v_combine_high(a, b); }
506
507 #define OPENCV_HAL_IMPL_AVX_UNPACKS(_Tpvec, suffix) \
508 OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, _mm256_permute2x128_si256) \
509 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, \
510 _Tpvec& b0, _Tpvec& b1) \
511 { \
512 __m256i v0 = _v256_shuffle_odd_64(a0.val); \
513 __m256i v1 = _v256_shuffle_odd_64(a1.val); \
514 b0.val = _mm256_unpacklo_##suffix(v0, v1); \
515 b1.val = _mm256_unpackhi_##suffix(v0, v1); \
516 }
517
518 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint8x32, epi8)
519 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int8x32, epi8)
520 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint16x16, epi16)
521 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int16x16, epi16)
522 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint32x8, epi32)
523 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int32x8, epi32)
524 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint64x4, epi64)
525 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int64x4, epi64)
526 OPENCV_HAL_IMPL_AVX_COMBINE(v_float32x8, _mm256_permute2f128_ps)
527 OPENCV_HAL_IMPL_AVX_COMBINE(v_float64x4, _mm256_permute2f128_pd)
528
529 inline void v_zip(const v_float32x8& a0, const v_float32x8& a1, v_float32x8& b0, v_float32x8& b1)
530 {
531 __m256 v0 = _mm256_unpacklo_ps(a0.val, a1.val);
532 __m256 v1 = _mm256_unpackhi_ps(a0.val, a1.val);
533 v_recombine(v_float32x8(v0), v_float32x8(v1), b0, b1);
534 }
535
536 inline void v_zip(const v_float64x4& a0, const v_float64x4& a1, v_float64x4& b0, v_float64x4& b1)
537 {
538 __m256d v0 = _v_shuffle_odd_64(a0.val);
539 __m256d v1 = _v_shuffle_odd_64(a1.val);
540 b0.val = _mm256_unpacklo_pd(v0, v1);
541 b1.val = _mm256_unpackhi_pd(v0, v1);
542 }*/
543
545
546 // unpacks
547 #define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix) \
548 inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \
549 { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); } \
550 inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \
551 { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); }
552
553OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32, epi8)
554OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32, epi8)
555OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16)
556OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16, epi16)
557OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8, epi32)
558OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8, epi32)
559OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4, epi64)
560OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4, epi64)
561OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps)
562OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd)
563
564 // blend
565 #define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix) \
566 template<int m> \
567 inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b) \
568 { return _Tpvec(_mm256_blend_##suffix(a.val, b.val, m)); }
569
570OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16)
571OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16, epi16)
572OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8, epi32)
573OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8, epi32)
574OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps)
575OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd)
576
577 template<int m>
578 inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b)
579{
580 enum {M0 = m};
581 enum {M1 = (M0 | (M0 << 2)) & 0x33};
582 enum {M2 = (M1 | (M1 << 1)) & 0x55};
583 enum {MM = M2 | (M2 << 1)};
584 return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM));
585}
586 template<int m>
587 inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b)
588{ return v_int64x4(v256_blend<m>(v_uint64x4(a.val), v_uint64x4(b.val)).val); }
589
590 // shuffle
591 // todo: emulate 64bit
592 #define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin) \
593 template<int m> \
594 inline _Tpvec v256_shuffle(const _Tpvec& a) \
595 { return _Tpvec(_mm256_##intrin(a.val, m)); }
596
597OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8, shuffle_epi32)
598OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8, shuffle_epi32)
599OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps)
600OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd)
601
602 template<typename _Tpvec>
603 inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
604{
605 ab0 = v256_unpacklo(a, b);
606 ab1 = v256_unpackhi(a, b);
607}
608
609 template<typename _Tpvec>
610 inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
611{ return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0xf0)); }
612
613 inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
614{ return v256_blend<0xf0>(a, b); }
615
616 inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
617{ return v256_blend<0xc>(a, b); }
618
619 template<typename _Tpvec>
620 inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
621{ return v256_permute2x128<0x21>(a, b); }
622
623 template<typename _Tpvec>
624 inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
625{ return _Tpvec(_mm256_alignr_epi8(a.val, b.val, 8)); }
626 inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
627{ return v_float64x4(_mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1))); }
628 // todo: emulate float32
629
630 template<typename _Tpvec>
631 inline _Tpvec v256_swap_halves(const _Tpvec& a)
632{ return v256_permute2x128<1>(a, a); }
633
634 template<typename _Tpvec>
635 inline _Tpvec v256_reverse_64(const _Tpvec& a)
636{ return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); }
637
638 // ZIP
639 #define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec) \
640 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
641 { return v256_permute2x128<0x20>(a, b); } \
642 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
643 { return v256_permute2x128<0x31>(a, b); } \
644 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
645 _Tpvec& c, _Tpvec& d) \
646 { \
647 _Tpvec a1b0 = v256_alignr_128(a, b); \
648 c = v256_combine_diagonal(a, a1b0); \
649 d = v256_combine_diagonal(a1b0, b); \
650 } \
651 inline void v_zip(const _Tpvec& a, const _Tpvec& b, \
652 _Tpvec& ab0, _Tpvec& ab1) \
653 { \
654 _Tpvec ab0ab2, ab1ab3; \
655 v256_zip(a, b, ab0ab2, ab1ab3); \
656 v_recombine(ab0ab2, ab1ab3, ab0, ab1); \
657 }
658
659OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32)
660OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32)
661OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16)
662OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16)
663OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8)
664OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8)
665OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4)
666OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4)
667OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8)
668OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
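// Usage sketch (illustrative, not part of the original header): v_zip
// interleaves two vectors lane by lane, so for a = {a0..a7}, b = {b0..b7}
// it yields ab0 = {a0,b0,a1,b1,a2,b2,a3,b3} and ab1 = {a4,b4,...,a7,b7};
// v_combine_low/v_combine_high instead concatenate the 128-bit halves.
inline void example_interleave(const v_float32x8& a, const v_float32x8& b,
                               v_float32x8& ab0, v_float32x8& ab1)
{
    v_zip(a, b, ab0, ab1);
}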
669
670
671
672 /* Element-wise binary and unary operations */
673
674
675 #define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \
676 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
677 { return _Tpvec(intrin(a.val, b.val)); } \
678 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
679 { a.val = intrin(a.val, b.val); return a; }
680
681OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8)
682OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8)
683OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
684OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
685OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
686OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
687OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
688OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
689OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
690OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
691OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)
692OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32)
693OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32)
694OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32)
695OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64)
696OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64)
697OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64)
698OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64)
699
700OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
701OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
702OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
703OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
704OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
705OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
706OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
707OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
708
709 // saturating multiply 8-bit, 16-bit
710 inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
711{
712 v_uint16x16 c, d;
713 v_mul_expand(a, b, c, d);
714 return v_pack(c, d);
715}
716 inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
717{
718 v_int16x16 c, d;
719 v_mul_expand(a, b, c, d);
720 return v_pack(c, d);
721}
722 inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
723{
724 __m256i pl = _mm256_mullo_epi16(a.val, b.val);
725 __m256i ph = _mm256_mulhi_epu16(a.val, b.val);
726 __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
727 __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
728 return v_uint16x16(_v256_packs_epu32(p0, p1));
729}
730 inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
731{
732 __m256i pl = _mm256_mullo_epi16(a.val, b.val);
733 __m256i ph = _mm256_mulhi_epi16(a.val, b.val);
734 __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
735 __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
736 return v_int16x16(_mm256_packs_epi32(p0, p1));
737}
738 inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
739{ a = a * b; return a; }
740 inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
741{ a = a * b; return a; }
742 inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
743{ a = a * b; return a; }
744 inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
745{ a = a * b; return a; }
746
748 #define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
749 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
750 { return _Tpvec(intrin(a.val, b.val)); }
751
752OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
753OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
754OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
755OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
756OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
757OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
758OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
759OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)
760OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_uint16x16, _mm256_mullo_epi16)
761OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_int16x16, _mm256_mullo_epi16)
762
763 inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
764{
765 __m256i ad = _mm256_srai_epi16(a.val, 8);
766 __m256i bd = _mm256_srai_epi16(b.val, 8);
767 __m256i p0 = _mm256_mullo_epi16(a.val, b.val); // even
768 __m256i p1 = _mm256_slli_epi16(_mm256_mullo_epi16(ad, bd), 8); // odd
769
770 const __m256i b01 = _mm256_set1_epi32(0xFF00FF00);
771 return v_uint8x32(_mm256_blendv_epi8(p0, p1, b01));
772}
773 inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
774{
775 return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
776}
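// Usage sketch (illustrative, not part of the original header): for 8- and
// 16-bit vectors the overloaded operator+/- saturate (255 + 1 stays 255 for
// v_uint8x32), while the *_wrap functions use ordinary modulo-2^N arithmetic.
inline void example_saturate_vs_wrap(const v_uint8x32& a, const v_uint8x32& b,
                                     v_uint8x32& sat, v_uint8x32& wrap)
{
    sat  = a + b;            // _mm256_adds_epu8: clamps to [0, 255]
    wrap = v_add_wrap(a, b); // _mm256_add_epi8: wraps modulo 256
}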
777
778 // Multiply and expand
779 inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
780 v_uint16x16& c, v_uint16x16& d)
781{
782 v_uint16x16 a0, a1, b0, b1;
783 v_expand(a, a0, a1);
784 v_expand(b, b0, b1);
785 c = v_mul_wrap(a0, b0);
786 d = v_mul_wrap(a1, b1);
787}
788
789 inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
790 v_int16x16& c, v_int16x16& d)
791{
792 v_int16x16 a0, a1, b0, b1;
793 v_expand(a, a0, a1);
794 v_expand(b, b0, b1);
795 c = v_mul_wrap(a0, b0);
796 d = v_mul_wrap(a1, b1);
797}
798
799 inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
800 v_int32x8& c, v_int32x8& d)
801{
802 v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));
803
804 v_int16x16 v0, v1;
805 v_zip(v_mul_wrap(a, b), vhi, v0, v1);
806
807 c = v_reinterpret_as_s32(v0);
808 d = v_reinterpret_as_s32(v1);
809}
810
811 inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
812 v_uint32x8& c, v_uint32x8& d)
813{
814 v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));
815
816 v_uint16x16 v0, v1;
817 v_zip(v_mul_wrap(a, b), vhi, v0, v1);
818
819 c = v_reinterpret_as_u32(v0);
820 d = v_reinterpret_as_u32(v1);
821}
822
823 inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
824 v_uint64x4& c, v_uint64x4& d)
825{
826 __m256i v0 = _mm256_mul_epu32(a.val, b.val);
827 __m256i v1 = _mm256_mul_epu32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
828 v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
829}
830
831 inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); }
832 inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); }
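// Usage sketch (illustrative, not part of the original header): v_mul_expand
// multiplies without overflow by widening the lane type; the products of the
// low and high input halves are returned in the two output vectors.
inline void example_widening_mul(const v_uint16x16& a, const v_uint16x16& b,
                                 v_uint32x8& lo, v_uint32x8& hi)
{
    v_mul_expand(a, b, lo, hi);   // 16 full 32-bit products split across lo/hi
}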
833
835 #define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
836 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
837 { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
838 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
839 { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
840 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
841 { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
842 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
843 { return _Tpsvec(srai(a.val, imm)); } \
844 template<int imm> \
845 inline _Tpuvec v_shl(const _Tpuvec& a) \
846 { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
847 template<int imm> \
848 inline _Tpsvec v_shl(const _Tpsvec& a) \
849 { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
850 template<int imm> \
851 inline _Tpuvec v_shr(const _Tpuvec& a) \
852 { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
853 template<int imm> \
854 inline _Tpsvec v_shr(const _Tpsvec& a) \
855 { return _Tpsvec(srai(a.val, imm)); }
856
857OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16)
858OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8, v_int32x8, epi32, _mm256_srai_epi32)
859
860 inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm)
861{
862 __m256i d = _mm256_set1_epi64x((int64)1 << 63);
863 __m256i r = _mm256_srli_epi64(_mm256_add_epi64(a, d), imm);
864 return _mm256_sub_epi64(r, _mm256_srli_epi64(d, imm));
865}
866OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx)
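// Usage sketch (illustrative, not part of the original header): shifts exist
// both as run-time operators and as compile-time template forms; the
// _mm256_srai_epi64xx helper above emulates a 64-bit arithmetic right shift,
// which plain AVX2 does not provide natively.
inline v_int64x4 example_shift(const v_int64x4& a)
{
    return v_shr<3>(a);   // arithmetic right shift by 3 on each int64 lane
}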
867
868
869
870 #define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
871 OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \
872 OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \
873 OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \
874 inline _Tpvec operator ~ (const _Tpvec& a) \
875 { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }
876
877OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1))
878OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32, si256, _mm256_set1_epi32(-1))
879OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16, si256, _mm256_set1_epi32(-1))
880OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16, si256, _mm256_set1_epi32(-1))
881OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8, si256, _mm256_set1_epi32(-1))
882OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8, si256, _mm256_set1_epi32(-1))
883OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4, si256, _mm256_set1_epi64x(-1))
884OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4, si256, _mm256_set1_epi64x(-1))
885OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8, ps, _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
886OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4, pd, _mm256_castsi256_pd(_mm256_set1_epi32(-1)))
887
889 #define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix) \
890 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
891 { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); }
892
893OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32, epi8)
894OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32, epi8)
895OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8)
896OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16, epi8)
897OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8, epi8)
898OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8, epi8)
899OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
900OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
901
902
903 #define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
904 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
905 { return ~(a == b); } \
906 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
907 { return b > a; } \
908 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
909 { return ~(a < b); } \
910 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
911 { return b >= a; }
912
913 #define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \
914 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
915 { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
916 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
917 { \
918 __m256i smask = _mm256_set1_##suffix(sbit); \
919 return _Tpuvec(_mm256_cmpgt_##suffix( \
920 _mm256_xor_si256(a.val, smask), \
921 _mm256_xor_si256(b.val, smask))); \
922 } \
923 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
924 { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
925 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
926 { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \
927 OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \
928 OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)
929
930OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32, v_int8x32, epi8, (char)-128)
931OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
932OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000)
933
934 #define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \
935 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
936 { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \
937 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
938 { return ~(a == b); }
939
940OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
941OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
942
943 #define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \
944 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
945 { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }
946
947 #define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \
948 OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \
949 OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \
950 OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \
951 OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \
952 OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \
953 OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix)
954
955OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
956OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
957
958 inline v_float32x8 v_not_nan(const v_float32x8& a)
959{ return v_float32x8(_mm256_cmp_ps(a.val, a.val, _CMP_ORD_Q)); }
960 inline v_float64x4 v_not_nan(const v_float64x4& a)
961{ return v_float64x4(_mm256_cmp_pd(a.val, a.val, _CMP_ORD_Q)); }
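// Usage sketch (illustrative, not part of the original header): comparisons
// produce all-ones/all-zero lane masks, which feed v_select directly;
// v_select(mask, a, b) picks a where the mask is set and b elsewhere.
inline v_float32x8 example_clamp_negative(const v_float32x8& x)
{
    v_float32x8 zero = v256_setall_f32(0.f);
    return v_select(x < zero, zero, x);   // per-lane max(x, 0)
}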
962
964OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32, _mm256_min_epu8)
965OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32, _mm256_max_epu8)
966OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32, _mm256_min_epi8)
967OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32, _mm256_max_epi8)
968OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16)
969OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16)
970OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16, _mm256_min_epi16)
971OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16, _mm256_max_epi16)
972OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8, _mm256_min_epu32)
973OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8, _mm256_max_epu32)
974OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8, _mm256_min_epi32)
975OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8, _mm256_max_epi32)
976OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps)
977OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps)
978OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd)
979OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
980
981
982 template<int imm>
983 inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
984{
985 enum {IMM_R = (16 - imm) & 0xFF};
986 enum {IMM_R2 = (32 - imm) & 0xFF};
987
988 if (imm == 0) return a;
989 if (imm == 32) return b;
990 if (imm > 32) return v_uint8x32();
991
992 __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
993 if (imm == 16) return v_uint8x32(swap);
994 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, IMM_R));
995 return v_uint8x32(_mm256_alignr_epi8(swap, b.val, IMM_R2)); // imm < 32
996}
997
998 template<int imm>
999 inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
1000{
1001 enum {IMM_L = (imm - 16) & 0xFF};
1002
1003 if (imm == 0) return a;
1004 if (imm == 32) return b;
1005 if (imm > 32) return v_uint8x32();
1006
1007 __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
1008 if (imm == 16) return v_uint8x32(swap);
1009 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
1010 return v_uint8x32(_mm256_alignr_epi8(b.val, swap, IMM_L));
1011}
1012
1013 template<int imm>
1014 inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
1015{
1016 enum {IMM_L = (imm - 16) & 0xFF};
1017 enum {IMM_R = (16 - imm) & 0xFF};
1018
1019 if (imm == 0) return a;
1020 if (imm > 32) return v_uint8x32();
1021
1022 // ESAC control[3] ? [127:0] = 0
1023 __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
1024 if (imm == 16) return v_uint8x32(swapz);
1025 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swapz, IMM_R));
1026 return v_uint8x32(_mm256_slli_si256(swapz, IMM_L));
1027}
1028
1029 template<int imm>
1030 inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
1031{
1032 enum {IMM_L = (imm - 16) & 0xFF};
1033
1034 if (imm == 0) return a;
1035 if (imm > 32) return v_uint8x32();
1036
1037 // ESAC control[3] ? [127:0] = 0
1038 __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
1039 if (imm == 16) return v_uint8x32(swapz);
1040 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swapz, a.val, imm));
1041 return v_uint8x32(_mm256_srli_si256(swapz, IMM_L));
1042}
1043
1044 #define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) \
1045 template<int imm> \
1046 inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \
1047 { \
1048 enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
1049 v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a), \
1050 v_reinterpret_as_u8(b)); \
1051 return _Tpvec(cast(ret.val)); \
1052 } \
1053 template<int imm> \
1054 inline _Tpvec intrin(const _Tpvec& a) \
1055 { \
1056 enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
1057 v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a)); \
1058 return _Tpvec(cast(ret.val)); \
1059 }
1060
1061 #define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec) \
1062 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \
1063 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)
1064
1065OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32)
1066OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16)
1067OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16)
1068OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8)
1069OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8)
1070OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4)
1071OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4)
1072
1073OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float32x8, _mm256_castsi256_ps)
1074OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps)
1075OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd)
1076OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
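// Usage sketch (illustrative, not part of the original header): v_rotate_*
// move whole lanes across the 256-bit register, zero-filling in the
// single-operand form; rotating right by one drops lane 0 and shifts every
// remaining lane one position down.
inline v_int32x8 example_shift_lanes_down(const v_int32x8& a)
{
    return v_rotate_right<1>(a);   // {a1, a2, ..., a7, 0}
}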
1077
1078
1079 inline v_uint8x32 v_reverse(const v_uint8x32 &a)
1080{
1081 static const __m256i perm = _mm256_setr_epi8(
1082 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
1083 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1084 __m256i vec = _mm256_shuffle_epi8(a.val, perm);
1085 return v_uint8x32(_mm256_permute2x128_si256(vec, vec, 1));
1086}
1087
1088 inline v_int8x32 v_reverse(const v_int8x32 &a)
1089{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1090
1091 inline v_uint16x16 v_reverse(const v_uint16x16 &a)
1092{
1093 static const __m256i perm = _mm256_setr_epi8(
1094 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
1095 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
1096 __m256i vec = _mm256_shuffle_epi8(a.val, perm);
1097 return v_uint16x16(_mm256_permute2x128_si256(vec, vec, 1));
1098}
1099
1100 inline v_int16x16 v_reverse(const v_int16x16 &a)
1101{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1102
1103 inline v_uint32x8 v_reverse(const v_uint32x8 &a)
1104{
1105 static const __m256i perm = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1106 return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
1107}
1108
1109 inline v_int32x8 v_reverse(const v_int32x8 &a)
1110{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1111
1112 inline v_float32x8 v_reverse(const v_float32x8 &a)
1113{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1114
1115 inline v_uint64x4 v_reverse(const v_uint64x4 &a)
1116{
1117 return v_uint64x4(_mm256_permute4x64_epi64(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
1118}
1119
1120 inline v_int64x4 v_reverse(const v_int64x4 &a)
1121{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1122
1123 inline v_float64x4 v_reverse(const v_float64x4 &a)
1124{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1125
1127
1129 inline unsigned v_reduce_sum(const v_uint8x32& a)
1130{
1131 __m256i half = _mm256_sad_epu8(a.val, _mm256_setzero_si256());
1132 __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1133 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1134}
1135 inline int v_reduce_sum(const v_int8x32& a)
1136{
1137 __m256i half = _mm256_sad_epu8(_mm256_xor_si256(a.val, _mm256_set1_epi8((schar)-128)), _mm256_setzero_si256());
1138 __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1139 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))) - 4096;
1140}
1141 #define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin) \
1142 inline sctype v_reduce_##func(const _Tpvec& a) \
1143 { \
1144 __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
1145 val = intrin(val, _mm_srli_si128(val,8)); \
1146 val = intrin(val, _mm_srli_si128(val,4)); \
1147 val = intrin(val, _mm_srli_si128(val,2)); \
1148 val = intrin(val, _mm_srli_si128(val,1)); \
1149 return (sctype)_mm_cvtsi128_si32(val); \
1150 }
1151
1152OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, min, _mm_min_epu8)
1153OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, min, _mm_min_epi8)
1154OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, max, _mm_max_epu8)
1155OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, max, _mm_max_epi8)
1156
1157 #define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
1158 inline sctype v_reduce_##func(const _Tpvec& a) \
1159 { \
1160 __m128i v0 = _v256_extract_low(a.val); \
1161 __m128i v1 = _v256_extract_high(a.val); \
1162 v0 = intrin(v0, v1); \
1163 v0 = intrin(v0, _mm_srli_si128(v0, 8)); \
1164 v0 = intrin(v0, _mm_srli_si128(v0, 4)); \
1165 v0 = intrin(v0, _mm_srli_si128(v0, 2)); \
1166 return (sctype) _mm_cvtsi128_si32(v0); \
1167 }
1168
1169OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, min, _mm_min_epu16)
1170OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, min, _mm_min_epi16)
1171OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16)
1172OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, max, _mm_max_epi16)
1173
1174 #define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \
1175 inline sctype v_reduce_##func(const _Tpvec& a) \
1176 { \
1177 __m128i v0 = _v256_extract_low(a.val); \
1178 __m128i v1 = _v256_extract_high(a.val); \
1179 v0 = intrin(v0, v1); \
1180 v0 = intrin(v0, _mm_srli_si128(v0, 8)); \
1181 v0 = intrin(v0, _mm_srli_si128(v0, 4)); \
1182 return (sctype) _mm_cvtsi128_si32(v0); \
1183 }
1184
1185OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32)
1186OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, min, _mm_min_epi32)
1187OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32)
1188OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32)
1189
1190 #define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin) \
1191 inline float v_reduce_##func(const v_float32x8& a) \
1192 { \
1193 __m128 v0 = _v256_extract_low(a.val); \
1194 __m128 v1 = _v256_extract_high(a.val); \
1195 v0 = intrin(v0, v1); \
1196 v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \
1197 v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 1))); \
1198 return _mm_cvtss_f32(v0); \
1199 }
1200
1201OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
1202OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
1203
1204 inline int v_reduce_sum(const v_int32x8& a)
1205{
1206 __m256i s0 = _mm256_hadd_epi32(a.val, a.val);
1207 s0 = _mm256_hadd_epi32(s0, s0);
1208
1209 __m128i s1 = _v256_extract_high(s0);
1210 s1 = _mm_add_epi32(_v256_extract_low(s0), s1);
1211
1212 return _mm_cvtsi128_si32(s1);
1213}
1214
1215 inline unsigned v_reduce_sum(const v_uint32x8& a)
1216{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
1217
1218 inline int v_reduce_sum(const v_int16x16& a)
1219{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1220 inline unsigned v_reduce_sum(const v_uint16x16& a)
1221{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1222
1223 inline float v_reduce_sum(const v_float32x8& a)
1224{
1225 __m256 s0 = _mm256_hadd_ps(a.val, a.val);
1226 s0 = _mm256_hadd_ps(s0, s0);
1227
1228 __m128 s1 = _v256_extract_high(s0);
1229 s1 = _mm_add_ps(_v256_extract_low(s0), s1);
1230
1231 return _mm_cvtss_f32(s1);
1232}
1233
1234 inline uint64 v_reduce_sum(const v_uint64x4& a)
1235{
1236 uint64 CV_DECL_ALIGNED(32) idx[2];
1237 _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
1238 return idx[0] + idx[1];
1239}
1240 inline int64 v_reduce_sum(const v_int64x4& a)
1241{
1242 int64 CV_DECL_ALIGNED(32) idx[2];
1243 _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
1244 return idx[0] + idx[1];
1245}
1246 inline double v_reduce_sum(const v_float64x4& a)
1247{
1248 __m256d s0 = _mm256_hadd_pd(a.val, a.val);
1249 return _mm_cvtsd_f64(_mm_add_pd(_v256_extract_low(s0), _v256_extract_high(s0)));
1250}
1251
1252 inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
1253 const v_float32x8& c, const v_float32x8& d)
1254{
1255 __m256 ab = _mm256_hadd_ps(a.val, b.val);
1256 __m256 cd = _mm256_hadd_ps(c.val, d.val);
1257 return v_float32x8(_mm256_hadd_ps(ab, cd));
1258}
1259
1260 inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
1261{
1262 __m256i half = _mm256_sad_epu8(a.val, b.val);
1263 __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1264 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1265}
1266 inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
1267{
1268 __m256i half = _mm256_set1_epi8(0x7f);
1269 half = _mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half));
1270 __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1271 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1272}
1273 inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
1274{
1275 v_uint32x8 l, h;
1276 v_expand(v_add_wrap(a - b, b - a), l, h);
1277 return v_reduce_sum(l + h);
1278}
1279 inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
1280{
1281 v_uint32x8 l, h;
1282 v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
1283 return v_reduce_sum(l + h);
1284}
1285 inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
1286{
1287 return v_reduce_sum(v_max(a, b) - v_min(a, b));
1288}
1289 inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
1290{
1291 v_int32x8 m = a < b;
1292 return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
1293}
1294 inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
1295{
1296 return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
1297}
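// Usage sketch (illustrative, not part of the original header): the reductions
// above collapse a whole vector into one scalar, so a 32-pixel sum of absolute
// differences takes only a pair of loads plus one reduction.
inline unsigned example_sad32(const uchar* p, const uchar* q)
{
    return v_reduce_sad(v256_load(p), v256_load(q));
}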
1298
1300 inline v_uint8x32 v_popcount(const v_uint8x32& a)
1301{
1302 __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1303 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
1304 __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
1305 return v_uint8x32(_mm256_add_epi8(_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256( a.val , _popcnt_mask)),
1306 _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(a.val, 4), _popcnt_mask))));
1307}
1308 inline v_uint16x16 v_popcount(const v_uint16x16& a)
1309{
1310 v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
1311 p += v_rotate_right<1>(p);
1312 return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
1313}
1314 inline v_uint32x8 v_popcount(const v_uint32x8& a)
1315{
1316 v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
1317 p += v_rotate_right<1>(p);
1318 p += v_rotate_right<2>(p);
1319 return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
1320}
1321 inline v_uint64x4 v_popcount(const v_uint64x4& a)
1322{
1323 return v_uint64x4(_mm256_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm256_setzero_si256()));
1324}
1325 inline v_uint8x32 v_popcount(const v_int8x32& a)
1326{ return v_popcount(v_reinterpret_as_u8(a)); }
1327 inline v_uint16x16 v_popcount(const v_int16x16& a)
1328{ return v_popcount(v_reinterpret_as_u16(a)); }
1329 inline v_uint32x8 v_popcount(const v_int32x8& a)
1330{ return v_popcount(v_reinterpret_as_u32(a)); }
1331 inline v_uint64x4 v_popcount(const v_int64x4& a)
1332{ return v_popcount(v_reinterpret_as_u64(a)); }
1333
1335 inline int v_signmask(const v_int8x32& a)
1336{ return _mm256_movemask_epi8(a.val); }
1337 inline int v_signmask(const v_uint8x32& a)
1338{ return v_signmask(v_reinterpret_as_s8(a)); }
1339
1340 inline int v_signmask(const v_int16x16& a)
1341{ return v_signmask(v_pack(a, a)) & 0xFFFF; }
1342 inline int v_signmask(const v_uint16x16& a)
1343{ return v_signmask(v_reinterpret_as_s16(a)); }
1344
1345 inline int v_signmask(const v_float32x8& a)
1346{ return _mm256_movemask_ps(a.val); }
1347 inline int v_signmask(const v_float64x4& a)
1348{ return _mm256_movemask_pd(a.val); }
1349
1350 inline int v_signmask(const v_int32x8& a)
1351{ return v_signmask(v_reinterpret_as_f32(a)); }
1352 inline int v_signmask(const v_uint32x8& a)
1353{ return v_signmask(v_reinterpret_as_f32(a)); }
1354
1355 inline int v_signmask(const v_int64x4& a)
1356{ return v_signmask(v_reinterpret_as_f64(a)); }
1357 inline int v_signmask(const v_uint64x4& a)
1358{ return v_signmask(v_reinterpret_as_f64(a)); }
1359
1360 inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1361 inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1362 inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1363 inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1364 inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1365 inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1366 inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1367 inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1368 inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1369 inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1370
1372 #define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, allmask) \
1373 inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
1374 inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
1375OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, -1)
1376OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, -1)
1377OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, 255)
1378OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, 255)
1379OPENCV_HAL_IMPL_AVX_CHECK(v_uint64x4, 15)
1380OPENCV_HAL_IMPL_AVX_CHECK(v_int64x4, 15)
1381OPENCV_HAL_IMPL_AVX_CHECK(v_float32x8, 255)
1382OPENCV_HAL_IMPL_AVX_CHECK(v_float64x4, 15)
1383
1384 #define OPENCV_HAL_IMPL_AVX_CHECK_SHORT(_Tpvec) \
1385 inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \
1386 inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
1387OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_uint16x16)
1388OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
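// Usage sketch (illustrative, not part of the original header): v_signmask
// packs the lane sign bits into an int, v_check_any tests whether any lane is
// set, and v_scan_forward returns the index of the first set lane, which is
// handy for locating the first match after a comparison.
inline int example_first_above(const v_float32x8& x, float thresh)
{
    v_float32x8 gt = x > v256_setall_f32(thresh);
    return v_check_any(gt) ? v_scan_forward(gt) : -1;   // lane index of first hit
}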
1389
1390
1391
1392
1393 #define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \
1394 inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1395 { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
1396 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1397 { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
1398 inline _Tpvec v_sqrt(const _Tpvec& x) \
1399 { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
1400 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1401 { return v_fma(a, a, b * b); } \
1402 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1403 { return v_sqrt(v_fma(a, a, b*b)); }
1404
1405OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
1406OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
1407
1408 inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
1409{
1410 return a * b + c;
1411}
1412
1413 inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
1414{
1415 return v_fma(a, b, c);
1416}
1417
1418 inline v_float32x8 v_invsqrt(const v_float32x8& x)
1419{
1420 v_float32x8 half = x * v256_setall_f32(0.5);
1421 v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val));
1422 // todo: _mm256_fnmsub_ps
1423 t *= v256_setall_f32(1.5) - ((t * t) * half);
1424 return t;
1425}
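// The float32 v_invsqrt refines the ~12-bit _mm256_rsqrt_ps estimate with one
// Newton-Raphson step: for y ~ 1/sqrt(x) the update is y' = y * (1.5 - 0.5*x*y*y),
// which is exactly the "t *= 1.5 - (t*t)*half" line above. A scalar sketch
// (illustrative only):
//
//   float refine_invsqrt(float x, float y) { return y * (1.5f - 0.5f * x * y * y); }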
1426
1427 inline v_float64x4 v_invsqrt(const v_float64x4& x)
1428{
1429 return v256_setall_f64(1.) / v_sqrt(x);
1430}
1431
1432 /** Absolute values **/
1433 #define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix) \
1434 inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \
1435 { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); }
1436
1437OPENCV_HAL_IMPL_AVX_ABS(int8x32, epi8)
1438OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
1439OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32)
1440
1441 inline v_float32x8 v_abs(const v_float32x8& x)
1442{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
1443 inline v_float64x4 v_abs(const v_float64x4& x)
1444{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
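// Both floating-point v_abs overloads clear the IEEE-754 sign bit with a bitwise AND:
// 0x7fffffff for 32-bit floats, and 0x7fffffffffffffff for doubles (built above by
// shifting an all-ones register right by one bit).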
1445
1446 /** Absolute difference **/
1447 inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
1448{ return v_add_wrap(a - b, b - a); }
1449 inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
1450{ return v_add_wrap(a - b, b - a); }
1451 inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
1452{ return v_max(a, b) - v_min(a, b); }
1453
1454 inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
1455{
1456 v_int8x32 d = v_sub_wrap(a, b);
1457 v_int8x32 m = a < b;
1458 return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1459}
1460
1461 inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
1462{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1463
1464 inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
1465{
1466 v_int32x8 d = a - b;
1467 v_int32x8 m = a < b;
1468 return v_reinterpret_as_u32((d ^ m) - m);
1469}
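// The (d ^ m) - m idiom (also used with v_sub_wrap in the 8-bit variant above and in
// v_absdiffs below) conditionally negates d: m is all ones exactly where a < b, and
// (x ^ -1) - (-1) == -x while (x ^ 0) - 0 == x, so the result is |a - b| reinterpreted
// as unsigned.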
1470
1471 inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
1472{ return v_abs(a - b); }
1473
1474 inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
1475{ return v_abs(a - b); }
1476
1477 /** Saturating absolute difference **/
1478 inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
1479{
1480 v_int8x32 d = a - b;
1481 v_int8x32 m = a < b;
1482 return (d ^ m) - m;
1483}
1484 inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
1485{ return v_max(a, b) - v_min(a, b); }
1486
1487 ////////// Conversions /////////
1488
1489 /** Rounding **/
1490 inline v_int32x8 v_round(const v_float32x8& a)
1491{ return v_int32x8(_mm256_cvtps_epi32(a.val)); }
1492
1493 inline v_int32x8 v_round(const v_float64x4& a)
1494{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); }
1495
1496 inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b)
1497{
1498 __m128i ai = _mm256_cvtpd_epi32(a.val), bi = _mm256_cvtpd_epi32(b.val);
1499 return v_int32x8(_v256_combine(ai, bi));
1500}
1501
1502 inline v_int32x8 v_trunc(const v_float32x8& a)
1503{ return v_int32x8(_mm256_cvttps_epi32(a.val)); }
1504
1505 inline v_int32x8 v_trunc(const v_float64x4& a)
1506{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvttpd_epi32(a.val))); }
1507
1508 inline v_int32x8 v_floor(const v_float32x8& a)
1509{ return v_int32x8(_mm256_cvttps_epi32(_mm256_floor_ps(a.val))); }
1510
1511 inline v_int32x8 v_floor(const v_float64x4& a)
1512{ return v_trunc(v_float64x4(_mm256_floor_pd(a.val))); }
1513
1514 inline v_int32x8 v_ceil(const v_float32x8& a)
1515{ return v_int32x8(_mm256_cvttps_epi32(_mm256_ceil_ps(a.val))); }
1516
1517 inline v_int32x8 v_ceil(const v_float64x4& a)
1518{ return v_trunc(v_float64x4(_mm256_ceil_pd(a.val))); }
1519
1520 /** To float **/
1521 inline v_float32x8 v_cvt_f32(const v_int32x8& a)
1522{ return v_float32x8(_mm256_cvtepi32_ps(a.val)); }
1523
1524 inline v_float32x8 v_cvt_f32(const v_float64x4& a)
1525{ return v_float32x8(_mm256_castps128_ps256(_mm256_cvtpd_ps(a.val))); }
1526
1527 inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
1528{
1529 __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val);
1530 return v_float32x8(_v256_combine(af, bf));
1531}
1532
1533 inline v_float64x4 v_cvt_f64(const v_int32x8& a)
1534{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_low(a.val))); }
1535
1536 inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
1537{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_high(a.val))); }
1538
1539 inline v_float64x4 v_cvt_f64(const v_float32x8& a)
1540{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_low(a.val))); }
1541
1542 inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
1543{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }
1544
1545 // from (Mysticial and wim) https://stackoverflow.com/q/41144668
1546 inline v_float64x4 v_cvt_f64(const v_int64x4& v)
1547{
1548 // constants encoded as floating-point
1549 __m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); // 2^52
1550 __m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
1551 __m256i magic_i_all = _mm256_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
1552 __m256d magic_d_all = _mm256_castsi256_pd(magic_i_all);
1553
1554 // Blend the 32 least significant bits of v with magic_i_lo
1555 __m256i v_lo = _mm256_blend_epi32(magic_i_lo, v.val, 0x55);
1556 // Extract the 32 most significant bits of v
1557 __m256i v_hi = _mm256_srli_epi64(v.val, 32);
1558 // Flip the msb of v_hi and blend with 0x45300000
1559 v_hi = _mm256_xor_si256(v_hi, magic_i_hi32);
1560 // Compute in double precision
1561 __m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all);
1562 // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
1563 __m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo));
1564 return v_float64x4(result);
1565}
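// How the trick works, per 64-bit lane with v = hi * 2^32 + lo (hi signed, lo unsigned):
//   v_lo as a double  = 2^52 + lo                  (lo blended into the mantissa of 2^52)
//   v_hi as a double  = 2^84 + 2^63 + hi * 2^32    (sign bit of hi flipped by the xor)
//   (v_hi - magic_d_all) + v_lo = (hi * 2^32 - 2^52) + (2^52 + lo) = hi * 2^32 + lo
// The subtraction must happen before the final addition, hence the ordering note above.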
1566
1567 ////////// Lookup table access ////////////
1568
1569 inline v_int8x32 v256_lut(const schar* tab, const int* idx)
1570{
1571 return v_int8x32(_mm256_setr_epi8(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
1572 tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]],
1573 tab[idx[16]], tab[idx[17]], tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
1574 tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]], tab[idx[30]], tab[idx[31]]));
1575}
1576 inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)
1577{
1578 return v_int8x32(_mm256_setr_epi16(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]), *(const short*)(tab + idx[ 3]),
1579 *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]), *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]),
1580 *(const short*)(tab + idx[ 8]), *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
1581 *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]), *(const short*)(tab + idx[15])));
1582}
1583 inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)
1584{
1585 return v_int8x32(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 1));
1586}
1587 inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); }
1588 inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); }
1589 inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); }
1590
1591 inline v_int16x16 v256_lut(const short* tab, const int* idx)
1592{
1593 return v_int16x16(_mm256_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
1594 tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
1595}
1596 inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)
1597{
1598 return v_int16x16(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 2));
1599}
1600 inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)
1601{
1602 #if defined(__GNUC__)
1603 return v_int16x16(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 2)); // GCC declares the intrinsic's base pointer as const long long int*, hence the cast
1604 #else
1605 return v_int16x16(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 2));
1606 #endif
1607}
1608 inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); }
1609 inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); }
1610 inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); }
1611
1612 inline v_int32x8 v256_lut(const int* tab, const int* idx)
1613{
1614 return v_int32x8(_mm256_i32gather_epi32(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
1615}
1616 inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
1617{
1618 #if defined(__GNUC__)
1619 return v_int32x8(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
1620 #else
1621 return v_int32x8(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
1622 #endif
1623}
1624 inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
1625{
1626 return v_int32x8(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
1627}
1628 inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
1629 inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
1630 inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); }
1631
1632 inline v_int64x4 v256_lut(const int64* tab, const int* idx)
1633{
1634 #if defined(__GNUC__)
1635 return v_int64x4(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 8));
1636 #else
1637 return v_int64x4(_mm256_i32gather_epi64(tab, _mm_loadu_si128((const __m128i*)idx), 8));
1638 #endif
1639}
1640 inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
1641{
1642 return v_int64x4(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
1643}
1644 inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); }
1645 inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }
1646
1647 inline v_float32x8 v256_lut(const float* tab, const int* idx)
1648{
1649 return v_float32x8(_mm256_i32gather_ps(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
1650}
1651 inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); }
1652 inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); }
1653
1654 inline v_float64x4 v256_lut(const double* tab, const int* idx)
1655{
1656 return v_float64x4(_mm256_i32gather_pd(tab, _mm_loadu_si128((const __m128i*)idx), 8));
1657}
1658 inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_v256_combine(_mm_loadu_pd(tab + idx[0]), _mm_loadu_pd(tab + idx[1]))); }
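// Scalar view of the v256_lut family (idx holds element offsets into tab, not bytes):
//   v256_lut(tab, idx)[i]             = tab[idx[i]]
//   v256_lut_pairs(tab, idx)[2*i + j] = tab[idx[i] + j],  j = 0, 1
//   v256_lut_quads(tab, idx)[4*i + j] = tab[idx[i] + j],  j = 0, 1, 2, 3
// Only as many idx entries are read as are needed to fill one 256-bit register.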
1659
1660 inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
1661{
1662 return v_int32x8(_mm256_i32gather_epi32(tab, idxvec.val, 4));
1663}
1664
1665 inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec)
1666{
1667 return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
1668}
1669
1670 inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
1671{
1672 return v_float32x8(_mm256_i32gather_ps(tab, idxvec.val, 4));
1673}
1674
1675 inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
1676{
1677 return v_float64x4(_mm256_i32gather_pd(tab, _mm256_castsi256_si128(idxvec.val), 8));
1678}
1679
1680 inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
1681{
1682 int CV_DECL_ALIGNED(32) idx[8];
1683 v_store_aligned(idx, idxvec);
1684 __m128 z = _mm_setzero_ps();
1685 __m128 xy01, xy45, xy23, xy67;
1686 xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0]));
1687 xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1]));
1688 xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4]));
1689 xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5]));
1690 __m256 xy0145 = _v256_combine(xy01, xy45);
1691 xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2]));
1692 xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3]));
1693 xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6]));
1694 xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7]));
1695 __m256 xy2367 = _v256_combine(xy23, xy67);
1696
1697 __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367);
1698 __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367);
1699
1700 x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367));
1701 y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367));
1702}
1703
1704inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
1705{
1706 int CV_DECL_ALIGNED(32) idx[4];
1707 v_store_low(idx, idxvec);
1708 __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
1709 __m128d xy2 = _mm_loadu_pd(tab + idx[2]);
1710 __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
1711 __m128d xy3 = _mm_loadu_pd(tab + idx[3]);
1712 __m256d xy02 = _v256_combine(xy0, xy2);
1713 __m256d xy13 = _v256_combine(xy1, xy3);
1714
1715 x = v_float64x4(_mm256_unpacklo_pd(xy02, xy13));
1716 y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13));
1717}
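// v_lut_deinterleave expects tab to hold interleaved (x, y) pairs: for each lane i,
// x[i] = tab[idx[i]] and y[i] = tab[idx[i] + 1]. A scalar sketch of the access pattern
// for the double version (illustrative only):
//
//   for (int i = 0; i < 4; i++) { x[i] = tab[idx[i]]; y[i] = tab[idx[i] + 1]; }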
1718
1719inline v_int8x32 v_interleave_pairs(const v_int8x32& vec)
1720{
1721 return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200)));
1722}
1723 inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1724 inline v_int8x32 v_interleave_quads(const v_int8x32& vec)
1725{
1726 return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400)));
1727}
1728 inline v_uint8x32 v_interleave_quads(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1729
1730 inline v_int16x16 v_interleave_pairs(const v_int16x16& vec)
1731{
1732 return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100)));
1733}
1734 inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1735 inline v_int16x16 v_interleave_quads(const v_int16x16& vec)
1736{
1737 return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100)));
1738}
1739 inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1740
1741 inline v_int32x8 v_interleave_pairs(const v_int32x8& vec)
1742{
1743 return v_int32x8(_mm256_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
1744}
1745 inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1746 inline v_float32x8 v_interleave_pairs(const v_float32x8& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1747
1748 inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
1749{
1750 return v_int8x32(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100))),
1751 _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1752}
1753 inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1754
1755 inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
1756{
1757 return v_int16x16(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100))),
1758 _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1759}
1760 inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1761
1762 inline v_int32x8 v_pack_triplets(const v_int32x8& vec)
1763{
1764 return v_int32x8(_mm256_permutevar8x32_epi32(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1765}
1766 inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
1767 inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
1768{
1769 return v_float32x8(_mm256_permutevar8x32_ps(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1770}
1771
1772 ////////// Matrix operations /////////
1773
1774 //////// Dot Product ////////
1775
1776 // 16 >> 32
1777 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
1778{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
1779 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1780{ return v_dotprod(a, b) + c; }
1781
1782 // 32 >> 64
1783 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
1784{
1785 __m256i even = _mm256_mul_epi32(a.val, b.val);
1786 __m256i odd = _mm256_mul_epi32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
1787 return v_int64x4(_mm256_add_epi64(even, odd));
1788}
1789 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
1790{ return v_dotprod(a, b) + c; }
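// Scalar view of the 32 -> 64 dot product: for j = 0..3,
//   result[j] = (int64)a[2*j] * b[2*j] + (int64)a[2*j+1] * b[2*j+1]   (+ c[j] in the 3-arg form)
// _mm256_mul_epi32 multiplies the even 32-bit lanes; shifting right by 32 exposes the
// odd lanes to the same instruction.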
1791
1792 // 8 >> 32
1793 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
1794{
1795 __m256i even_m = _mm256_set1_epi32(0xFF00FF00);
1796 __m256i even_a = _mm256_blendv_epi8(a.val, _mm256_setzero_si256(), even_m);
1797 __m256i odd_a = _mm256_srli_epi16(a.val, 8);
1798
1799 __m256i even_b = _mm256_blendv_epi8(b.val, _mm256_setzero_si256(), even_m);
1800 __m256i odd_b = _mm256_srli_epi16(b.val, 8);
1801
1802 __m256i prod0 = _mm256_madd_epi16(even_a, even_b);
1803 __m256i prod1 = _mm256_madd_epi16(odd_a, odd_b);
1804 return v_uint32x8(_mm256_add_epi32(prod0, prod1));
1805}
1806 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
1807{ return v_dotprod_expand(a, b) + c; }
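// Scalar view of the unsigned 8 -> 32 expansion: for j = 0..7,
//   result[j] = a[4j]*b[4j] + a[4j+1]*b[4j+1] + a[4j+2]*b[4j+2] + a[4j+3]*b[4j+3]
// Splitting into even and odd bytes keeps every 16-bit operand <= 255, so the signed
// multiply-add in _mm256_madd_epi16 cannot overflow.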
1808
1809 inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
1810{
1811 __m256i even_a = _mm256_srai_epi16(_mm256_bslli_epi128(a.val, 1), 8);
1812 __m256i odd_a = _mm256_srai_epi16(a.val, 8);
1813
1814 __m256i even_b = _mm256_srai_epi16(_mm256_bslli_epi128(b.val, 1), 8);
1815 __m256i odd_b = _mm256_srai_epi16(b.val, 8);
1816
1817 __m256i prod0 = _mm256_madd_epi16(even_a, even_b);
1818 __m256i prod1 = _mm256_madd_epi16(odd_a, odd_b);
1819 return v_int32x8(_mm256_add_epi32(prod0, prod1));
1820}
1821 inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
1822{ return v_dotprod_expand(a, b) + c; }
1823
1824 // 16 >> 64
1825 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
1826{
1827 __m256i mullo = _mm256_mullo_epi16(a.val, b.val);
1828 __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
1829 __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
1830 __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);
1831
1832 __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
1833 __m256i p13 = _mm256_srli_epi64(mul0, 32);
1834 __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
1835 __m256i p57 = _mm256_srli_epi64(mul1, 32);
1836
1837 __m256i p15_ = _mm256_add_epi64(p02, p13);
1838 __m256i p9d_ = _mm256_add_epi64(p46, p57);
1839
1840 return v_uint64x4(_mm256_add_epi64(
1841 _mm256_unpacklo_epi64(p15_, p9d_),
1842 _mm256_unpackhi_epi64(p15_, p9d_)
1843 ));
1844}
1845 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
1846{ return v_dotprod_expand(a, b) + c; }
1847
1848 inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
1849{
1850 __m256i prod = _mm256_madd_epi16(a.val, b.val);
1851 __m256i sign = _mm256_srai_epi32(prod, 31);
1852
1853 __m256i lo = _mm256_unpacklo_epi32(prod, sign);
1854 __m256i hi = _mm256_unpackhi_epi32(prod, sign);
1855
1856 return v_int64x4(_mm256_add_epi64(
1857 _mm256_unpacklo_epi64(lo, hi),
1858 _mm256_unpackhi_epi64(lo, hi)
1859 ));
1860}
1861 inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
1862{ return v_dotprod_expand(a, b) + c; }
1863
1864 // 32 >> 64f
1865 inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
1866{ return v_cvt_f64(v_dotprod(a, b)); }
1867 inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
1868{ return v_dotprod_expand(a, b) + c; }
1869
1870 //////// Fast Dot Product ////////
1871
1872 // 16 >> 32
1873 inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
1874{ return v_dotprod(a, b); }
1875 inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1876{ return v_dotprod(a, b, c); }
1877
1878 // 32 >> 64
1879 inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
1880{ return v_dotprod(a, b); }
1881 inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
1882{ return v_dotprod(a, b, c); }
1883
1884 // 8 >> 32
1885 inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b)
1886{ return v_dotprod_expand(a, b); }
1887 inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
1888{ return v_dotprod_expand(a, b, c); }
1889
1890 inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b)
1891{ return v_dotprod_expand(a, b); }
1892 inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
1893{ return v_dotprod_expand(a, b, c); }
1894
1895 // 16 >> 64
1896 inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
1897{
1898 __m256i mullo = _mm256_mullo_epi16(a.val, b.val);
1899 __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
1900 __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
1901 __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);
1902
1903 __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
1904 __m256i p13 = _mm256_srli_epi64(mul0, 32);
1905 __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
1906 __m256i p57 = _mm256_srli_epi64(mul1, 32);
1907
1908 __m256i p15_ = _mm256_add_epi64(p02, p13);
1909 __m256i p9d_ = _mm256_add_epi64(p46, p57);
1910
1911 return v_uint64x4(_mm256_add_epi64(p15_, p9d_));
1912}
1913 inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
1914{ return v_dotprod_expand_fast(a, b) + c; }
1915
1916 inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
1917{
1918 __m256i prod = _mm256_madd_epi16(a.val, b.val);
1919 __m256i sign = _mm256_srai_epi32(prod, 31);
1920 __m256i lo = _mm256_unpacklo_epi32(prod, sign);
1921 __m256i hi = _mm256_unpackhi_epi32(prod, sign);
1922 return v_int64x4(_mm256_add_epi64(lo, hi));
1923}
1924 inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
1925{ return v_dotprod_expand_fast(a, b) + c; }
1926
1927 // 32 >> 64f
1928 inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
1929{ return v_dotprod_expand(a, b); }
1930 inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
1931{ return v_dotprod_expand(a, b, c); }
1932
1933 #define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \
1934 v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
1935
1936 inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
1937 const v_float32x8& m1, const v_float32x8& m2,
1938 const v_float32x8& m3)
1939{
1940 v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
1941 v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
1942 v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
1943 v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
1944 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
1945}
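// v_matmul processes the two 128-bit halves independently: each half holds a
// 4-element vector, and mK supplies the coefficients that multiply element K of that
// half. Per half, dst[k] = v[0]*m0[k] + v[1]*m1[k] + v[2]*m2[k] + v[3]*m3[k];
// v_matmuladd below replaces the v[3]*m3 term with a plain addition of a.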
1946
1947 inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
1948 const v_float32x8& m1, const v_float32x8& m2,
1949 const v_float32x8& a)
1950{
1951 v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
1952 v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
1953 v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
1954 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
1955}
1956
1957 #define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
1958 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1959 const _Tpvec& a2, const _Tpvec& a3, \
1960 _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1961 { \
1962 __m256i t0 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val)); \
1963 __m256i t1 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val)); \
1964 __m256i t2 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val)); \
1965 __m256i t3 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val)); \
1966 b0.val = cast_to(_mm256_unpacklo_epi64(t0, t1)); \
1967 b1.val = cast_to(_mm256_unpackhi_epi64(t0, t1)); \
1968 b2.val = cast_to(_mm256_unpacklo_epi64(t2, t3)); \
1969 b3.val = cast_to(_mm256_unpackhi_epi64(t2, t3)); \
1970 }
1971
1972OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1973OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1974OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps)
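// v_transpose4x4 transposes two 4x4 blocks at once, one per 128-bit half: within each
// half, b_k[j] = a_j[k] for j, k = 0..3 (all unpack instructions operate per lane).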
1975
1976
1977
1978 /* Expand */
1979 #define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
1980 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1981 { \
1982 b0.val = intrin(_v256_extract_low(a.val)); \
1983 b1.val = intrin(_v256_extract_high(a.val)); \
1984 } \
1985 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1986 { return _Tpwvec(intrin(_v256_extract_low(a.val))); } \
1987 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1988 { return _Tpwvec(intrin(_v256_extract_high(a.val))); } \
1989 inline _Tpwvec v256_load_expand(const _Tp* ptr) \
1990 { \
1991 __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
1992 return _Tpwvec(intrin(a)); \
1993 }
1994
1995OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32, v_uint16x16, uchar, _mm256_cvtepu8_epi16)
1996OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32, v_int16x16, schar, _mm256_cvtepi8_epi16)
1997OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8, ushort, _mm256_cvtepu16_epi32)
1998OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16, v_int32x8, short, _mm256_cvtepi16_epi32)
1999OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8, v_uint64x4, unsigned, _mm256_cvtepu32_epi64)
2000OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8, v_int64x4, int, _mm256_cvtepi32_epi64)
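// Scalar view of the expand family (n = number of lanes of the narrow type):
//   v_expand(a, b0, b1):    b0[i] = widen(a[i]),  b1[i] = widen(a[i + n/2])
//   v_expand_low(a)[i]      = widen(a[i])
//   v_expand_high(a)[i]     = widen(a[i + n/2])
//   v256_load_expand(ptr)   widens the first n/2 elements loaded from ptr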
2001
2002 #define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin) \
2003 inline _Tpvec v256_load_expand_q(const _Tp* ptr) \
2004 { \
2005 __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
2006 return _Tpvec(intrin(a)); \
2007 }
2008
2009OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32)
2010OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8, schar, _mm256_cvtepi8_epi32)
2011
2012 /* pack */
2013 // 16
2014 inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
2015{ return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); }
2016
2017 inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
2018{
2019 __m256i t = _mm256_set1_epi16(255);
2020 __m256i a1 = _mm256_min_epu16(a.val, t);
2021 __m256i b1 = _mm256_min_epu16(b.val, t);
2022 return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a1, b1)));
2023}
2024
2025 inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
2026{
2027 return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val)));
2028}
2029
2030 inline void v_pack_store(schar* ptr, const v_int16x16& a)
2031{ v_store_low(ptr, v_pack(a, a)); }
2032
2033 inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
2034{
2035 const __m256i m = _mm256_set1_epi16(255);
2036 __m256i am = _mm256_min_epu16(a.val, m);
2037 am = _v256_shuffle_odd_64(_mm256_packus_epi16(am, am));
2038 v_store_low(ptr, v_uint8x32(am));
2039}
2040
2041 inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
2042{ v_store_low(ptr, v_pack_u(a, a)); }
2043
2044 template<int n> inline
2045v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
2046{
2047 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
2048 v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
2049 return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
2050 v_reinterpret_as_s16((b + delta) >> n));
2051}
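// The v_rshr_pack family applies a rounding right shift before the saturating pack.
// A scalar sketch of the intended per-lane result for the uint16 -> uint8 case above
// (illustrative only; the vector code adds the bias in 16-bit lanes, so the biased
// value is assumed to still fit in 16 bits):
//
//   unsigned char rshr_pack_u16(unsigned v, int n)      // 0 < n <= 16
//   {
//       unsigned r = (v + (1u << (n - 1))) >> n;        // round half up
//       return (unsigned char)(r > 255 ? 255 : r);      // saturate to uchar
//   }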
2052
2053 template<int n> inline
2054 void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
2055{
2056 v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
2057 v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
2058}
2059
2060 template<int n> inline
2061v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
2062{
2063 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2064 return v_pack_u((a + delta) >> n, (b + delta) >> n);
2065}
2066
2067 template<int n> inline
2068 void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
2069{
2070 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2071 v_pack_u_store(ptr, (a + delta) >> n);
2072}
2073
2074 template<int n> inline
2075v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
2076{
2077 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2078 return v_pack((a + delta) >> n, (b + delta) >> n);
2079}
2080
2081 template<int n> inline
2082 void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
2083{
2084 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2085 v_pack_store(ptr, (a + delta) >> n);
2086}
2087
2088 // 32
2089 inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
2090{ return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); }
2091
2092 inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
2093{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }
2094
2095 inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
2096{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }
2097
2098 inline void v_pack_store(short* ptr, const v_int32x8& a)
2099{ v_store_low(ptr, v_pack(a, a)); }
2100
2101 inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
2102{
2103 const __m256i m = _mm256_set1_epi32(65535);
2104 __m256i am = _mm256_min_epu32(a.val, m);
2105 am = _v256_shuffle_odd_64(_mm256_packus_epi32(am, am));
2106 v_store_low(ptr, v_uint16x16(am));
2107}
2108
2109 inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
2110{ v_store_low(ptr, v_pack_u(a, a)); }
2111
2112
2113 template<int n> inline
2114v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
2115{
2116 // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
2117 v_uint32x8 delta = v256_setall_u32(1 << (n-1));
2118 return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
2119 v_reinterpret_as_s32((b + delta) >> n));
2120}
2121
2122 template<int n> inline
2123 void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
2124{
2125 v_uint32x8 delta = v256_setall_u32(1 << (n-1));
2126 v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
2127}
2128
2129 template<int n> inline
2130v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
2131{
2132 v_int32x8 delta = v256_setall_s32(1 << (n-1));
2133 return v_pack_u((a + delta) >> n, (b + delta) >> n);
2134}
2135
2136 template<int n> inline
2137 void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
2138{
2139 v_int32x8 delta = v256_setall_s32(1 << (n-1));
2140 v_pack_u_store(ptr, (a + delta) >> n);
2141}
2142
2143 template<int n> inline
2144v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
2145{
2146 v_int32x8 delta = v256_setall_s32(1 << (n-1));
2147 return v_pack((a + delta) >> n, (b + delta) >> n);
2148}
2149
2150 template<int n> inline
2151 void v_rshr_pack_store(short* ptr, const v_int32x8& a)
2152{
2153 v_int32x8 delta = v256_setall_s32(1 << (n-1));
2154 v_pack_store(ptr, (a + delta) >> n);
2155}
2156
2157 // 64
2158 // Non-saturating pack
2159 inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
2160{
2161 __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
2162 __m256i b0 = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0));
2163 __m256i ab = _mm256_unpacklo_epi64(a0, b0); // a0, a1, b0, b1, a2, a3, b2, b3
2164 return v_uint32x8(_v256_shuffle_odd_64(ab));
2165}
2166
2167 inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
2168{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
2169
2170 inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
2171{
2172 __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
2173 v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
2174}
2175
2176 inline void v_pack_store(int* ptr, const v_int64x4& b)
2177{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
2178
2179 template<int n> inline
2180v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
2181{
2182 v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
2183 return v_pack((a + delta) >> n, (b + delta) >> n);
2184}
2185
2186 template<int n> inline
2187 void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
2188{
2189 v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
2190 v_pack_store(ptr, (a + delta) >> n);
2191}
2192
2193 template<int n> inline
2194v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
2195{
2196 v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
2197 return v_pack((a + delta) >> n, (b + delta) >> n);
2198}
2199
2200 template<int n> inline
2201 void v_rshr_pack_store(int* ptr, const v_int64x4& a)
2202{
2203 v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
2204 v_pack_store(ptr, (a + delta) >> n);
2205}
2206
2207 // pack boolean
2208 inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
2209{
2210 __m256i ab = _mm256_packs_epi16(a.val, b.val);
2211 return v_uint8x32(_v256_shuffle_odd_64(ab));
2212}
2213
2214 inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
2215 const v_uint32x8& c, const v_uint32x8& d)
2216{
2217 __m256i ab = _mm256_packs_epi32(a.val, b.val);
2218 __m256i cd = _mm256_packs_epi32(c.val, d.val);
2219
2220 __m256i abcd = _v256_shuffle_odd_64(_mm256_packs_epi16(ab, cd));
2221 return v_uint8x32(_mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0)));
2222}
2223
2224 inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
2225 const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
2226 const v_uint64x4& g, const v_uint64x4& h)
2227{
2228 __m256i ab = _mm256_packs_epi32(a.val, b.val);
2229 __m256i cd = _mm256_packs_epi32(c.val, d.val);
2230 __m256i ef = _mm256_packs_epi32(e.val, f.val);
2231 __m256i gh = _mm256_packs_epi32(g.val, h.val);
2232
2233 __m256i abcd = _mm256_packs_epi32(ab, cd);
2234 __m256i efgh = _mm256_packs_epi32(ef, gh);
2235 __m256i pkall = _v256_shuffle_odd_64(_mm256_packs_epi16(abcd, efgh));
2236
2237 __m256i rev = _mm256_alignr_epi8(pkall, pkall, 8);
2238 return v_uint8x32(_mm256_unpacklo_epi16(pkall, rev));
2239}
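// v_pack_b packs element masks (each source lane is expected to be 0 or all ones) into
// a byte mask: byte k of the result is 0xff exactly when element k of the concatenated
// inputs (a, b[, c, d, ...]) is all ones. The extra shuffles/permutes only restore the
// natural element order that the per-lane pack instructions would otherwise interleave.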
2240
2241 /* Recombine */
2242 // it's implemented above, together with the load and store operations
2243
2244 /* Extract */
2245 #define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec) \
2246 template<int s> \
2247 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2248 { return v_rotate_right<s>(a, b); }
2249
2250OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32)
2251OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32)
2252OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16)
2253OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16)
2254OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8)
2255OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8)
2256OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4)
2257OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
2258OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
2259OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
2260
2261 template<int i>
2262 inline uchar v_extract_n(v_uint8x32 a)
2263{
2264 return (uchar)_v256_extract_epi8<i>(a.val);
2265}
2266
2267 template<int i>
2268 inline schar v_extract_n(v_int8x32 a)
2269{
2270 return (schar)v_extract_n<i>(v_reinterpret_as_u8(a));
2271}
2272
2273 template<int i>
2274 inline ushort v_extract_n(v_uint16x16 a)
2275{
2276 return (ushort)_v256_extract_epi16<i>(a.val);
2277}
2278
2279 template<int i>
2280 inline short v_extract_n(v_int16x16 a)
2281{
2282 return (short)v_extract_n<i>(v_reinterpret_as_u16(a));
2283}
2284
2285 template<int i>
2286 inline uint v_extract_n(v_uint32x8 a)
2287{
2288 return (uint)_v256_extract_epi32<i>(a.val);
2289}
2290
2291 template<int i>
2292 inline int v_extract_n(v_int32x8 a)
2293{
2294 return (int)v_extract_n<i>(v_reinterpret_as_u32(a));
2295}
2296
2297 template<int i>
2298 inline uint64 v_extract_n(v_uint64x4 a)
2299{
2300 return (uint64)_v256_extract_epi64<i>(a.val);
2301}
2302
2303 template<int i>
2304 inline int64 v_extract_n(v_int64x4 v)
2305{
2306 return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
2307}
2308
2309 template<int i>
2310 inline float v_extract_n(v_float32x8 v)
2311{
2312 union { uint iv; float fv; } d;
2313 d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
2314 return d.fv;
2315}
2316
2317 template<int i>
2318 inline double v_extract_n(v_float64x4 v)
2319{
2320 union { uint64 iv; double dv; } d;
2321 d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
2322 return d.dv;
2323}
2324
2325 template<int i>
2326 inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
2327{
2328 static const __m256i perm = _mm256_set1_epi32((char)i);
2329 return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
2330}
2331
2332 template<int i>
2333 inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
2334{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2335
2336 template<int i>
2337 inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
2338{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2339
2340
2341 ///////////////////// load deinterleave /////////////////////////////
2342
2343 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
2344{
2345 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2346 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2347
2348 const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
2349 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
2350 __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
2351 __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
2352 __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2353 __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2354 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2355 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2356 a = v_uint8x32(a0);
2357 b = v_uint8x32(b0);
2358}
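// Scalar view of the two-channel deinterleave: a[i] = ptr[2*i], b[i] = ptr[2*i + 1].
// The byte shuffle separates even and odd bytes within each 128-bit lane, and the
// permute2x128/unpack pair stitches the four half-lane results back into channel order;
// the 16-, 32- and 64-bit overloads below follow the same pattern with wider shuffles.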
2359
2360 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
2361{
2362 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2363 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2364
2365 const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2366 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
2367 __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
2368 __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
2369 __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2370 __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2371 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2372 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2373 a = v_uint16x16(a0);
2374 b = v_uint16x16(b0);
2375}
2376
2377 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
2378{
2379 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2380 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2381
2382 const int sh = 0+2*4+1*16+3*64;
2383 __m256i p0 = _mm256_shuffle_epi32(ab0, sh);
2384 __m256i p1 = _mm256_shuffle_epi32(ab1, sh);
2385 __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2386 __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2387 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2388 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2389 a = v_uint32x8(a0);
2390 b = v_uint32x8(b0);
2391}
2392
2393 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
2394{
2395 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2396 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
2397
2398 __m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16);
2399 __m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16);
2400 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2401 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2402 a = v_uint64x4(a0);
2403 b = v_uint64x4(b0);
2404}
2405
2406 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c )
2407{
2408 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2409 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2410 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
2411
2412 __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
2413 __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
2414
2415 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2416 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2417 const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2418 -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
2419
2420 __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
2421 __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
2422 __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
2423
2424 const __m256i
2425 sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
2426 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
2427 sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
2428 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
2429 sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
2430 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
2431 b0 = _mm256_shuffle_epi8(b0, sh_b);
2432 g0 = _mm256_shuffle_epi8(g0, sh_g);
2433 r0 = _mm256_shuffle_epi8(r0, sh_r);
2434
2435 a = v_uint8x32(b0);
2436 b = v_uint8x32(g0);
2437 c = v_uint8x32(r0);
2438}
2439
2440 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c )
2441{
2442 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2443 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2444 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2445
2446 __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
2447 __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
2448
2449 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2450 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2451 const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2452 -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2453 __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
2454 __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
2455 __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
2456 const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2457 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2458 const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
2459 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
2460 const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2461 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2462 b0 = _mm256_shuffle_epi8(b0, sh_b);
2463 g0 = _mm256_shuffle_epi8(g0, sh_g);
2464 r0 = _mm256_shuffle_epi8(r0, sh_r);
2465
2466 a = v_uint16x16(b0);
2467 b = v_uint16x16(g0);
2468 c = v_uint16x16(r0);
2469}
2470
2471 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
2472{
2473 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2474 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2475 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2476
2477 __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
2478 __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
2479
2480 __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92);
2481 __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
2482 __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);
2483
2484 b0 = _mm256_shuffle_epi32(b0, 0x6c);
2485 g0 = _mm256_shuffle_epi32(g0, 0xb1);
2486 r0 = _mm256_shuffle_epi32(r0, 0xc6);
2487
2488 a = v_uint32x8(b0);
2489 b = v_uint32x8(g0);
2490 c = v_uint32x8(r0);
2491}
2492
2493 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c )
2494{
2495 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2496 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
2497 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2498
2499 __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0);
2500 __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0);
2501 __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b);
2502 __m256i b0 = _mm256_unpacklo_epi64(s01, s20r);
2503 __m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
2504 __m256i r0 = _mm256_unpackhi_epi64(s20r, s12);
2505
2506 a = v_uint64x4(b0);
2507 b = v_uint64x4(g0);
2508 c = v_uint64x4(r0);
2509}
2510
2511 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d )
2512{
2513 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2514 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2515 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
2516 __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
2517 const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
2518 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
2519
2520 __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
2521 __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
2522 __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
2523 __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
2524
2525 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2526 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2527 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2528 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2529
2530 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2531 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2532 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2533 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2534
2535 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2536 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2537 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2538 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2539
2540 a = v_uint8x32(b0);
2541 b = v_uint8x32(g0);
2542 c = v_uint8x32(r0);
2543 d = v_uint8x32(a0);
2544}
2545
2546 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d )
2547{
2548 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2549 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2550 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2551 __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));
2552 const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
2553 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
2554 __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
2555 __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
2556 __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
2557 __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
2558
2559 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2560 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2561 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2562 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2563
2564 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2565 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2566 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2567 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2568
2569 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2570 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2571 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2572 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2573
2574 a = v_uint16x16(b0);
2575 b = v_uint16x16(g0);
2576 c = v_uint16x16(r0);
2577 d = v_uint16x16(a0);
2578}
2579
2580 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
2581{
2582 __m256i p0 = _mm256_loadu_si256((const __m256i*)ptr);
2583 __m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2584 __m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2585 __m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24));
2586
2587 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2588 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2589 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2590 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2591
2592 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2593 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2594 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2595 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2596
2597 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2598 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2599 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2600 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2601
2602 a = v_uint32x8(b0);
2603 b = v_uint32x8(g0);
2604 c = v_uint32x8(r0);
2605 d = v_uint32x8(a0);
2606}
2607
2608 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
2609{
2610 __m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr);
2611 __m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
2612 __m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2613 __m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12));
2614
2615 __m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16);
2616 __m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16);
2617 __m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16);
2618 __m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16);
2619
2620 __m256i b0 = _mm256_unpacklo_epi64(l02, l13);
2621 __m256i g0 = _mm256_unpackhi_epi64(l02, l13);
2622 __m256i r0 = _mm256_unpacklo_epi64(h02, h13);
2623 __m256i a0 = _mm256_unpackhi_epi64(h02, h13);
2624
2625 a = v_uint64x4(b0);
2626 b = v_uint64x4(g0);
2627 c = v_uint64x4(r0);
2628 d = v_uint64x4(a0);
2629}
2630
2631 ///////////////////////////// store interleave /////////////////////////////////////
2632
2633 inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
2634 hal::StoreMode mode=hal::STORE_UNALIGNED )
2635{
2636 __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
2637 __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);
2638
2639 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2640 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2641
2642 if( mode == hal::STORE_ALIGNED_NOCACHE )
2643 {
2644 _mm256_stream_si256((__m256i*)ptr, xy0);
2645 _mm256_stream_si256((__m256i*)(ptr + 32), xy1);
2646 }
2647 else if( mode == hal::STORE_ALIGNED )
2648 {
2649 _mm256_store_si256((__m256i*)ptr, xy0);
2650 _mm256_store_si256((__m256i*)(ptr + 32), xy1);
2651 }
2652 else
2653 {
2654 _mm256_storeu_si256((__m256i*)ptr, xy0);
2655 _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
2656 }
2657}
2658
2659 inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
2660 hal::StoreMode mode=hal::STORE_UNALIGNED )
2661{
2662 __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val);
2663 __m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val);
2664
2665 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2666 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2667
2668 if( mode == hal::STORE_ALIGNED_NOCACHE )
2669 {
2670 _mm256_stream_si256((__m256i*)ptr, xy0);
2671 _mm256_stream_si256((__m256i*)(ptr + 16), xy1);
2672 }
2673 else if( mode == hal::STORE_ALIGNED )
2674 {
2675 _mm256_store_si256((__m256i*)ptr, xy0);
2676 _mm256_store_si256((__m256i*)(ptr + 16), xy1);
2677 }
2678 else
2679 {
2680 _mm256_storeu_si256((__m256i*)ptr, xy0);
2681 _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
2682 }
2683}
2684
2685 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
2686 hal::StoreMode mode=hal::STORE_UNALIGNED )
2687{
2688 __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val);
2689 __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val);
2690
2691 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2692 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2693
2694 if( mode == hal::STORE_ALIGNED_NOCACHE )
2695 {
2696 _mm256_stream_si256((__m256i*)ptr, xy0);
2697 _mm256_stream_si256((__m256i*)(ptr + 8), xy1);
2698 }
2699 else if( mode == hal::STORE_ALIGNED )
2700 {
2701 _mm256_store_si256((__m256i*)ptr, xy0);
2702 _mm256_store_si256((__m256i*)(ptr + 8), xy1);
2703 }
2704 else
2705 {
2706 _mm256_storeu_si256((__m256i*)ptr, xy0);
2707 _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
2708 }
2709}
2710
2711 inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
2712 hal::StoreMode mode=hal::STORE_UNALIGNED )
2713{
2714 __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val);
2715 __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val);
2716
2717 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2718 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2719
2720 if( mode == hal::STORE_ALIGNED_NOCACHE )
2721 {
2722 _mm256_stream_si256((__m256i*)ptr, xy0);
2723 _mm256_stream_si256((__m256i*)(ptr + 4), xy1);
2724 }
2725 else if( mode == hal::STORE_ALIGNED )
2726 {
2727 _mm256_store_si256((__m256i*)ptr, xy0);
2728 _mm256_store_si256((__m256i*)(ptr + 4), xy1);
2729 }
2730 else
2731 {
2732 _mm256_storeu_si256((__m256i*)ptr, xy0);
2733 _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
2734 }
2735}
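// Scalar view of the two-channel interleaved store: ptr[2*i] = x[i], ptr[2*i + 1] = y[i].
// The three branches only pick the store flavour: _mm256_stream_si256 for aligned
// non-temporal stores, _mm256_store_si256 for aligned stores, _mm256_storeu_si256 otherwise.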
2736
2737 inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c,
2738 hal::StoreMode mode=hal::STORE_UNALIGNED )
2739{
2740 const __m256i sh_b = _mm256_setr_epi8(
2741 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
2742 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2743 const __m256i sh_g = _mm256_setr_epi8(
2744 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
2745 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2746 const __m256i sh_r = _mm256_setr_epi8(
2747 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
2748 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2749
2750 __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
2751 __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
2752 __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
2753
2754 const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2755 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2756 const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2757 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2758
2759 __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
2760 __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
2761 __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
2762
2763 __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2764 __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
2765 __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);
2766
2767 if( mode == hal::STORE_ALIGNED_NOCACHE )
2768 {
2769 _mm256_stream_si256((__m256i*)ptr, bgr0);
2770 _mm256_stream_si256((__m256i*)(ptr + 32), bgr1);
2771 _mm256_stream_si256((__m256i*)(ptr + 64), bgr2);
2772 }
2773 else if( mode == hal::STORE_ALIGNED )
2774 {
2775 _mm256_store_si256((__m256i*)ptr, bgr0);
2776 _mm256_store_si256((__m256i*)(ptr + 32), bgr1);
2777 _mm256_store_si256((__m256i*)(ptr + 64), bgr2);
2778 }
2779 else
2780 {
2781 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2782 _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
2783 _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
2784 }
2785}
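The byte shuffles and blend masks above assemble packed b,g,r triples directly; a user of the universal intrinsics never touches them. The hedged usage sketch below is an editorial illustration, not part of the header, and assumes an AVX2 build in which this file is reached through opencv2/core/hal/intrin.hpp and CV_SIMD256 is 1; it interleaves three 32-byte planes into packed BGR with the function defined above.

    #include <opencv2/core/hal/intrin.hpp>
    #include <cstdio>

    int main()
    {
    #if CV_SIMD256
        unsigned char b[32], g[32], r[32], bgr[96];
        for (int i = 0; i < 32; i++) { b[i] = (unsigned char)i; g[i] = (unsigned char)(i + 64); r[i] = (unsigned char)(i + 128); }

        cv::v_uint8x32 vb = cv::v256_load(b);
        cv::v_uint8x32 vg = cv::v256_load(g);
        cv::v_uint8x32 vr = cv::v256_load(r);

        cv::v_store_interleave(bgr, vb, vg, vr);       // bgr = b0,g0,r0, b1,g1,r1, ...

        printf("%d %d %d\n", bgr[0], bgr[1], bgr[2]);  // expected: 0 64 128
    #endif
        return 0;
    }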
2786
2787 inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c,
2788 hal::StoreMode mode=hal::STORE_UNALIGNED )
2789{
2790 const __m256i sh_b = _mm256_setr_epi8(
2791 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2792 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2793 const __m256i sh_g = _mm256_setr_epi8(
2794 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
2795 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2796 const __m256i sh_r = _mm256_setr_epi8(
2797 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2798 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2799
2800 __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
2801 __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
2802 __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
2803
2804 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2805 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2806 const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2807 -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2808
2809 __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
2810 __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
2811 __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
2812
2813 __m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16);
2814 //__m256i bgr1 = p1;
2815 __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);
2816
2817 if( mode == hal::STORE_ALIGNED_NOCACHE )
2818 {
2819 _mm256_stream_si256((__m256i*)ptr, bgr0);
2820 _mm256_stream_si256((__m256i*)(ptr + 16), p1);
2821 _mm256_stream_si256((__m256i*)(ptr + 32), bgr2);
2822 }
2823 else if( mode == hal::STORE_ALIGNED )
2824 {
2825 _mm256_store_si256((__m256i*)ptr, bgr0);
2826 _mm256_store_si256((__m256i*)(ptr + 16), p1);
2827 _mm256_store_si256((__m256i*)(ptr + 32), bgr2);
2828 }
2829 else
2830 {
2831 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2832 _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
2833 _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
2834 }
2835}
2836
2837 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c,
2838 hal::StoreMode mode=hal::STORE_UNALIGNED )
2839{
2840 __m256i b0 = _mm256_shuffle_epi32(a.val, 0x6c);
2841 __m256i g0 = _mm256_shuffle_epi32(b.val, 0xb1);
2842 __m256i r0 = _mm256_shuffle_epi32(c.val, 0xc6);
2843
2844 __m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24);
2845 __m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
2846 __m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24);
2847
2848 __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2849 //__m256i bgr1 = p2;
2850 __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2851
2852 if( mode == hal::STORE_ALIGNED_NOCACHE )
2853 {
2854 _mm256_stream_si256((__m256i*)ptr, bgr0);
2855 _mm256_stream_si256((__m256i*)(ptr + 8), p2);
2856 _mm256_stream_si256((__m256i*)(ptr + 16), bgr2);
2857 }
2858 else if( mode == hal::STORE_ALIGNED )
2859 {
2860 _mm256_store_si256((__m256i*)ptr, bgr0);
2861 _mm256_store_si256((__m256i*)(ptr + 8), p2);
2862 _mm256_store_si256((__m256i*)(ptr + 16), bgr2);
2863 }
2864 else
2865 {
2866 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2867 _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
2868 _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
2869 }
2870}
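The 32-bit three-channel variant above hides its per-lane reorderings in raw shuffle immediates. As a readability aid (an editorial note, not part of the header), those immediates decode to the following _MM_SHUFFLE forms, which can be checked at compile time:

    #include <immintrin.h>   // provides _MM_SHUFFLE

    static_assert(0x6c == _MM_SHUFFLE(1, 2, 3, 0), "immediate used for b0");
    static_assert(0xb1 == _MM_SHUFFLE(2, 3, 0, 1), "immediate used for g0");
    static_assert(0xc6 == _MM_SHUFFLE(3, 0, 1, 2), "immediate used for r0");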
2871
2872 inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
2873 hal::StoreMode mode=hal::STORE_UNALIGNED )
2874{
2875 __m256i s01 = _mm256_unpacklo_epi64(a.val, b.val);
2876 __m256i s12 = _mm256_unpackhi_epi64(b.val, c.val);
2877 __m256i s20 = _mm256_blend_epi32(c.val, a.val, 0xcc);
2878
2879 __m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16);
2880 __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
2881 __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);
2882
2883 if( mode == hal::STORE_ALIGNED_NOCACHE )
2884 {
2885 _mm256_stream_si256((__m256i*)ptr, bgr0);
2886 _mm256_stream_si256((__m256i*)(ptr + 4), bgr1);
2887 _mm256_stream_si256((__m256i*)(ptr + 8), bgr2);
2888 }
2889 else if( mode == hal::STORE_ALIGNED )
2890 {
2891 _mm256_store_si256((__m256i*)ptr, bgr0);
2892 _mm256_store_si256((__m256i*)(ptr + 4), bgr1);
2893 _mm256_store_si256((__m256i*)(ptr + 8), bgr2);
2894 }
2895 else
2896 {
2897 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2898 _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
2899 _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
2900 }
2901}
2902
2903 inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b,
2904 const v_uint8x32& c, const v_uint8x32& d,
2905 hal::StoreMode mode=hal::STORE_UNALIGNED )
2906{
2907 __m256i bg0 = _mm256_unpacklo_epi8(a.val, b.val);
2908 __m256i bg1 = _mm256_unpackhi_epi8(a.val, b.val);
2909 __m256i ra0 = _mm256_unpacklo_epi8(c.val, d.val);
2910 __m256i ra1 = _mm256_unpackhi_epi8(c.val, d.val);
2911
2912 __m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0);
2913 __m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
2914 __m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
2915 __m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);
2916
2917 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2918 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2919 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2920 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2921
2922 if( mode == hal::STORE_ALIGNED_NOCACHE )
2923 {
2924 _mm256_stream_si256((__m256i*)ptr, bgra0);
2925 _mm256_stream_si256((__m256i*)(ptr + 32), bgra1);
2926 _mm256_stream_si256((__m256i*)(ptr + 64), bgra2);
2927 _mm256_stream_si256((__m256i*)(ptr + 96), bgra3);
2928 }
2929 else if( mode == hal::STORE_ALIGNED )
2930 {
2931 _mm256_store_si256((__m256i*)ptr, bgra0);
2932 _mm256_store_si256((__m256i*)(ptr + 32), bgra1);
2933 _mm256_store_si256((__m256i*)(ptr + 64), bgra2);
2934 _mm256_store_si256((__m256i*)(ptr + 96), bgra3);
2935 }
2936 else
2937 {
2938 _mm256_storeu_si256((__m256i*)ptr, bgra0);
2939 _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
2940 _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
2941 _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
2942 }
2943}
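The cascade of 8-bit and 16-bit unpacks above builds b,g,r,a quads inside each 128-bit lane, and the final permute2x128 calls put the four output blocks in memory order. For reference, an editorial scalar sketch of the layout this function produces (not part of the header):

    // Scalar equivalent of the 4-channel uchar store: 32 interleaved quads.
    static void store_interleave_4ch_scalar(unsigned char* ptr,
                                            const unsigned char a[32],
                                            const unsigned char b[32],
                                            const unsigned char c[32],
                                            const unsigned char d[32])
    {
        for (int i = 0; i < 32; i++)
        {
            ptr[4*i + 0] = a[i];
            ptr[4*i + 1] = b[i];
            ptr[4*i + 2] = c[i];
            ptr[4*i + 3] = d[i];
        }
    }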
2944
2945 inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b,
2946 const v_uint16x16& c, const v_uint16x16& d,
2947 hal::StoreMode mode=hal::STORE_UNALIGNED )
2948{
2949 __m256i bg0 = _mm256_unpacklo_epi16(a.val, b.val);
2950 __m256i bg1 = _mm256_unpackhi_epi16(a.val, b.val);
2951 __m256i ra0 = _mm256_unpacklo_epi16(c.val, d.val);
2952 __m256i ra1 = _mm256_unpackhi_epi16(c.val, d.val);
2953
2954 __m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0);
2955 __m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
2956 __m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1);
2957 __m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1);
2958
2959 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2960 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2961 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2962 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2963
2964 if( mode == hal::STORE_ALIGNED_NOCACHE )
2965 {
2966 _mm256_stream_si256((__m256i*)ptr, bgra0);
2967 _mm256_stream_si256((__m256i*)(ptr + 16), bgra1);
2968 _mm256_stream_si256((__m256i*)(ptr + 32), bgra2);
2969 _mm256_stream_si256((__m256i*)(ptr + 48), bgra3);
2970 }
2971 else if( mode == hal::STORE_ALIGNED )
2972 {
2973 _mm256_store_si256((__m256i*)ptr, bgra0);
2974 _mm256_store_si256((__m256i*)(ptr + 16), bgra1);
2975 _mm256_store_si256((__m256i*)(ptr + 32), bgra2);
2976 _mm256_store_si256((__m256i*)(ptr + 48), bgra3);
2977 }
2978 else
2979 {
2980 _mm256_storeu_si256((__m256i*)ptr, bgra0);
2981 _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
2982 _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
2983 _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
2984 }
2985}
2986
2987 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b,
2988 const v_uint32x8& c, const v_uint32x8& d,
2989 hal::StoreMode mode=hal::STORE_UNALIGNED )
2990{
2991 __m256i bg0 = _mm256_unpacklo_epi32(a.val, b.val);
2992 __m256i bg1 = _mm256_unpackhi_epi32(a.val, b.val);
2993 __m256i ra0 = _mm256_unpacklo_epi32(c.val, d.val);
2994 __m256i ra1 = _mm256_unpackhi_epi32(c.val, d.val);
2995
2996 __m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0);
2997 __m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0);
2998 __m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1);
2999 __m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1);
3000
3001 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
3002 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
3003 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
3004 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
3005
3006 if( mode == hal::STORE_ALIGNED_NOCACHE )
3007 {
3008 _mm256_stream_si256((__m256i*)ptr, bgra0);
3009 _mm256_stream_si256((__m256i*)(ptr + 8), bgra1);
3010 _mm256_stream_si256((__m256i*)(ptr + 16), bgra2);
3011 _mm256_stream_si256((__m256i*)(ptr + 24), bgra3);
3012 }
3013 else if( mode == hal::STORE_ALIGNED )
3014 {
3015 _mm256_store_si256((__m256i*)ptr, bgra0);
3016 _mm256_store_si256((__m256i*)(ptr + 8), bgra1);
3017 _mm256_store_si256((__m256i*)(ptr + 16), bgra2);
3018 _mm256_store_si256((__m256i*)(ptr + 24), bgra3);
3019 }
3020 else
3021 {
3022 _mm256_storeu_si256((__m256i*)ptr, bgra0);
3023 _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
3024 _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
3025 _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
3026 }
3027}
3028
3029 inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b,
3030 const v_uint64x4& c, const v_uint64x4& d,
3031 hal::StoreMode mode=hal::STORE_UNALIGNED )
3032{
3033 __m256i bg0 = _mm256_unpacklo_epi64(a.val, b.val);
3034 __m256i bg1 = _mm256_unpackhi_epi64(a.val, b.val);
3035 __m256i ra0 = _mm256_unpacklo_epi64(c.val, d.val);
3036 __m256i ra1 = _mm256_unpackhi_epi64(c.val, d.val);
3037
3038 __m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16);
3039 __m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16);
3040 __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16);
3041 __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16);
3042
3043 if( mode == hal::STORE_ALIGNED_NOCACHE )
3044 {
3045 _mm256_stream_si256((__m256i*)ptr, bgra0);
3046 _mm256_stream_si256((__m256i*)(ptr + 4), bgra1);
3047 _mm256_stream_si256((__m256i*)(ptr + 8), bgra2);
3048 _mm256_stream_si256((__m256i*)(ptr + 12), bgra3);
3049 }
3050 else if( mode == hal::STORE_ALIGNED )
3051 {
3052 _mm256_store_si256((__m256i*)ptr, bgra0);
3053 _mm256_store_si256((__m256i*)(ptr + 4), bgra1);
3054 _mm256_store_si256((__m256i*)(ptr + 8), bgra2);
3055 _mm256_store_si256((__m256i*)(ptr + 12), bgra3);
3056 }
3057 else
3058 {
3059 _mm256_storeu_si256((__m256i*)ptr, bgra0);
3060 _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
3061 _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
3062 _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
3063 }
3064}
3065
3066 #define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
3067 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
3068 { \
3069 _Tpvec1 a1, b1; \
3070 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
3071 a0 = v_reinterpret_as_##suffix0(a1); \
3072 b0 = v_reinterpret_as_##suffix0(b1); \
3073 } \
3074 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
3075 { \
3076 _Tpvec1 a1, b1, c1; \
3077 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
3078 a0 = v_reinterpret_as_##suffix0(a1); \
3079 b0 = v_reinterpret_as_##suffix0(b1); \
3080 c0 = v_reinterpret_as_##suffix0(c1); \
3081 } \
3082 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
3083 { \
3084 _Tpvec1 a1, b1, c1, d1; \
3085 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
3086 a0 = v_reinterpret_as_##suffix0(a1); \
3087 b0 = v_reinterpret_as_##suffix0(b1); \
3088 c0 = v_reinterpret_as_##suffix0(c1); \
3089 d0 = v_reinterpret_as_##suffix0(d1); \
3090 } \
3091 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
3092 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3093 { \
3094 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3095 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3096 v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
3097 } \
3098 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
3099 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3100 { \
3101 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3102 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3103 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3104 v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
3105 } \
3106 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
3107 const _Tpvec0& c0, const _Tpvec0& d0, \
3108 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3109 { \
3110 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3111 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3112 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3113 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
3114 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
3115 }
3116
3117OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
3118OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16)
3119OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32)
3120OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32)
3121OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
3122OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
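The macro above, together with the instantiations that follow it, supplies the signed-integer and floating-point overloads by reinterpreting to the matching unsigned vectors and reusing the implementations defined earlier in this file. A hedged usage sketch (editorial, assuming an AVX2 build with CV_SIMD256 == 1):

    #include <opencv2/core/hal/intrin.hpp>

    #if CV_SIMD256
    // Interleave two float planes of 8 elements each into dst = x0,y0,x1,y1,...,x7,y7
    // (dst must hold 16 floats). Internally this goes through v_reinterpret_as_u32
    // and the v_uint32x8 overload defined above.
    void interleave_xy(float* dst, const float* x, const float* y)
    {
        cv::v_float32x8 vx = cv::v256_load(x);
        cv::v_float32x8 vy = cv::v256_load(y);
        cv::v_store_interleave(dst, vx, vy);
    }
    #endif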
3123
3124 //
3125 // FP16
3126 //
3127
3128 inline v_float32x8 v256_load_expand(const float16_t* ptr)
3129{
3130 #if CV_FP16
3131 return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
3132 #else
3133 float CV_DECL_ALIGNED(32) buf[8];
3134 for (int i = 0; i < 8; i++)
3135 buf[i] = (float)ptr[i];
3136 return v256_load_aligned(buf);
3137 #endif
3138}
3139
3140 inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
3141{
3142 #if CV_FP16
3143 __m128i ah = _mm256_cvtps_ph(a.val, 0);
3144 _mm_storeu_si128((__m128i*)ptr, ah);
3145 #else
3146 float CV_DECL_ALIGNED(32) buf[8];
3147 v_store_aligned(buf, a);
3148 for (int i = 0; i < 8; i++)
3149 ptr[i] = float16_t(buf[i]);
3150 #endif
3151}
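The two FP16 helpers above widen eight half-precision values to float32 on load and narrow them back on store, using the F16C conversion intrinsics when CV_FP16 is 1 and the scalar loops otherwise. A hedged round-trip sketch (editorial; assumes an AVX2 build where this header is reached through opencv2/core/hal/intrin.hpp):

    #include <opencv2/core/hal/intrin.hpp>

    #if CV_SIMD256
    // Round-trip 8 half-precision values through float32; any float32 arithmetic
    // on 'v' would go between the two calls.
    void copy_fp16_via_f32(cv::float16_t* dst, const cv::float16_t* src)
    {
        cv::v_float32x8 v = cv::v256_load_expand(src);  // FP16 -> FP32 (or the scalar fallback)
        cv::v_pack_store(dst, v);                       // FP32 -> FP16
    }
    #endif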
3152
3153 //
3154 // end of FP16
3155 //
3156
3157 inline void v256_cleanup() { _mm256_zeroall(); }
3158
3159CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3160
3162
3163} // cv::
3164
3165 #endif // OPENCV_HAL_INTRIN_AVX_HPP