OpenCV 4.5.3 (Japanese machine translation)
intrin_avx512.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
4
5 #ifndef OPENCV_HAL_INTRIN_AVX512_HPP
6 #define OPENCV_HAL_INTRIN_AVX512_HPP
7
8 #if defined(_MSC_VER) && (_MSC_VER < 1920 /*MSVS2019*/ )
9 # pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned
10 # pragma warning(disable:4309) // 'argument': truncation of constant value
11 # pragma warning(disable:4310) // cast truncates constant value
12 #endif
13
14 #define CVT_ROUND_MODES_IMPLEMENTED 0
15
16 #define CV_SIMD512 1
17 #define CV_SIMD512_64F 1
18 #define CV_SIMD512_FP16 0 // no native operations with FP16 type. Only load/store from float32x16 are available (if CV_FP16 == 1)
19
20 #define _v512_set_epu64(a7, a6, a5, a4, a3, a2, a1, a0) _mm512_set_epi64((int64)(a7),(int64)(a6),(int64)(a5),(int64)(a4),(int64)(a3),(int64)(a2),(int64)(a1),(int64)(a0))
21 #define _v512_set_epu32(a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
22 _mm512_set_epi64(((int64)(a15)<<32)|(int64)(a14), ((int64)(a13)<<32)|(int64)(a12), ((int64)(a11)<<32)|(int64)(a10), ((int64)( a9)<<32)|(int64)( a8), \
23 ((int64)( a7)<<32)|(int64)( a6), ((int64)( a5)<<32)|(int64)( a4), ((int64)( a3)<<32)|(int64)( a2), ((int64)( a1)<<32)|(int64)( a0))
24 #define _v512_set_epu16(a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
25 a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
26 _v512_set_epu32(((unsigned)(a31)<<16)|(unsigned)(a30), ((unsigned)(a29)<<16)|(unsigned)(a28), ((unsigned)(a27)<<16)|(unsigned)(a26), ((unsigned)(a25)<<16)|(unsigned)(a24), \
27 ((unsigned)(a23)<<16)|(unsigned)(a22), ((unsigned)(a21)<<16)|(unsigned)(a20), ((unsigned)(a19)<<16)|(unsigned)(a18), ((unsigned)(a17)<<16)|(unsigned)(a16), \
28 ((unsigned)(a15)<<16)|(unsigned)(a14), ((unsigned)(a13)<<16)|(unsigned)(a12), ((unsigned)(a11)<<16)|(unsigned)(a10), ((unsigned)( a9)<<16)|(unsigned)( a8), \
29 ((unsigned)( a7)<<16)|(unsigned)( a6), ((unsigned)( a5)<<16)|(unsigned)( a4), ((unsigned)( a3)<<16)|(unsigned)( a2), ((unsigned)( a1)<<16)|(unsigned)( a0))
30 #define _v512_set_epu8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
31 a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
32 a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
33 a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
34 _v512_set_epu32(((unsigned)(a63)<<24)|((unsigned)(a62)<<16)|((unsigned)(a61)<<8)|(unsigned)(a60),((unsigned)(a59)<<24)|((unsigned)(a58)<<16)|((unsigned)(a57)<<8)|(unsigned)(a56), \
35 ((unsigned)(a55)<<24)|((unsigned)(a54)<<16)|((unsigned)(a53)<<8)|(unsigned)(a52),((unsigned)(a51)<<24)|((unsigned)(a50)<<16)|((unsigned)(a49)<<8)|(unsigned)(a48), \
36 ((unsigned)(a47)<<24)|((unsigned)(a46)<<16)|((unsigned)(a45)<<8)|(unsigned)(a44),((unsigned)(a43)<<24)|((unsigned)(a42)<<16)|((unsigned)(a41)<<8)|(unsigned)(a40), \
37 ((unsigned)(a39)<<24)|((unsigned)(a38)<<16)|((unsigned)(a37)<<8)|(unsigned)(a36),((unsigned)(a35)<<24)|((unsigned)(a34)<<16)|((unsigned)(a33)<<8)|(unsigned)(a32), \
38 ((unsigned)(a31)<<24)|((unsigned)(a30)<<16)|((unsigned)(a29)<<8)|(unsigned)(a28),((unsigned)(a27)<<24)|((unsigned)(a26)<<16)|((unsigned)(a25)<<8)|(unsigned)(a24), \
39 ((unsigned)(a23)<<24)|((unsigned)(a22)<<16)|((unsigned)(a21)<<8)|(unsigned)(a20),((unsigned)(a19)<<24)|((unsigned)(a18)<<16)|((unsigned)(a17)<<8)|(unsigned)(a16), \
40 ((unsigned)(a15)<<24)|((unsigned)(a14)<<16)|((unsigned)(a13)<<8)|(unsigned)(a12),((unsigned)(a11)<<24)|((unsigned)(a10)<<16)|((unsigned)( a9)<<8)|(unsigned)( a8), \
41 ((unsigned)( a7)<<24)|((unsigned)( a6)<<16)|((unsigned)( a5)<<8)|(unsigned)( a4),((unsigned)( a3)<<24)|((unsigned)( a2)<<16)|((unsigned)( a1)<<8)|(unsigned)( a0))
42 #define _v512_set_epi8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
43 a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
44 a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
45 a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
46 _v512_set_epu8((uchar)(a63), (uchar)(a62), (uchar)(a61), (uchar)(a60), (uchar)(a59), (uchar)(a58), (uchar)(a57), (uchar)(a56), \
47 (uchar)(a55), (uchar)(a54), (uchar)(a53), (uchar)(a52), (uchar)(a51), (uchar)(a50), (uchar)(a49), (uchar)(a48), \
48 (uchar)(a47), (uchar)(a46), (uchar)(a45), (uchar)(a44), (uchar)(a43), (uchar)(a42), (uchar)(a41), (uchar)(a40), \
49 (uchar)(a39), (uchar)(a38), (uchar)(a37), (uchar)(a36), (uchar)(a35), (uchar)(a34), (uchar)(a33), (uchar)(a32), \
50 (uchar)(a31), (uchar)(a30), (uchar)(a29), (uchar)(a28), (uchar)(a27), (uchar)(a26), (uchar)(a25), (uchar)(a24), \
51 (uchar)(a23), (uchar)(a22), (uchar)(a21), (uchar)(a20), (uchar)(a19), (uchar)(a18), (uchar)(a17), (uchar)(a16), \
52 (uchar)(a15), (uchar)(a14), (uchar)(a13), (uchar)(a12), (uchar)(a11), (uchar)(a10), (uchar)( a9), (uchar)( a8), \
53 (uchar)( a7), (uchar)( a6), (uchar)( a5), (uchar)( a4), (uchar)( a3), (uchar)( a2), (uchar)( a1), (uchar)( a0))
54
55 #ifndef _mm512_cvtpd_pslo
56 #ifdef _mm512_zextsi256_si512
57 #define _mm512_cvtpd_pslo(a) _mm512_zextps256_ps512(_mm512_cvtpd_ps(a))
58 #else
59 //if preferred way to extend with zeros is unavailable
60 #define _mm512_cvtpd_pslo(a) _mm512_castps256_ps512(_mm512_cvtpd_ps(a))
61 #endif
62 #endif
64
65 namespace
66{
67
68 inline __m512i _v512_combine(const __m256i& lo, const __m256i& hi)
69{ return _mm512_inserti32x8(_mm512_castsi256_si512(lo), hi, 1); }
70
71 inline __m512 _v512_combine(const __m256& lo, const __m256& hi)
72{ return _mm512_insertf32x8(_mm512_castps256_ps512(lo), hi, 1); }
73
74 inline __m512d _v512_combine(const __m256d& lo, const __m256d& hi)
75{ return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1); }
76
77 inline int _v_cvtsi512_si32(const __m512i& a)
78{ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); }
79
80 inline __m256i _v512_extract_high(const __m512i& v)
81{ return _mm512_extracti32x8_epi32(v, 1); }
82
83 inline __m256 _v512_extract_high(const __m512& v)
84{ return _mm512_extractf32x8_ps(v, 1); }
85
86 inline __m256d _v512_extract_high(const __m512d& v)
87{ return _mm512_extractf64x4_pd(v, 1); }
88
89 inline __m256i _v512_extract_low(const __m512i& v)
90{ return _mm512_castsi512_si256(v); }
91
92 inline __m256 _v512_extract_low(const __m512& v)
93{ return _mm512_castps512_ps256(v); }
94
95 inline __m256d _v512_extract_low(const __m512d& v)
96{ return _mm512_castpd512_pd256(v); }
97
98 inline __m512i _v512_insert(const __m512i& a, const __m256i& b)
99{ return _mm512_inserti32x8(a, b, 0); }
100
101 inline __m512 _v512_insert(const __m512& a, const __m256& b)
102{ return _mm512_insertf32x8(a, b, 0); }
103
104 inline __m512d _v512_insert(const __m512d& a, const __m256d& b)
105{ return _mm512_insertf64x4(a, b, 0); }
106
107}
108
109 namespace cv
110{
111
113
114CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
115
117
118 struct v_uint8x64
119{
120 typedef uchar lane_type;
121 enum { nlanes = 64 };
122 __m512i val;
123
124 explicit v_uint8x64(__m512i v) : val(v) {}
125 v_uint8x64(uchar v0, uchar v1, uchar v2, uchar v3,
126 uchar v4, uchar v5, uchar v6, uchar v7,
127 uchar v8, uchar v9, uchar v10, uchar v11,
128 uchar v12, uchar v13, uchar v14, uchar v15,
129 uchar v16, uchar v17, uchar v18, uchar v19,
130 uchar v20, uchar v21, uchar v22, uchar v23,
131 uchar v24, uchar v25, uchar v26, uchar v27,
132 uchar v28, uchar v29, uchar v30, uchar v31,
133 uchar v32, uchar v33, uchar v34, uchar v35,
134 uchar v36, uchar v37, uchar v38, uchar v39,
135 uchar v40, uchar v41, uchar v42, uchar v43,
136 uchar v44, uchar v45, uchar v46, uchar v47,
137 uchar v48, uchar v49, uchar v50, uchar v51,
138 uchar v52, uchar v53, uchar v54, uchar v55,
139 uchar v56, uchar v57, uchar v58, uchar v59,
140 uchar v60, uchar v61, uchar v62, uchar v63)
141 {
142 val = _v512_set_epu8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
143 v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
144 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
145 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
146 }
147 v_uint8x64() {}
148
149 static inline v_uint8x64 zero() { return v_uint8x64(_mm512_setzero_si512()); }
150
151 uchar get0() const { return (uchar)_v_cvtsi512_si32(val); }
152};
153
154 struct v_int8x64
155{
156 typedef schar lane_type;
157 enum { nlanes = 64 };
158 __m512i val;
159
160 explicit v_int8x64(__m512i v) : val(v) {}
161 v_int8x64(schar v0, schar v1, schar v2, schar v3,
162 schar v4, schar v5, schar v6, schar v7,
163 schar v8, schar v9, schar v10, schar v11,
164 schar v12, schar v13, schar v14, schar v15,
165 schar v16, schar v17, schar v18, schar v19,
166 schar v20, schar v21, schar v22, schar v23,
167 schar v24, schar v25, schar v26, schar v27,
168 schar v28, schar v29, schar v30, schar v31,
169 schar v32, schar v33, schar v34, schar v35,
170 schar v36, schar v37, schar v38, schar v39,
171 schar v40, schar v41, schar v42, schar v43,
172 schar v44, schar v45, schar v46, schar v47,
173 schar v48, schar v49, schar v50, schar v51,
174 schar v52, schar v53, schar v54, schar v55,
175 schar v56, schar v57, schar v58, schar v59,
176 schar v60, schar v61, schar v62, schar v63)
177 {
178 val = _v512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
179 v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
180 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
181 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
182 }
183 v_int8x64() {}
184
185 static inline v_int8x64 zero() { return v_int8x64(_mm512_setzero_si512()); }
186
187 schar get0() const { return (schar)_v_cvtsi512_si32(val); }
188};
189
190 struct v_uint16x32
191{
192 typedef ushort lane_type;
193 enum { nlanes = 32 };
194 __m512i val;
195
196 explicit v_uint16x32(__m512i v) : val(v) {}
197 v_uint16x32(ushort v0, ushort v1, ushort v2, ushort v3,
198 ushort v4, ushort v5, ushort v6, ushort v7,
199 ushort v8, ushort v9, ushort v10, ushort v11,
200 ushort v12, ushort v13, ushort v14, ushort v15,
201 ushort v16, ushort v17, ushort v18, ushort v19,
202 ushort v20, ushort v21, ushort v22, ushort v23,
203 ushort v24, ushort v25, ushort v26, ushort v27,
204 ushort v28, ushort v29, ushort v30, ushort v31)
205 {
206 val = _v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
207 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
208 }
209 v_uint16x32() {}
210
211 static inline v_uint16x32 zero() { return v_uint16x32(_mm512_setzero_si512()); }
212
213 ushort get0() const { return (ushort)_v_cvtsi512_si32(val); }
214};
215
216 struct v_int16x32
217{
218 typedef short lane_type;
219 enum { nlanes = 32 };
220 __m512i val;
221
222 explicit v_int16x32(__m512i v) : val(v) {}
223 v_int16x32(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7,
224 short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15,
225 short v16, short v17, short v18, short v19, short v20, short v21, short v22, short v23,
226 short v24, short v25, short v26, short v27, short v28, short v29, short v30, short v31)
227 {
228 val = _v512_set_epu16((ushort)v31, (ushort)v30, (ushort)v29, (ushort)v28, (ushort)v27, (ushort)v26, (ushort)v25, (ushort)v24,
229 (ushort)v23, (ushort)v22, (ushort)v21, (ushort)v20, (ushort)v19, (ushort)v18, (ushort)v17, (ushort)v16,
230 (ushort)v15, (ushort)v14, (ushort)v13, (ushort)v12, (ushort)v11, (ushort)v10, (ushort)v9 , (ushort)v8,
231 (ushort)v7 , (ushort)v6 , (ushort)v5 , (ushort)v4 , (ushort)v3 , (ushort)v2 , (ushort)v1 , (ushort)v0);
232 }
233 v_int16x32() {}
234
235 static inline v_int16x32 zero() { return v_int16x32(_mm512_setzero_si512()); }
236
237 short get0() const { return (short)_v_cvtsi512_si32(val); }
238};
239
240 struct v_uint32x16
241{
242 typedef unsigned lane_type;
243 enum { nlanes = 16 };
244 __m512i val;
245
246 explicit v_uint32x16(__m512i v) : val(v) {}
247 v_uint32x16(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
248 unsigned v4, unsigned v5, unsigned v6, unsigned v7,
249 unsigned v8, unsigned v9, unsigned v10, unsigned v11,
250 unsigned v12, unsigned v13, unsigned v14, unsigned v15)
251 {
252 val = _mm512_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3, (int)v4, (int)v5, (int)v6, (int)v7,
253 (int)v8, (int)v9, (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15);
254 }
255 v_uint32x16() {}
256
257 static inline v_uint32x16 zero() { return v_uint32x16(_mm512_setzero_si512()); }
258
259 unsigned get0() const { return (unsigned)_v_cvtsi512_si32(val); }
260};
261
262 struct v_int32x16
263{
264 typedef int lane_type;
265 enum { nlanes = 16 };
266 __m512i val;
267
268 explicit v_int32x16(__m512i v) : val(v) {}
269 v_int32x16(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
270 int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15)
271 {
272 val = _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
273 }
274 v_int32x16() {}
275
276 static inline v_int32x16 zero() { return v_int32x16(_mm512_setzero_si512()); }
277
278 int get0() const { return _v_cvtsi512_si32(val); }
279};
280
281 struct v_float32x16
282{
283 typedef float lane_type;
284 enum { nlanes = 16 };
285 __m512 val;
286
287 explicit v_float32x16(__m512 v) : val(v) {}
288 v_float32x16(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7,
289 float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15)
290 {
291 val = _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
292 }
293 v_float32x16() {}
294
295 static inline v_float32x16 zero() { return v_float32x16(_mm512_setzero_ps()); }
296
297 float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); }
298};
299
300 struct v_uint64x8
301{
302 typedef uint64 lane_type;
303 enum { nlanes = 8 };
304 __m512i val;
305
306 explicit v_uint64x8(__m512i v) : val(v) {}
307 v_uint64x8(uint64 v0, uint64 v1, uint64 v2, uint64 v3, uint64 v4, uint64 v5, uint64 v6, uint64 v7)
308 { val = _mm512_setr_epi64((int64)v0, (int64)v1, (int64)v2, (int64)v3, (int64)v4, (int64)v5, (int64)v6, (int64)v7); }
309 v_uint64x8() {}
310
311 static inline v_uint64x8 zero() { return v_uint64x8(_mm512_setzero_si512()); }
312
313 uint64 get0() const
314 {
315 #if defined __x86_64__ || defined _M_X64
316 return (uint64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
317 #else
318 int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
319 int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
320 return (unsigned)a | ((uint64)(unsigned)b << 32);
321 #endif
322 }
323};
324
325 struct v_int64x8
326{
327 typedef int64 lane_type;
328 enum { nlanes = 8 };
329 __m512i val;
330
331 explicit v_int64x8(__m512i v) : val(v) {}
332 v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7)
333 { val = _mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7); }
334 v_int64x8() {}
335
336 static inline v_int64x8 zero() { return v_int64x8(_mm512_setzero_si512()); }
337
338 int64 get0() const
339 {
340 #if defined __x86_64__ || defined _M_X64
341 return (int64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
342 #else
343 int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
344 int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
345 return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
346 #endif
347 }
348};
349
350 struct v_float64x8
351{
352 typedef double lane_type;
353 enum { nlanes = 8 };
354 __m512d val;
355
356 explicit v_float64x8(__m512d v) : val(v) {}
357 v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7)
358 { val = _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); }
359 v_float64x8() {}
360
361 static inline v_float64x8 zero() { return v_float64x8(_mm512_setzero_pd()); }
362
363 double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); }
364};
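
Each struct above is a thin wrapper around one 512-bit register (__m512i, __m512 or __m512d) plus a lane_type/nlanes description. A minimal usage sketch, not part of the header (it assumes <opencv2/core/hal/intrin.hpp> is included and the build enables AVX-512 so that CV_SIMD512 is 1; the demo function name is illustrative):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD512
void lane_demo()
{
    // 16 float lanes packed into one 512-bit register
    v_float32x16 v(0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f,
                   8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f);
    float first = v.get0();            // reads lane 0 -> 0.f
    v_uint64x8 z = v_uint64x8::zero(); // all 8 lanes set to 0
    (void)first; (void)z;
}
#endif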
365
367
368 #define OPENCV_HAL_IMPL_AVX512_LOADSTORE(_Tpvec, _Tp) \
369 inline _Tpvec v512_load(const _Tp* ptr) \
370 { return _Tpvec(_mm512_loadu_si512((const __m512i*)ptr)); } \
371 inline _Tpvec v512_load_aligned(const _Tp* ptr) \
372 { return _Tpvec(_mm512_load_si512((const __m512i*)ptr)); } \
373 inline _Tpvec v512_load_low(const _Tp* ptr) \
374 { \
375 __m256i v256 = _mm256_loadu_si256((const __m256i*)ptr); \
376 return _Tpvec(_mm512_castsi256_si512(v256)); \
377 } \
378 inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
379 { \
380 __m256i vlo = _mm256_loadu_si256((const __m256i*)ptr0); \
381 __m256i vhi = _mm256_loadu_si256((const __m256i*)ptr1); \
382 return _Tpvec(_v512_combine(vlo, vhi)); \
383 } \
384 inline void v_store(_Tp* ptr, const _Tpvec& a) \
385 { _mm512_storeu_si512((__m512i*)ptr, a.val); } \
386 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
387 { _mm512_store_si512((__m512i*)ptr, a.val); } \
388 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
389 { _mm512_stream_si512((__m512i*)ptr, a.val); } \
390 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
391 { \
392 if( mode == hal::STORE_UNALIGNED ) \
393 _mm512_storeu_si512((__m512i*)ptr, a.val); \
394 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
395 _mm512_stream_si512((__m512i*)ptr, a.val); \
396 else \
397 _mm512_store_si512((__m512i*)ptr, a.val); \
398 } \
399 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
400 { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_low(a.val)); } \
401 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
402 { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_high(a.val)); }
403
404OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint8x64, uchar)
405OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int8x64, schar)
406OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint16x32, ushort)
407OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int16x32, short)
408OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint32x16, unsigned)
409OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int32x16, int)
410OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint64x8, uint64)
411OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int64x8, int64)
412
413 #define OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
414 inline _Tpvec v512_load(const _Tp* ptr) \
415 { return _Tpvec(_mm512_loadu_##suffix(ptr)); } \
416 inline _Tpvec v512_load_aligned(const _Tp* ptr) \
417 { return _Tpvec(_mm512_load_##suffix(ptr)); } \
418 inline _Tpvec v512_load_low(const _Tp* ptr) \
419 { \
420 return _Tpvec(_mm512_cast##suffix##256_##suffix##512 \
421 (_mm256_loadu_##suffix(ptr))); \
422 } \
423 inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
424 { \
425 halfreg vlo = _mm256_loadu_##suffix(ptr0); \
426 halfreg vhi = _mm256_loadu_##suffix(ptr1); \
427 return _Tpvec(_v512_combine(vlo, vhi)); \
428 } \
429 inline void v_store(_Tp* ptr, const _Tpvec& a) \
430 { _mm512_storeu_##suffix(ptr, a.val); } \
431 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
432 { _mm512_store_##suffix(ptr, a.val); } \
433 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
434 { _mm512_stream_##suffix(ptr, a.val); } \
435 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
436 { \
437 if( mode == hal::STORE_UNALIGNED ) \
438 _mm512_storeu_##suffix(ptr, a.val); \
439 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
440 _mm512_stream_##suffix(ptr, a.val); \
441 else \
442 _mm512_store_##suffix(ptr, a.val); \
443 } \
444 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
445 { _mm256_storeu_##suffix(ptr, _v512_extract_low(a.val)); } \
446 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
447 { _mm256_storeu_##suffix(ptr, _v512_extract_high(a.val)); }
448
449OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float32x16, float, ps, __m256)
450OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d)
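
The macros above generate the whole load/store family for every vector type. A hedged sketch of typical use (not part of the header; CV_DECL_ALIGNED is OpenCV's alignment macro, and the aligned variants require 64-byte aligned addresses):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD512
void loadstore_demo(const float* src, float* dst)
{
    v_float32x16 a = v512_load(src);         // unaligned load of 16 floats
    v_store(dst, a);                         // unaligned store

    float CV_DECL_ALIGNED(64) buf[16] = {0};
    v_store_aligned(buf, a);                 // buf must be 64-byte aligned
    v_float32x16 b = v512_load_aligned(buf);
    v_store_low(dst, b);                     // writes only the low 8 lanes
}
#endif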
451
452 #define OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, _Tpvecf, suffix, cast) \
453 inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
454 { return _Tpvec(cast(a.val)); }
455
456 #define OPENCV_HAL_IMPL_AVX512_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
457 inline _Tpvec v512_setzero_##suffix() \
458 { return _Tpvec(_mm512_setzero_si512()); } \
459 inline _Tpvec v512_setall_##suffix(_Tp v) \
460 { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \
461 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, OPENCV_HAL_NOP) \
462 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, OPENCV_HAL_NOP) \
463 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, OPENCV_HAL_NOP) \
464 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, OPENCV_HAL_NOP) \
465 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, OPENCV_HAL_NOP) \
466 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, OPENCV_HAL_NOP) \
467 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, OPENCV_HAL_NOP) \
468 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, OPENCV_HAL_NOP) \
469 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float32x16, suffix, _mm512_castps_si512) \
470 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float64x8, suffix, _mm512_castpd_si512)
471
472OPENCV_HAL_IMPL_AVX512_INIT(v_uint8x64, uchar, u8, epi8, char)
473OPENCV_HAL_IMPL_AVX512_INIT(v_int8x64, schar, s8, epi8, char)
474OPENCV_HAL_IMPL_AVX512_INIT(v_uint16x32, ushort, u16, epi16, short)
475OPENCV_HAL_IMPL_AVX512_INIT(v_int16x32, short, s16, epi16, short)
476OPENCV_HAL_IMPL_AVX512_INIT(v_uint32x16, unsigned, u32, epi32, int)
477OPENCV_HAL_IMPL_AVX512_INIT(v_int32x16, int, s32, epi32, int)
478OPENCV_HAL_IMPL_AVX512_INIT(v_uint64x8, uint64, u64, epi64, int64)
479OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8, int64, s64, epi64, int64)
480
481 #define OPENCV_HAL_IMPL_AVX512_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
482 inline _Tpvec v512_setzero_##suffix() \
483 { return _Tpvec(_mm512_setzero_##zsuffix()); } \
484 inline _Tpvec v512_setall_##suffix(_Tp v) \
485 { return _Tpvec(_mm512_set1_##zsuffix(v)); } \
486 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, cast) \
487 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, cast) \
488 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \
489 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, cast) \
490 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, cast) \
491 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, cast) \
492 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, cast) \
493 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, cast)
494
495OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float32x16, float, f32, ps, _mm512_castsi512_ps)
496OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float64x8, double, f64, pd, _mm512_castsi512_pd)
497
498 inline v_float32x16 v_reinterpret_as_f32(const v_float32x16& a)
499{ return a; }
500 inline v_float32x16 v_reinterpret_as_f32(const v_float64x8& a)
501{ return v_float32x16(_mm512_castpd_ps(a.val)); }
502
503 inline v_float64x8 v_reinterpret_as_f64(const v_float64x8& a)
504{ return a; }
505 inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a)
506{ return v_float64x8(_mm512_castps_pd(a.val)); }
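
A short sketch of the initializers and bit-level reinterpretation generated above (illustrative only, not part of the header):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD512
void init_demo()
{
    v_uint8x64   ones = v512_setall_u8((uchar)1); // every byte lane = 1
    v_float32x16 half = v512_setall_f32(0.5f);    // every float lane = 0.5
    // reinterpret leaves the 512 bits untouched; only the lane view changes
    v_int32x16 bits = v_reinterpret_as_s32(half); // 0x3F000000 in every lane
    (void)ones; (void)bits;
}
#endif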
507
508 // FP16
509 inline v_float32x16 v512_load_expand(const float16_t* ptr)
510{
511 return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr)));
512}
513
514 inline void v_pack_store(float16_t* ptr, const v_float32x16& a)
515{
516 __m256i ah = _mm512_cvtps_ph(a.val, 0);
517 _mm256_storeu_si256((__m256i*)ptr, ah);
518}
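
Sketch of the FP16 helpers above: 16 half-precision values are widened to a v_float32x16, processed in float32, and rounded back (assumes cv::float16_t buffers and hardware FP16 conversion support; illustrative only, not part of the header):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD512
void fp16_demo(const float16_t* src, float16_t* dst)
{
    v_float32x16 f = v512_load_expand(src); // 16 half floats -> 16 floats
    f = f * v512_setall_f32(2.0f);          // do the math in float32
    v_pack_store(dst, f);                   // round back to 16 half floats
}
#endif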
519
520 /* Recombine & ZIP */
521 inline void v_zip(const v_int8x64& a, const v_int8x64& b, v_int8x64& ab0, v_int8x64& ab1)
522{
523 #if CV_AVX_512VBMI
524 __m512i mask0 = _v512_set_epu8( 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24,
525 87, 23, 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16,
526 79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8,
527 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, 64, 0);
528 ab0 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask0, b.val));
529 __m512i mask1 = _v512_set_epu8(127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
530 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
531 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
532 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32);
533 ab1 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask1, b.val));
534 #else
535 __m512i low = _mm512_unpacklo_epi8(a.val, b.val);
536 __m512i high = _mm512_unpackhi_epi8(a.val, b.val);
537 ab0 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(11, 10, 3, 2, 9, 8, 1, 0), high));
538 ab1 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(15, 14, 7, 6, 13, 12, 5, 4), high));
539 #endif
540}
541 inline void v_zip(const v_int16x32& a, const v_int16x32& b, v_int16x32& ab0, v_int16x32& ab1)
542{
543 __m512i mask0 = _v512_set_epu16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8,
544 39, 7, 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0);
545 ab0 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask0, b.val));
546 __m512i mask1 = _v512_set_epu16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24,
547 55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16);
548 ab1 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask1, b.val));
549}
550 inline void v_zip(const v_int32x16& a, const v_int32x16& b, v_int32x16& ab0, v_int32x16& ab1)
551{
552 __m512i mask0 = _v512_set_epu32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
553 ab0 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask0, b.val));
554 __m512i mask1 = _v512_set_epu32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
555 ab1 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask1, b.val));
556}
557 inline void v_zip(const v_int64x8& a, const v_int64x8& b, v_int64x8& ab0, v_int64x8& ab1)
558{
559 __m512i mask0 = _v512_set_epu64(11, 3, 10, 2, 9, 1, 8, 0);
560 ab0 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask0, b.val));
561 __m512i mask1 = _v512_set_epu64(15, 7, 14, 6, 13, 5, 12, 4);
562 ab1 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask1, b.val));
563}
564
565 inline void v_zip(const v_uint8x64& a, const v_uint8x64& b, v_uint8x64& ab0, v_uint8x64& ab1)
566{
567 v_int8x64 i0, i1;
568 v_zip(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b), i0, i1);
569 ab0 = v_reinterpret_as_u8(i0);
570 ab1 = v_reinterpret_as_u8(i1);
571}
572 inline void v_zip(const v_uint16x32& a, const v_uint16x32& b, v_uint16x32& ab0, v_uint16x32& ab1)
573{
574 v_int16x32 i0, i1;
575 v_zip(v_reinterpret_as_s16(a), v_reinterpret_as_s16(b), i0, i1);
576 ab0 = v_reinterpret_as_u16(i0);
577 ab1 = v_reinterpret_as_u16(i1);
578}
579 inline void v_zip(const v_uint32x16& a, const v_uint32x16& b, v_uint32x16& ab0, v_uint32x16& ab1)
580{
581 v_int32x16 i0, i1;
582 v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
583 ab0 = v_reinterpret_as_u32(i0);
584 ab1 = v_reinterpret_as_u32(i1);
585}
586 inline void v_zip(const v_uint64x8& a, const v_uint64x8& b, v_uint64x8& ab0, v_uint64x8& ab1)
587{
588 v_int64x8 i0, i1;
589 v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
590 ab0 = v_reinterpret_as_u64(i0);
591 ab1 = v_reinterpret_as_u64(i1);
592}
593 inline void v_zip(const v_float32x16& a, const v_float32x16& b, v_float32x16& ab0, v_float32x16& ab1)
594{
595 v_int32x16 i0, i1;
596 v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
597 ab0 = v_reinterpret_as_f32(i0);
598 ab1 = v_reinterpret_as_f32(i1);
599}
600 inline void v_zip(const v_float64x8& a, const v_float64x8& b, v_float64x8& ab0, v_float64x8& ab1)
601{
602 v_int64x8 i0, i1;
603 v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
604 ab0 = v_reinterpret_as_f64(i0);
605 ab1 = v_reinterpret_as_f64(i1);
606}
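
v_zip interleaves two vectors lane by lane: ab0 gets {a0, b0, a1, b1, ...} from the lower halves and ab1 the same pattern from the upper halves. A small sketch (not part of the header):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD512
void zip_demo(const v_uint32x16& a, const v_uint32x16& b)
{
    v_uint32x16 ab0, ab1;
    v_zip(a, b, ab0, ab1); // ab0 = {a0,b0,...,a7,b7}, ab1 = {a8,b8,...,a15,b15}
    (void)ab0; (void)ab1;
}
#endif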
607
608 #define OPENCV_HAL_IMPL_AVX512_COMBINE(_Tpvec, suffix) \
609 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
610 { return _Tpvec(_v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val))); } \
611 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
612 { return _Tpvec(_v512_insert(b.val, _v512_extract_high(a.val))); } \
613 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
614 _Tpvec& c, _Tpvec& d) \
615 { \
616 c.val = _v512_combine(_v512_extract_low(a.val),_v512_extract_low(b.val)); \
617 d.val = _v512_insert(b.val,_v512_extract_high(a.val)); \
618 }
619
620
621OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint8x64, epi8)
622OPENCV_HAL_IMPL_AVX512_COMBINE(v_int8x64, epi8)
623OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint16x32, epi16)
624OPENCV_HAL_IMPL_AVX512_COMBINE(v_int16x32, epi16)
625OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint32x16, epi32)
626OPENCV_HAL_IMPL_AVX512_COMBINE(v_int32x16, epi32)
627OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint64x8, epi64)
628OPENCV_HAL_IMPL_AVX512_COMBINE(v_int64x8, epi64)
629OPENCV_HAL_IMPL_AVX512_COMBINE(v_float32x16, ps)
630OPENCV_HAL_IMPL_AVX512_COMBINE(v_float64x8, pd)
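
v_combine_low/v_combine_high build one vector from the 256-bit halves of two inputs, and v_recombine produces both results at once; a sketch (illustrative, not part of the header):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD512
void combine_demo(const v_int32x16& a, const v_int32x16& b)
{
    v_int32x16 lo = v_combine_low(a, b);  // {a0..a7, b0..b7}
    v_int32x16 hi = v_combine_high(a, b); // {a8..a15, b8..b15}
    v_int32x16 c, d;
    v_recombine(a, b, c, d);              // c == lo, d == hi
    (void)lo; (void)hi; (void)c; (void)d;
}
#endif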
631
632
633
634 /* Element-wise binary and unary operations */
635
636
637 #define OPENCV_HAL_IMPL_AVX512_BIN_FUNC(func, _Tpvec, intrin) \
638 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
639 { return _Tpvec(intrin(a.val, b.val)); }
640
641OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint8x64, _mm512_add_epi8)
642OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int8x64, _mm512_add_epi8)
643OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint16x32, _mm512_add_epi16)
644OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int16x32, _mm512_add_epi16)
645OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint8x64, _mm512_sub_epi8)
646OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int8x64, _mm512_sub_epi8)
647OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint16x32, _mm512_sub_epi16)
648OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int16x32, _mm512_sub_epi16)
649OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_uint16x32, _mm512_mullo_epi16)
650OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_int16x32, _mm512_mullo_epi16)
651
652 inline v_uint8x64 v_mul_wrap(const v_uint8x64& a, const v_uint8x64& b)
653{
654 __m512i ad = _mm512_srai_epi16(a.val, 8);
655 __m512i bd = _mm512_srai_epi16(b.val, 8);
656 __m512i p0 = _mm512_mullo_epi16(a.val, b.val); // even
657 __m512i p1 = _mm512_slli_epi16(_mm512_mullo_epi16(ad, bd), 8); // odd
658 return v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, p0, p1));
659}
660 inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b)
661{
662 return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
663}
664
665 #define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \
666 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
667 { return _Tpvec(intrin(a.val, b.val)); } \
668 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
669 { a.val = intrin(a.val, b.val); return a; }
670
671OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32)
672OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32)
673OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32)
674OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32)
675OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64)
676OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64)
677OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64)
678OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64)
679
680OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32)
681OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32)
682OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64)
683OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64)
684
685
686OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8)
687OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8)
688OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8)
689OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8)
690OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16)
691OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16)
692OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16)
693OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16)
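
For 8- and 16-bit lanes the + and - operators saturate, while v_add_wrap / v_sub_wrap / v_mul_wrap use modular arithmetic. A sketch of the difference (not part of the header):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD512
void sat_vs_wrap_demo()
{
    v_uint8x64 a = v512_setall_u8((uchar)250), b = v512_setall_u8((uchar)10);
    v_uint8x64 s = a + b;            // saturating: every lane becomes 255
    v_uint8x64 w = v_add_wrap(a, b); // wrapping:   every lane becomes 4 (260 mod 256)
    (void)s; (void)w;
}
#endif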
694
695OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps)
696OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps)
697OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps)
698OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps)
699OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd)
700OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd)
701OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd)
702OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd)
703
704 // saturating multiply
705 inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b)
706{
707 v_uint16x32 c, d;
708 v_mul_expand(a, b, c, d);
709 return v_pack(c, d);
710}
711 inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b)
712{
713 v_int16x32 c, d;
714 v_mul_expand(a, b, c, d);
715 return v_pack(c, d);
716}
717 inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
718{
719 __m512i pl = _mm512_mullo_epi16(a.val, b.val);
720 __m512i ph = _mm512_mulhi_epu16(a.val, b.val);
721 __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
722 __m512i p1 = _mm512_unpackhi_epi16(pl, ph);
723
724 const __m512i m = _mm512_set1_epi32(65535);
725 return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m)));
726}
727 inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
728{
729 __m512i pl = _mm512_mullo_epi16(a.val, b.val);
730 __m512i ph = _mm512_mulhi_epi16(a.val, b.val);
731 __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
732 __m512i p1 = _mm512_unpackhi_epi16(pl, ph);
733 return v_int16x32(_mm512_packs_epi32(p0, p1));
734}
735
736 inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b)
737{ a = a * b; return a; }
738 inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b)
739{ a = a * b; return a; }
740 inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b)
741{ a = a * b; return a; }
742 inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b)
743{ a = a * b; return a; }
744
745 inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); }
746 inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); }
747
748 // Multiply and expand
749 inline void v_mul_expand(const v_uint8x64& a, const v_uint8x64& b,
750 v_uint16x32& c, v_uint16x32& d)
751{
752 v_uint16x32 a0, a1, b0, b1;
753 v_expand(a, a0, a1);
754 v_expand(b, b0, b1);
755 c = v_mul_wrap(a0, b0);
756 d = v_mul_wrap(a1, b1);
757}
758
759 inline void v_mul_expand(const v_int8x64& a, const v_int8x64& b,
760 v_int16x32& c, v_int16x32& d)
761{
762 v_int16x32 a0, a1, b0, b1;
763 v_expand(a, a0, a1);
764 v_expand(b, b0, b1);
765 c = v_mul_wrap(a0, b0);
766 d = v_mul_wrap(a1, b1);
767}
768
769 inline void v_mul_expand(const v_int16x32& a, const v_int16x32& b,
770 v_int32x16& c, v_int32x16& d)
771{
772 v_int16x32 v0, v1;
773 v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);
774
775 c = v_reinterpret_as_s32(v0);
776 d = v_reinterpret_as_s32(v1);
777}
778
779 inline void v_mul_expand(const v_uint16x32& a, const v_uint16x32& b,
780 v_uint32x16& c, v_uint32x16& d)
781{
782 v_uint16x32 v0, v1;
783 v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);
784
785 c = v_reinterpret_as_u32(v0);
786 d = v_reinterpret_as_u32(v1);
787}
788
789 inline void v_mul_expand(const v_uint32x16& a, const v_uint32x16& b,
790 v_uint64x8& c, v_uint64x8& d)
791{
792 v_zip(v_uint64x8(_mm512_mul_epu32(a.val, b.val)),
793 v_uint64x8(_mm512_mul_epu32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
794}
795
796 inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b,
797 v_int64x8& c, v_int64x8& d)
798{
799 v_zip(v_int64x8(_mm512_mul_epi32(a.val, b.val)),
800 v_int64x8(_mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
801}
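
v_mul_expand returns the full-width products: c receives the products of the lower half of the lanes and d those of the upper half. Sketch (not part of the header):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD512
void mul_expand_demo()
{
    v_uint16x32 a = v512_setall_u16((ushort)1000), b = v512_setall_u16((ushort)70);
    v_uint32x16 c, d;
    v_mul_expand(a, b, c, d); // 1000*70 = 70000 overflows 16 bits,
                              // but every 32-bit lane of c and d holds 70000
    (void)c; (void)d;
}
#endif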
802
804 #define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
805 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
806 { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
807 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
808 { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
809 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
810 { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
811 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
812 { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \
813 template<int imm> \
814 inline _Tpuvec v_shl(const _Tpuvec& a) \
815 { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
816 template<int imm> \
817 inline _Tpsvec v_shl(const _Tpsvec& a) \
818 { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
819 template<int imm> \
820 inline _Tpuvec v_shr(const _Tpuvec& a) \
821 { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
822 template<int imm> \
823 inline _Tpsvec v_shr(const _Tpsvec& a) \
824 { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); }
825
826OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint16x32, v_int16x32, epi16)
827OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint32x16, v_int32x16, epi32)
828OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64)
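
Shifts exist both as operators taking a runtime count and as v_shl/v_shr templates taking a compile-time count; >> is logical for unsigned lanes and arithmetic for signed lanes. Sketch (not part of the header):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD512
void shift_demo()
{
    v_uint32x16 u = v512_setall_u32(0x80000000u);
    v_int32x16  s = v512_setall_s32(-8);
    v_uint32x16 u2 = u >> 1;      // logical:    0x40000000 in every lane
    v_int32x16  s2 = s >> 1;      // arithmetic: -4 in every lane
    v_int32x16  s3 = v_shl<2>(s); // compile-time count: -32 in every lane
    (void)u2; (void)s2; (void)s3;
}
#endif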
829
830
831
832 #define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \
833 OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \
834 OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \
835 OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \
836 inline _Tpvec operator ~ (const _Tpvec& a) \
837 { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); }
838
839OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1))
840OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int8x64, si512, _mm512_set1_epi32(-1))
841OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint16x32, si512, _mm512_set1_epi32(-1))
842OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int16x32, si512, _mm512_set1_epi32(-1))
843OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint32x16, si512, _mm512_set1_epi32(-1))
844OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int32x16, si512, _mm512_set1_epi32(-1))
845OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint64x8, si512, _mm512_set1_epi64(-1))
846OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int64x8, si512, _mm512_set1_epi64(-1))
847OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float32x16, ps, _mm512_castsi512_ps(_mm512_set1_epi32(-1)))
848OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float64x8, pd, _mm512_castsi512_pd(_mm512_set1_epi32(-1)))
849
851 #define OPENCV_HAL_IMPL_AVX512_SELECT(_Tpvec, suffix, zsuf) \
852 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
853 { return _Tpvec(_mm512_mask_blend_##suffix(_mm512_cmp_##suffix##_mask(mask.val, _mm512_setzero_##zsuf(), _MM_CMPINT_EQ), a.val, b.val)); }
854
855OPENCV_HAL_IMPL_AVX512_SELECT(v_uint8x64, epi8, si512)
856OPENCV_HAL_IMPL_AVX512_SELECT(v_int8x64, epi8, si512)
857OPENCV_HAL_IMPL_AVX512_SELECT(v_uint16x32, epi16, si512)
858OPENCV_HAL_IMPL_AVX512_SELECT(v_int16x32, epi16, si512)
859OPENCV_HAL_IMPL_AVX512_SELECT(v_uint32x16, epi32, si512)
860OPENCV_HAL_IMPL_AVX512_SELECT(v_int32x16, epi32, si512)
861OPENCV_HAL_IMPL_AVX512_SELECT(v_uint64x8, epi64, si512)
862OPENCV_HAL_IMPL_AVX512_SELECT(v_int64x8, epi64, si512)
863OPENCV_HAL_IMPL_AVX512_SELECT(v_float32x16, ps, ps)
864OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd)
865
866
867 #define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
868 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
869 { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); }
870
871 #define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \
872 OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \
873 OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \
874 OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \
875 OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
876 OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \
877 OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
878
879OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1)
880OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1)
881OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint16x32, epu16, epi16, (short)-1)
882OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int16x32, epi16, epi16, (short)-1)
883OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint32x16, epu32, epi32, (int)-1)
884OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int32x16, epi32, epi32, (int)-1)
885OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1)
886OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1)
887
888 #define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
889 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
890 { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); }
891
892 #define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \
893 OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \
894 OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
895 OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \
896 OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \
897 OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \
898 OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval)
899
900OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1)
901OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1)
902
903inline v_float32x16 v_not_nan(const v_float32x16& a)
904{ return v_float32x16(_mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.val, a.val, _CMP_ORD_Q), (int)-1))); }
905 inline v_float64x8 v_not_nan(const v_float64x8& a)
906{ return v_float64x8(_mm512_castsi512_pd(_mm512_maskz_set1_epi64(_mm512_cmp_pd_mask(a.val, a.val, _CMP_ORD_Q), (int64)-1))); }
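
Comparisons return ordinary vectors whose lanes are all-ones where the predicate holds and zero elsewhere, which is exactly the mask format v_select expects. Sketch (not part of the header):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD512
void select_demo(const v_float32x16& a, const v_float32x16& b)
{
    v_float32x16 mask = a > b;                // all-ones lanes where a[i] > b[i]
    v_float32x16 mx   = v_select(mask, a, b); // lane-wise max for non-NaN values
    v_float32x16 ok   = v_not_nan(a);         // all-ones lanes where a[i] is not NaN
    (void)mx; (void)ok;
}
#endif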
907
909OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint8x64, _mm512_min_epu8)
910OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint8x64, _mm512_max_epu8)
911OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int8x64, _mm512_min_epi8)
912OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int8x64, _mm512_max_epi8)
913OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint16x32, _mm512_min_epu16)
914OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint16x32, _mm512_max_epu16)
915OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int16x32, _mm512_min_epi16)
916OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int16x32, _mm512_max_epi16)
917OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint32x16, _mm512_min_epu32)
918OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint32x16, _mm512_max_epu32)
919OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int32x16, _mm512_min_epi32)
920OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int32x16, _mm512_max_epi32)
921OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint64x8, _mm512_min_epu64)
922OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint64x8, _mm512_max_epu64)
923OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int64x8, _mm512_min_epi64)
924OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int64x8, _mm512_max_epi64)
925OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float32x16, _mm512_min_ps)
926OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float32x16, _mm512_max_ps)
927OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float64x8, _mm512_min_pd)
928OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float64x8, _mm512_max_pd)
929
930
931 namespace {
932 template<bool prec, int imm4, bool part, int imm32>
933 struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
934 template<int imm4, int imm32>
935 struct _v_rotate_right<true, imm4, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
936 {
937 return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 ), imm4 *8),
938 _mm512_slli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 + 1), (4-imm4)*8)));
939 }};
940 template<int imm4>
941 struct _v_rotate_right<true, imm4, false, 15> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
942 {
943 return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, 15), imm4 *8),
944 _mm512_slli_epi32( b.val, (4-imm4)*8)));
945 }};
946 template<int imm4, int imm32>
947 struct _v_rotate_right<true, imm4, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
948 {
949 return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16), imm4 *8),
950 _mm512_slli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 15), (4-imm4)*8)));
951 }};
952 template<int imm4>
953 struct _v_rotate_right<true, imm4, true, 31> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
954 { return v_int8x64(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, 15), imm4*8)); }};
955 template<int imm32>
956 struct _v_rotate_right<false, 0, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
957 { return v_int8x64(_mm512_alignr_epi32(b.val, a.val, imm32)); }};
958 template<>
959 struct _v_rotate_right<false, 0, false, 0> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64&) { return a; }};
960 template<int imm32>
961 struct _v_rotate_right<false, 0, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
962 { return v_int8x64(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16)); }};
963 template<>
964 struct _v_rotate_right<false, 0, true, 16> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) { return b; }};
965 template<>
966 struct _v_rotate_right<false, 0, true, 32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
967}
968 template<int imm> inline v_int8x64 v_rotate_right(const v_int8x64& a, const v_int8x64& b)
969{
970 return imm >= 128 ? v_int8x64() :
971#if CV_AVX_512VBMI
972 v_int8x64(_mm512_permutex2var_epi8(a.val,
973 _v512_set_epu8(0x3f + imm, 0x3e + imm, 0x3d + imm, 0x3c + imm, 0x3b + imm, 0x3a + imm, 0x39 + imm, 0x38 + imm,
974 0x37 + imm, 0x36 + imm, 0x35 + imm, 0x34 + imm, 0x33 + imm, 0x32 + imm, 0x31 + imm, 0x30 + imm,
975 0x2f + imm, 0x2e + imm, 0x2d + imm, 0x2c + imm, 0x2b + imm, 0x2a + imm, 0x29 + imm, 0x28 + imm,
976 0x27 + imm, 0x26 + imm, 0x25 + imm, 0x24 + imm, 0x23 + imm, 0x22 + imm, 0x21 + imm, 0x20 + imm,
977 0x1f + imm, 0x1e + imm, 0x1d + imm, 0x1c + imm, 0x1b + imm, 0x1a + imm, 0x19 + imm, 0x18 + imm,
978 0x17 + imm, 0x16 + imm, 0x15 + imm, 0x14 + imm, 0x13 + imm, 0x12 + imm, 0x11 + imm, 0x10 + imm,
979 0x0f + imm, 0x0e + imm, 0x0d + imm, 0x0c + imm, 0x0b + imm, 0x0a + imm, 0x09 + imm, 0x08 + imm,
980 0x07 + imm, 0x06 + imm, 0x05 + imm, 0x04 + imm, 0x03 + imm, 0x02 + imm, 0x01 + imm, 0x00 + imm), b.val));
981 #else
982 _v_rotate_right<imm%4!=0, imm%4, (imm/4 > 15), imm/4>::eval(a, b);
983 #endif
984}
985 template<int imm>
986 inline v_int8x64 v_rotate_left(const v_int8x64& a, const v_int8x64& b)
987{
988 if (imm == 0) return a;
989 if (imm == 64) return b;
990 if (imm >= 128) return v_int8x64();
991 #if CV_AVX_512VBMI
992 return v_int8x64(_mm512_permutex2var_epi8(b.val,
993 _v512_set_epi8(0x7f - imm,0x7e - imm,0x7d - imm,0x7c - imm,0x7b - imm,0x7a - imm,0x79 - imm,0x78 - imm,
994 0x77 - imm,0x76 - imm,0x75 - imm,0x74 - imm,0x73 - imm,0x72 - imm,0x71 - imm,0x70 - imm,
995 0x6f - imm,0x6e - imm,0x6d - imm,0x6c - imm,0x6b - imm,0x6a - imm,0x69 - imm,0x68 - imm,
996 0x67 - imm,0x66 - imm,0x65 - imm,0x64 - imm,0x63 - imm,0x62 - imm,0x61 - imm,0x60 - imm,
997 0x5f - imm,0x5e - imm,0x5d - imm,0x5c - imm,0x5b - imm,0x5a - imm,0x59 - imm,0x58 - imm,
998 0x57 - imm,0x56 - imm,0x55 - imm,0x54 - imm,0x53 - imm,0x52 - imm,0x51 - imm,0x50 - imm,
999 0x4f - imm,0x4e - imm,0x4d - imm,0x4c - imm,0x4b - imm,0x4a - imm,0x49 - imm,0x48 - imm,
1000 0x47 - imm,0x46 - imm,0x45 - imm,0x44 - imm,0x43 - imm,0x42 - imm,0x41 - imm,0x40 - imm), a.val));
1001 #else
1002 return imm < 64 ? v_rotate_right<64 - imm>(b, a) : v_rotate_right<128 - imm>(v512_setzero_s8(), b);
1003 #endif
1004}
1005 template<int imm>
1006 inline v_int8x64 v_rotate_right(const v_int8x64& a)
1007{
1008 if (imm == 0) return a;
1009 if (imm >= 64) return v_int8x64();
1010 #if CV_AVX_512VBMI
1011 return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF >> imm,
1012 _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm,
1013 0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm,
1014 0x2f + imm,0x2e + imm,0x2d + imm,0x2c + imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm,
1015 0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm,
1016 0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm,
1017 0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm,
1018 0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm,
1019 0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), a.val));
1020 #else
1021 return v_rotate_right<imm>(a, v512_setzero_s8());
1022 #endif
1023}
1024 template<int imm>
1025 inline v_int8x64 v_rotate_left(const v_int8x64& a)
1026{
1027 if (imm == 0) return a;
1028 if (imm >= 64) return v_int8x64();
1029 #if CV_AVX_512VBMI
1030 return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF << imm,
1031 _v512_set_epi8(0x3f - imm,0x3e - imm,0x3d - imm,0x3c - imm,0x3b - imm,0x3a - imm,0x39 - imm,0x38 - imm,
1032 0x37 - imm,0x36 - imm,0x35 - imm,0x34 - imm,0x33 - imm,0x32 - imm,0x31 - imm,0x30 - imm,
1033 0x2f - imm,0x2e - imm,0x2d - imm,0x2c - imm,0x2b - imm,0x2a - imm,0x29 - imm,0x28 - imm,
1034 0x27 - imm,0x26 - imm,0x25 - imm,0x24 - imm,0x23 - imm,0x22 - imm,0x21 - imm,0x20 - imm,
1035 0x1f - imm,0x1e - imm,0x1d - imm,0x1c - imm,0x1b - imm,0x1a - imm,0x19 - imm,0x18 - imm,
1036 0x17 - imm,0x16 - imm,0x15 - imm,0x14 - imm,0x13 - imm,0x12 - imm,0x11 - imm,0x10 - imm,
1037 0x0f - imm,0x0e - imm,0x0d - imm,0x0c - imm,0x0b - imm,0x0a - imm,0x09 - imm,0x08 - imm,
1038 0x07 - imm,0x06 - imm,0x05 - imm,0x04 - imm,0x03 - imm,0x02 - imm,0x01 - imm,0x00 - imm), a.val));
1039 #else
1040 return v_rotate_right<64 - imm>(v512_setzero_s8(), a);
1041 #endif
1042}
1043
1044 #define OPENCV_HAL_IMPL_AVX512_ROTATE_PM(_Tpvec, suffix) \
1045 template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1046 { return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \
1047 template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1048 { return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \
1049 template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1050 { return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); } \
1051 template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1052 { return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); }
1053
1054 #define OPENCV_HAL_IMPL_AVX512_ROTATE_EC(_Tpvec, suffix) \
1055 template<int imm> \
1056 inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1057 { \
1058 enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \
1059 enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
1060 if (imm == 0) return a; \
1061 if (imm == _Tpvec::nlanes) return b; \
1062 if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
1063 return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << (imm))&MASK, a.val)); \
1064 } \
1065 template<int imm> \
1066 inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1067 { \
1068 enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \
1069 enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
1070 if (imm == 0) return a; \
1071 if (imm == _Tpvec::nlanes) return b; \
1072 if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
1073 return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << (imm))&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \
1074 } \
1075 template<int imm> \
1076 inline _Tpvec v_rotate_left(const _Tpvec& a) \
1077 { \
1078 if (imm == 0) return a; \
1079 if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
1080 return _Tpvec(_mm512_maskz_expand_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
1081 } \
1082 template<int imm> \
1083 inline _Tpvec v_rotate_right(const _Tpvec& a) \
1084 { \
1085 if (imm == 0) return a; \
1086 if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
1087 return _Tpvec(_mm512_maskz_compress_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
1088 }
1089
1090OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint8x64, u8)
1091OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint16x32, u16)
1092OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_int16x32, s16)
1093OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint32x16, epi32)
1094OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int32x16, epi32)
1095OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint64x8, epi64)
1096OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8, epi64)
1097OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps)
1098OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8, pd)
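
v_rotate_right<imm>(a) shifts lanes toward index 0 and fills the vacated top lanes with zeros; the two-argument form takes the fill from the second vector, as if a and b formed one double-length sequence (v_rotate_left works symmetrically). Sketch (not part of the header):

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

#if CV_SIMD512
void rotate_demo(const v_int32x16& a, const v_int32x16& b)
{
    v_int32x16 r1 = v_rotate_right<2>(a);    // {a2..a15, 0, 0}
    v_int32x16 r2 = v_rotate_right<2>(a, b); // {a2..a15, b0, b1}
    v_int32x16 l1 = v_rotate_left<2>(a);     // {0, 0, a0..a13}
    (void)r1; (void)r2; (void)l1;
}
#endif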
1099
1100
1101 inline v_uint8x64 v_reverse(const v_uint8x64 &a)
1102{
1103 #if CV_AVX_512VBMI
1104 static const __m512i perm = _mm512_set_epi32(
1105 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1106 0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f,
1107 0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f,
1108 0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
1109 return v_uint8x64(_mm512_permutexvar_epi8(perm, a.val));
1110 #else
1111 static const __m512i shuf = _mm512_set_epi32(
1112 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1113 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1114 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1115 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
1116 static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
1117 __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
1118 return v_uint8x64(_mm512_permutexvar_epi64(perm, vec));
1119 #endif
1120}
1121
1122 inline v_int8x64 v_reverse(const v_int8x64 &a)
1123{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1124
1125 inline v_uint16x32 v_reverse(const v_uint16x32 &a)
1126{
1127 #if CV_AVX_512VBMI
1128 static const __m512i perm = _mm512_set_epi32(
1129 0x00000001, 0x00020003, 0x00040005, 0x00060007,
1130 0x00080009, 0x000a000b, 0x000c000d, 0x000e000f,
1131 0x00100011, 0x00120013, 0x00140015, 0x00160017,
1132 0x00180019, 0x001a001b, 0x001c001d, 0x001e001f);
1133 return v_uint16x32(_mm512_permutexvar_epi16(perm, a.val));
1134 #else
1135 static const __m512i shuf = _mm512_set_epi32(
1136 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
1137 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
1138 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
1139 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
1140 static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
1141 __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
1142 return v_uint16x32(_mm512_permutexvar_epi64(perm, vec));
1143 #endif
1144}
1145
1146 inline v_int16x32 v_reverse(const v_int16x32 &a)
1147{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1148
1149 inline v_uint32x16 v_reverse(const v_uint32x16 &a)
1150{
1151 static const __m512i perm = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1152 return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
1153}
1154
1155 inline v_int32x16 v_reverse(const v_int32x16 &a)
1156{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1157
1158 inline v_float32x16 v_reverse(const v_float32x16 &a)
1159{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1160
1161 inline v_uint64x8 v_reverse(const v_uint64x8 &a)
1162{
1163 static const __m512i perm = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
1164 return v_uint64x8(_mm512_permutexvar_epi64(perm, a.val));
1165}
1166
1167 inline v_int64x8 v_reverse(const v_int64x8 &a)
1168{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1169
1170 inline v_float64x8 v_reverse(const v_float64x8 &a)
1171{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1172
1174
1176 #define OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64(a, b) a + b
1177 #define OPENCV_HAL_IMPL_AVX512_REDUCE_8(sctype, func, _Tpvec, ifunc, scop) \
1178 inline sctype v_reduce_##func(const _Tpvec& a) \
1179 { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1180 sctype CV_DECL_ALIGNED(64) idx[2]; \
1181 _mm_store_si128((__m128i*)idx, _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1))); \
1182 return scop(idx[0], idx[1]); }
1183OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, min, v_uint64x8, min_epu64, min)
1184OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, max, v_uint64x8, max_epu64, max)
1185OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, sum, v_uint64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
1186OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, min, v_int64x8, min_epi64, min)
1187OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, max, v_int64x8, max_epi64, max)
1188OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, sum, v_int64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
1189
1190 #define OPENCV_HAL_IMPL_AVX512_REDUCE_8F(func, ifunc, scop) \
1191 inline double v_reduce_##func(const v_float64x8& a) \
1192 { __m256d half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1193 double CV_DECL_ALIGNED(64) idx[2]; \
1194 _mm_store_pd(idx, _mm_##ifunc(_mm256_castpd256_pd128(half), _mm256_extractf128_pd(half, 1))); \
1195 return scop(idx[0], idx[1]); }
1196OPENCV_HAL_IMPL_AVX512_REDUCE_8F(min, min_pd, min)
1197OPENCV_HAL_IMPL_AVX512_REDUCE_8F(max, max_pd, max)
1198OPENCV_HAL_IMPL_AVX512_REDUCE_8F(sum, add_pd, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
1199
1200 #define OPENCV_HAL_IMPL_AVX512_REDUCE_16(sctype, func, _Tpvec, ifunc) \
1201 inline sctype v_reduce_##func(const _Tpvec& a) \
1202 { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1203 __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1204 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1205 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1206 return (sctype)_mm_cvtsi128_si32(quarter); }
1207OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, min, v_uint32x16, min_epu32)
1208OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, max, v_uint32x16, max_epu32)
1209OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, min, v_int32x16, min_epi32)
1210OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, max, v_int32x16, max_epi32)
1211
1212 #define OPENCV_HAL_IMPL_AVX512_REDUCE_16F(func, ifunc) \
1213 inline float v_reduce_##func(const v_float32x16& a) \
1214 { __m256 half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1215 __m128 quarter = _mm_##ifunc(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); \
1216 quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 3, 2))); \
1217 quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 0, 1))); \
1218 return _mm_cvtss_f32(quarter); }
1219OPENCV_HAL_IMPL_AVX512_REDUCE_16F(min, min_ps)
1220OPENCV_HAL_IMPL_AVX512_REDUCE_16F(max, max_ps)
1221
1222 inline float v_reduce_sum(const v_float32x16& a)
1223{
1224 __m256 half = _mm256_add_ps(_v512_extract_low(a.val), _v512_extract_high(a.val));
1225 __m128 quarter = _mm_add_ps(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1));
1226 quarter = _mm_hadd_ps(quarter, quarter);
1227 return _mm_cvtss_f32(_mm_hadd_ps(quarter, quarter));
1228}
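// The reductions above all follow the same halving pattern: fold the 512-bit vector to
// 256 bits, then to 128 bits, and finally collapse the remaining four floats with two
// horizontal adds, i.e. log2(16) = 4 addition stages in total.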
1229 inline int v_reduce_sum(const v_int32x16& a)
1230{
1231 __m256i half = _mm256_add_epi32(_v512_extract_low(a.val), _v512_extract_high(a.val));
1232 __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
1233 quarter = _mm_hadd_epi32(quarter, quarter);
1234 return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter));
1235}
1236 inline uint v_reduce_sum(const v_uint32x16& a)
1237{ return (uint)v_reduce_sum(v_reinterpret_as_s32(a)); }
1238
1239 #define OPENCV_HAL_IMPL_AVX512_REDUCE_32(sctype, func, _Tpvec, ifunc) \
1240 inline sctype v_reduce_##func(const _Tpvec& a) \
1241 { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1242 __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1243 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1244 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1245 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \
1246 return (sctype)_mm_cvtsi128_si32(quarter); }
1247OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, min, v_uint16x32, min_epu16)
1248OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, max, v_uint16x32, max_epu16)
1249OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16)
1250OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16)
1251
1252 inline int v_reduce_sum(const v_int16x32& a)
1253{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1254 inline uint v_reduce_sum(const v_uint16x32& a)
1255{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1256
1257 #define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \
1258 inline sctype v_reduce_##func(const _Tpvec& a) \
1259 { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1260 __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1261 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1262 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1263 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \
1264 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 1)); \
1265 return (sctype)_mm_cvtsi128_si32(quarter); }
1266OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, min, v_uint8x64, min_epu8)
1267OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, max, v_uint8x64, max_epu8)
1268OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, min, v_int8x64, min_epi8)
1269OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, max, v_int8x64, max_epi8)
1270
1271 #define OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(sctype, _Tpvec, suffix) \
1272 inline sctype v_reduce_sum(const _Tpvec& a) \
1273 { __m512i a16 = _mm512_add_epi16(_mm512_cvt##suffix##_epi16(_v512_extract_low(a.val)), \
1274 _mm512_cvt##suffix##_epi16(_v512_extract_high(a.val))); \
1275 a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(_v512_extract_low(a16), _v512_extract_high(a16))); \
1276 __m256i a8 = _mm256_add_epi32(_v512_extract_low(a16), _v512_extract_high(a16)); \
1277 __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1)); \
1278 a4 = _mm_hadd_epi32(a4, a4); \
1279 return (sctype)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4)); }
1280OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(uint, v_uint8x64, epu8)
1281OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(int, v_int8x64, epi8)
1282
1283 inline v_float32x16 v_reduce_sum4(const v_float32x16& a, const v_float32x16& b,
1284 const v_float32x16& c, const v_float32x16& d)
1285{
1286 __m256 abl = _mm256_hadd_ps(_v512_extract_low(a.val), _v512_extract_low(b.val));
1287 __m256 abh = _mm256_hadd_ps(_v512_extract_high(a.val), _v512_extract_high(b.val));
1288 __m256 cdl = _mm256_hadd_ps(_v512_extract_low(c.val), _v512_extract_low(d.val));
1289 __m256 cdh = _mm256_hadd_ps(_v512_extract_high(c.val), _v512_extract_high(d.val));
1290 return v_float32x16(_v512_combine(_mm256_hadd_ps(abl, cdl), _mm256_hadd_ps(abh, cdh)));
1291}
1292
1293 inline unsigned v_reduce_sad(const v_uint8x64& a, const v_uint8x64& b)
1294{
1295 __m512i val = _mm512_sad_epu8(a.val, b.val);
1296 __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
1297 __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
1298 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1299}
1300 inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b)
1301{
1302 __m512i val = _mm512_set1_epi8(-128);
1303 val = _mm512_sad_epu8(_mm512_add_epi8(a.val, val), _mm512_add_epi8(b.val, val));
1304 __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
1305 __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
1306 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1307}
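// _mm512_sad_epu8 yields, per 64-bit lane, the sum of absolute differences of its eight
// unsigned bytes; the remaining instructions merely fold those partial sums.  For the signed
// overload both inputs are biased by 128 first, since |a - b| is unchanged when the same
// constant is added to both operands, and the bias maps schar values into the unsigned range
// the instruction expects.  A minimal scalar sketch of that bias trick (the helper name is
// hypothetical and purely illustrative, not part of the HAL API):
static inline unsigned _v512_example_absdiff_biased(signed char a, signed char b)
{
    unsigned char ua = (unsigned char)(a + 128);   // e.g. a = -1  ->  127
    unsigned char ub = (unsigned char)(b + 128);   // e.g. b =  1  ->  129
    return ua > ub ? (unsigned)(ua - ub) : (unsigned)(ub - ua);   // == |a - b| (here 2)
}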
1308 inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b)
1309{ return v_reduce_sum(v_add_wrap(a - b, b - a)); }
1310 inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b)
1311{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); }
1312 inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b)
1313{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); }
1314 inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b)
1315{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); }
1316 inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b)
1317{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); }
1318 inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b)
1319{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); }
1320
1322 inline v_uint8x64 v_popcount(const v_int8x64& a)
1323{
1324 #if CV_AVX_512BITALG
1325 return v_uint8x64(_mm512_popcnt_epi8(a.val));
1326 #elif CV_AVX_512VBMI
1327 __m512i _popcnt_table0 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
1328 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
1329 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
1330 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
1331 __m512i _popcnt_table1 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
1332 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
1333 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
1334 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1);
1335 return v_uint8x64(_mm512_sub_epi8(_mm512_permutex2var_epi8(_popcnt_table0, a.val, _popcnt_table1), _mm512_movm_epi8(_mm512_movepi8_mask(a.val))));
1336 #else
1337 __m512i _popcnt_table = _mm512_set4_epi32(0x04030302, 0x03020201, 0x03020201, 0x02010100);
1338 __m512i _popcnt_mask = _mm512_set1_epi8(0x0F);
1339
1340 return v_uint8x64(_mm512_add_epi8(_mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512( a.val, _popcnt_mask)),
1341 _mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(_mm512_srli_epi16(a.val, 4), _popcnt_mask))));
1342 #endif
1343}
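// In the generic fallback above, _popcnt_table holds the bit counts of the nibble values
// 0..15 (bytes 0,1,1,2, 1,2,2,3, ...), and _mm512_shuffle_epi8 uses the low nibble of each
// byte as a table index, so two lookups (low and high nibble) added together give the
// per-byte population count.  A minimal scalar sketch of the same idea (hypothetical helper,
// illustration only, not part of the HAL API):
static inline int _v512_example_popcount_u8(unsigned char x)
{
    static const unsigned char popcnt_nibble[16] =
        { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };              // popcount of 0..15
    return popcnt_nibble[x & 0x0F] + popcnt_nibble[x >> 4];  // low nibble + high nibble
}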
1344 inline v_uint16x32 v_popcount(const v_int16x32& a)
1345{
1346 #if CV_AVX_512BITALG
1347 return v_uint16x32(_mm512_popcnt_epi16(a.val));
1348 #elif CV_AVX_512VPOPCNTDQ
1349 __m512i zero = _mm512_setzero_si512();
1350 return v_uint16x32(_mm512_packs_epi32(_mm512_popcnt_epi32(_mm512_unpacklo_epi16(a.val, zero)),
1351 _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero))));
1352 #else
1353 v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
1354 p += v_rotate_right<1>(p);
1355 return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff);
1356 #endif
1357}
1358 inline v_uint32x16 v_popcount(const v_int32x16& a)
1359{
1360 #if CV_AVX_512VPOPCNTDQ
1361 return v_uint32x16(_mm512_popcnt_epi32(a.val));
1362 #else
1363 v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
1364 p += v_rotate_right<1>(p);
1365 p += v_rotate_right<2>(p);
1366 return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff);
1367 #endif
1368}
1369 inline v_uint64x8 v_popcount(const v_int64x8& a)
1370{
1371 #if CV_AVX_512VPOPCNTDQ
1372 return v_uint64x8(_mm512_popcnt_epi64(a.val));
1373 #else
1374 return v_uint64x8(_mm512_sad_epu8(v_popcount(v_reinterpret_as_s8(a)).val, _mm512_setzero_si512()));
1375 #endif
1376}
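// In the fallback above, _mm512_sad_epu8 against an all-zero vector simply sums the eight
// per-byte popcounts within each 64-bit lane, which is exactly the 64-bit population count.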
1377
1378
1379 inline v_uint8x64 v_popcount(const v_uint8x64& a) { return v_popcount(v_reinterpret_as_s8 (a)); }
1380 inline v_uint16x32 v_popcount(const v_uint16x32& a) { return v_popcount(v_reinterpret_as_s16(a)); }
1381 inline v_uint32x16 v_popcount(const v_uint32x16& a) { return v_popcount(v_reinterpret_as_s32(a)); }
1382 inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinterpret_as_s64(a)); }
1383
1384
1386
1388 #define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \
1389 inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1390 { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \
1391 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1392 { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \
1393 inline _Tpvec v_sqrt(const _Tpvec& x) \
1394 { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \
1395 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1396 { return v_fma(a, a, b * b); } \
1397 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1398 { return v_sqrt(v_fma(a, a, b * b)); }
1399
1400OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps)
1401OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd)
1402
1403 inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
1404{ return a * b + c; }
1405 inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
1406{ return v_fma(a, b, c); }
1407
1408 inline v_float32x16 v_invsqrt(const v_float32x16& x)
1409{
1410 #if CV_AVX_512ER
1411 return v_float32x16(_mm512_rsqrt28_ps(x.val));
1412 #else
1413 v_float32x16 half = x * v512_setall_f32(0.5);
1414 v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val));
1415 t *= v512_setall_f32(1.5) - ((t * t) * half);
1416 return t;
1417 #endif
1418}
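// The non-ER path above is one Newton-Raphson step applied to the ~14-bit rsqrt14 estimate:
// for y ~ 1/sqrt(x), the iteration y' = y * (1.5 - 0.5 * x * y * y) roughly doubles the
// number of correct bits.  E.g. x = 4, y = 0.49:  y' = 0.49 * (1.5 - 0.5*4*0.2401) ~= 0.4997.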
1419
1420 inline v_float64x8 v_invsqrt(const v_float64x8& x)
1421{
1422 #if CV_AVX_512ER
1423 return v_float64x8(_mm512_rsqrt28_pd(x.val));
1424 #else
1425 return v512_setall_f64(1.) / v_sqrt(x);
1426 // v_float64x8 half = x * v512_setall_f64(0.5);
1427 // v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val));
1428 // t *= v512_setall_f64(1.5) - ((t * t) * half);
1429 // t *= v512_setall_f64(1.5) - ((t * t) * half);
1430 // return t;
1431 #endif
1432}
1433
1435 #define OPENCV_HAL_IMPL_AVX512_ABS(_Tpvec, _Tpuvec, suffix) \
1436 inline _Tpuvec v_abs(const _Tpvec& x) \
1437 { return _Tpuvec(_mm512_abs_##suffix(x.val)); }
1438
1439OPENCV_HAL_IMPL_AVX512_ABS(v_int8x64, v_uint8x64, epi8)
1440OPENCV_HAL_IMPL_AVX512_ABS(v_int16x32, v_uint16x32, epi16)
1441OPENCV_HAL_IMPL_AVX512_ABS(v_int32x16, v_uint32x16, epi32)
1442OPENCV_HAL_IMPL_AVX512_ABS(v_int64x8, v_uint64x8, epi64)
1443
1444 inline v_float32x16 v_abs(const v_float32x16& x)
1445{
1446 #ifdef _mm512_abs_pd
1447 return v_float32x16(_mm512_abs_ps(x.val));
1448 #else
1449 return v_float32x16(_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x.val),
1450 _v512_set_epu64(0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF,
1451 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF))));
1452 #endif
1453}
1454
1455 inline v_float64x8 v_abs(const v_float64x8& x)
1456{
1457 #ifdef _mm512_abs_pd
1458 #if defined __GNUC__ && (__GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ <= 3) || (__GNUC__ == 8 && __GNUC_MINOR__ <= 2))
1459 // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87476
1460 return v_float64x8(_mm512_abs_pd(_mm512_castpd_ps(x.val)));
1461 #else
1462 return v_float64x8(_mm512_abs_pd(x.val));
1463 #endif
1464 #else
1465 return v_float64x8(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(x.val),
1466 _v512_set_epu64(0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF,
1467 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF))));
1468 #endif
1469}
1470
1472 inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b)
1473{ return v_add_wrap(a - b, b - a); }
1474 inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b)
1475{ return v_add_wrap(a - b, b - a); }
1476 inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b)
1477{ return v_max(a, b) - v_min(a, b); }
1478
1479 inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b)
1480{
1481 v_int8x64 d = v_sub_wrap(a, b);
1482 v_int8x64 m = a < b;
1483 return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1484}
1485
1486 inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
1487{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1488
1489 inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b)
1490{
1491 v_int32x16 d = a - b;
1492 v_int32x16 m = a < b;
1493 return v_reinterpret_as_u32((d ^ m) - m);
1494}
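// The signed 8-bit and 32-bit v_absdiff overloads above use the (d ^ m) - m idiom: the
// comparison mask m is 0 when a >= b and all ones (-1) when a < b, and (d ^ -1) - (-1)
// equals ~d + 1 == -d, so the expression negates d exactly when the difference is negative.
// A minimal scalar sketch (hypothetical helper, illustration only):
static inline int _v512_example_absdiff_masked(int a, int b)
{
    int d = a - b;
    int m = -(int)(a < b);   // 0 or -1, mirroring the vector comparison mask
    return (d ^ m) - m;      // |a - b| (wrap-around corner cases aside)
}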
1495
1496 inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b)
1497{ return v_abs(a - b); }
1498
1499 inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b)
1500{ return v_abs(a - b); }
1501
1503 inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b)
1504{
1505 v_int8x64 d = a - b;
1506 v_int8x64 m = a < b;
1507 return (d ^ m) - m;
1508}
1509 inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b)
1510{ return v_max(a, b) - v_min(a, b); }
1511
1513
1515 inline v_int32x16 v_round(const v_float32x16& a)
1516{ return v_int32x16(_mm512_cvtps_epi32(a.val)); }
1517
1518 inline v_int32x16 v_round(const v_float64x8& a)
1519{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(a.val))); }
1520
1521 inline v_int32x16 v_round(const v_float64x8& a, const v_float64x8& b)
1522{ return v_int32x16(_v512_combine(_mm512_cvtpd_epi32(a.val), _mm512_cvtpd_epi32(b.val))); }
1523
1524 inline v_int32x16 v_trunc(const v_float32x16& a)
1525{ return v_int32x16(_mm512_cvttps_epi32(a.val)); }
1526
1527 inline v_int32x16 v_trunc(const v_float64x8& a)
1528{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvttpd_epi32(a.val))); }
1529
1530 #if CVT_ROUND_MODES_IMPLEMENTED
1531 inline v_int32x16 v_floor(const v_float32x16& a)
1532{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); }
1533
1534 inline v_int32x16 v_floor(const v_float64x8& a)
1535{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); }
1536
1537 inline v_int32x16 v_ceil(const v_float32x16& a)
1538{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }
1539
1540 inline v_int32x16 v_ceil(const v_float64x8& a)
1541{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); }
1542 #else
1543 inline v_int32x16 v_floor(const v_float32x16& a)
1544{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 1))); }
1545
1546 inline v_int32x16 v_floor(const v_float64x8& a)
1547{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 1)))); }
1548
1549 inline v_int32x16 v_ceil(const v_float32x16& a)
1550{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 2))); }
1551
1552 inline v_int32x16 v_ceil(const v_float64x8& a)
1553{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 2)))); }
1554 #endif
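// In the fallback branch above, _mm512_roundscale_ps/pd rounds to an integral value first and
// the result is then converted: the low bits of the immediate select the rounding direction
// (1 = toward -inf for v_floor, 2 = toward +inf for v_ceil) while the upper "scale" bits are 0.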
1555
1557 inline v_float32x16 v_cvt_f32(const v_int32x16& a)
1558{ return v_float32x16(_mm512_cvtepi32_ps(a.val)); }
1559
1560 inline v_float32x16 v_cvt_f32(const v_float64x8& a)
1561{ return v_float32x16(_mm512_cvtpd_pslo(a.val)); }
1562
1563 inline v_float32x16 v_cvt_f32(const v_float64x8& a, const v_float64x8& b)
1564{ return v_float32x16(_v512_combine(_mm512_cvtpd_ps(a.val), _mm512_cvtpd_ps(b.val))); }
1565
1566 inline v_float64x8 v_cvt_f64(const v_int32x16& a)
1567{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_low(a.val))); }
1568
1569 inline v_float64x8 v_cvt_f64_high(const v_int32x16& a)
1570{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_high(a.val))); }
1571
1572 inline v_float64x8 v_cvt_f64(const v_float32x16& a)
1573{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_low(a.val))); }
1574
1575 inline v_float64x8 v_cvt_f64_high(const v_float32x16& a)
1576{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); }
1577
1578 // from (Mysticial and wim) https://stackoverflow.com/q/41144668
1579 inline v_float64x8 v_cvt_f64(const v_int64x8& v)
1580{
1581 #if CV_AVX_512DQ
1582 return v_float64x8(_mm512_cvtepi64_pd(v.val));
1583 #else
1584 // constants encoded as floating-point
1585 __m512i magic_i_lo = _mm512_set1_epi64(0x4330000000000000); // 2^52
1586 __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63
1587 __m512i magic_i_all = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52
1588 __m512d magic_d_all = _mm512_castsi512_pd(magic_i_all);
1589
1590 // Blend the 32 least significant bits of v with magic_i_lo
1591 __m512i v_lo = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val);
1592 // Extract the 32 most significant bits of v
1593 __m512i v_hi = _mm512_srli_epi64(v.val, 32);
1594 // Flip the msb of v_hi and blend with 0x45300000
1595 v_hi = _mm512_xor_si512(v_hi, magic_i_hi32);
1596 // Compute in double precision
1597 __m512d v_hi_dbl = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all);
1598 // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
1599 __m512d result = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo));
1600 return v_float64x8(result);
1601 #endif
1602}
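// Why the magic constants work (a sketch, assuming IEEE-754 doubles): reinterpreting the bit
// pattern 0x43300000'lo as a double gives 2^52 + lo, and reinterpreting 0x45300000'(hi ^ 2^31)
// gives 2^84 + 2^63 + hi*2^32, where hi is the signed upper half of v and the XOR supplies the
// 2^63 bias that makes it non-negative.  Subtracting magic_d_all = 2^84 + 2^63 + 2^52 and then
// adding the low-part double leaves hi*2^32 + lo, i.e. the original signed 64-bit value (up to
// the usual double rounding), with no int64 conversion instruction required.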
1603
1605
1606 inline v_int8x64 v512_lut(const schar* tab, const int* idx)
1607{
1608 __m128i p0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1));
1609 __m128i p1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
1610 __m128i p2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 2), (const int *)tab, 1));
1611 __m128i p3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 3), (const int *)tab, 1));
1612 return v_int8x64(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(p0), p1, 1), p2, 2), p3, 3));
1613}
1614 inline v_int8x64 v512_lut_pairs(const schar* tab, const int* idx)
1615{
1616 __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1));
1617 __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
1618 return v_int8x64(_v512_combine(p0, p1));
1619}
1620 inline v_int8x64 v512_lut_quads(const schar* tab, const int* idx)
1621{
1622 return v_int8x64(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 1));
1623}
1624 inline v_uint8x64 v512_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut((const schar *)tab, idx)); }
1625 inline v_uint8x64 v512_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_pairs((const schar *)tab, idx)); }
1626 inline v_uint8x64 v512_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_quads((const schar *)tab, idx)); }
1627
1628 inline v_int16x32 v512_lut(const short* tab, const int* idx)
1629{
1630 __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 2));
1631 __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 2));
1632 return v_int16x32(_v512_combine(p0, p1));
1633}
1634 inline v_int16x32 v512_lut_pairs(const short* tab, const int* idx)
1635{
1636 return v_int16x32(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 2));
1637}
1638 inline v_int16x32 v512_lut_quads(const short* tab, const int* idx)
1639{
1640 #if defined(__GNUC__)
1641 return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 2));
1642 #else
1643 return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 2));
1644 #endif
1645}
1646 inline v_uint16x32 v512_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut((const short *)tab, idx)); }
1647 inline v_uint16x32 v512_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_pairs((const short *)tab, idx)); }
1648 inline v_uint16x32 v512_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_quads((const short *)tab, idx)); }
1649
1650 inline v_int32x16 v512_lut(const int* tab, const int* idx)
1651{
1652 return v_int32x16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
1653}
1654 inline v_int32x16 v512_lut_pairs(const int* tab, const int* idx)
1655{
1656 #if defined(__GNUC__)
1657 return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 4));
1658 #else
1659 return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 4));
1660 #endif
1661}
1662 inline v_int32x16 v512_lut_quads(const int* tab, const int* idx)
1663{
1664 return v_int32x16(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
1665 _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
1666 _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
1667 _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
1668 _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
1669}
1670 inline v_uint32x16 v512_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut((const int *)tab, idx)); }
1671 inline v_uint32x16 v512_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_pairs((const int *)tab, idx)); }
1672 inline v_uint32x16 v512_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_quads((const int *)tab, idx)); }
1673
1674 inline v_int64x8 v512_lut(const int64* tab, const int* idx)
1675{
1676 #if defined(__GNUC__)
1677 return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 8));
1678 #else
1679 return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), tab , 8));
1680 #endif
1681}
1682 inline v_int64x8 v512_lut_pairs(const int64* tab, const int* idx)
1683{
1684 return v_int64x8(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
1685 _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
1686 _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
1687 _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
1688 _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
1689}
1690 inline v_uint64x8 v512_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut((const int64 *)tab, idx)); }
1691 inline v_uint64x8 v512_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut_pairs((const int64 *)tab, idx)); }
1692
1693 inline v_float32x16 v512_lut(const float* tab, const int* idx)
1694{
1695 return v_float32x16(_mm512_i32gather_ps(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
1696}
1697 inline v_float32x16 v512_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_pairs((const int *)tab, idx)); }
1698 inline v_float32x16 v512_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_quads((const int *)tab, idx)); }
1699
1700 inline v_float64x8 v512_lut(const double* tab, const int* idx)
1701{
1702 return v_float64x8(_mm512_i32gather_pd(_mm256_loadu_si256((const __m256i*)idx), tab, 8));
1703}
1704 inline v_float64x8 v512_lut_pairs(const double* tab, const int* idx)
1705{
1706 return v_float64x8(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_castpd128_pd512(
1707 _mm_loadu_pd(tab + idx[0])),
1708 _mm_loadu_pd(tab + idx[1]), 1),
1709 _mm_loadu_pd(tab + idx[2]), 2),
1710 _mm_loadu_pd(tab + idx[3]), 3));
1711}
1712
1713 inline v_int32x16 v_lut(const int* tab, const v_int32x16& idxvec)
1714{
1715 return v_int32x16(_mm512_i32gather_epi32(idxvec.val, tab, 4));
1716}
1717
1718 inline v_uint32x16 v_lut(const unsigned* tab, const v_int32x16& idxvec)
1719{
1720 return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
1721}
1722
1723 inline v_float32x16 v_lut(const float* tab, const v_int32x16& idxvec)
1724{
1725 return v_float32x16(_mm512_i32gather_ps(idxvec.val, tab, 4));
1726}
1727
1728 inline v_float64x8 v_lut(const double* tab, const v_int32x16& idxvec)
1729{
1730 return v_float64x8(_mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8));
1731}
1732
1733 inline void v_lut_deinterleave(const float* tab, const v_int32x16& idxvec, v_float32x16& x, v_float32x16& y)
1734{
1735 x.val = _mm512_i32gather_ps(idxvec.val, tab, 4);
1736 y.val = _mm512_i32gather_ps(idxvec.val, &tab[1], 4);
1737}
1738
1739 inline void v_lut_deinterleave(const double* tab, const v_int32x16& idxvec, v_float64x8& x, v_float64x8& y)
1740{
1741 x.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8);
1742 y.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), &tab[1], 8);
1743}
1744
1745 inline v_int8x64 v_interleave_pairs(const v_int8x64& vec)
1746{
1747 return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0d0e0c, 0x0b090a08, 0x07050604, 0x03010200)));
1748}
1749 inline v_uint8x64 v_interleave_pairs(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1750 inline v_int8x64 v_interleave_quads(const v_int8x64& vec)
1751{
1752 return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0b0e0a, 0x0d090c08, 0x07030602, 0x05010400)));
1753}
1754 inline v_uint8x64 v_interleave_quads(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1755
1756 inline v_int16x32 v_interleave_pairs(const v_int16x32& vec)
1757{
1758 return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100)));
1759}
1760 inline v_uint16x32 v_interleave_pairs(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1761 inline v_int16x32 v_interleave_quads(const v_int16x32& vec)
1762{
1763 return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0706, 0x0d0c0504, 0x0b0a0302, 0x09080100)));
1764}
1765 inline v_uint16x32 v_interleave_quads(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1766
1767 inline v_int32x16 v_interleave_pairs(const v_int32x16& vec)
1768{
1769 return v_int32x16(_mm512_shuffle_epi32(vec.val, _MM_PERM_ACBD));
1770}
1771 inline v_uint32x16 v_interleave_pairs(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1772 inline v_float32x16 v_interleave_pairs(const v_float32x16& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1773
1774 inline v_int8x64 v_pack_triplets(const v_int8x64& vec)
1775{
1776 return v_int8x64(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
1777 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000),
1778 _mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0xffffff0f, 0x0e0d0c0a, 0x09080605, 0x04020100))));
1779}
1780 inline v_uint8x64 v_pack_triplets(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1781
1782 inline v_int16x32 v_pack_triplets(const v_int16x32& vec)
1783{
1784 return v_int16x32(_mm512_permutexvar_epi16(_v512_set_epu64(0x001f001f001f001f, 0x001f001f001f001f, 0x001e001d001c001a, 0x0019001800160015,
1785 0x0014001200110010, 0x000e000d000c000a, 0x0009000800060005, 0x0004000200010000), vec.val));
1786}
1787 inline v_uint16x32 v_pack_triplets(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1788
1789 inline v_int32x16 v_pack_triplets(const v_int32x16& vec)
1790{
1791 return v_int32x16(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
1792 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
1793}
1794 inline v_uint32x16 v_pack_triplets(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
1795 inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
1796{
1797 return v_float32x16(_mm512_permutexvar_ps(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
1798 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
1799}
1800
1802
1804
1805 // 16 >> 32
1806 inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
1807{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
1808 inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
1809{ return v_dotprod(a, b) + c; }
1810
1811 // 32 >> 64
1812 inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
1813{
1814 __m512i even = _mm512_mul_epi32(a.val, b.val);
1815 __m512i odd = _mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32));
1816 return v_int64x8(_mm512_add_epi64(even, odd));
1817}
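// _mm512_mul_epi32 multiplies only the even 32-bit lanes of each 64-bit pair; shifting both
// operands right by 32 moves the odd lanes into even position, so adding the two products
// yields a0*b0 + a1*b1 in every 64-bit lane.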
1818 inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
1819{ return v_dotprod(a, b) + c; }
1820
1821 // 8 >> 32
1822 inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
1823{
1824 __m512i even_a = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, _mm512_setzero_si512());
1825 __m512i odd_a = _mm512_srli_epi16(a.val, 8);
1826
1827 __m512i even_b = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, _mm512_setzero_si512());
1828 __m512i odd_b = _mm512_srli_epi16(b.val, 8);
1829
1830 __m512i prod0 = _mm512_madd_epi16(even_a, even_b);
1831 __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b);
1832 return v_uint32x16(_mm512_add_epi32(prod0, prod1));
1833}
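// The byte blend mask 0xAAAA... above zeroes the odd bytes, leaving each 16-bit lane holding an
// even byte zero-extended; the 8-bit shift produces the odd bytes the same way.  _mm512_madd_epi16
// then multiplies and sums adjacent 16-bit lanes, so prod0 + prod1 accumulates all four byte
// products of each 32-bit output lane.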
1834 inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
1835{ return v_dotprod_expand(a, b) + c; }
1836
1837 inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
1838{
1839 __m512i even_a = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8);
1840 __m512i odd_a = _mm512_srai_epi16(a.val, 8);
1841
1842 __m512i even_b = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8);
1843 __m512i odd_b = _mm512_srai_epi16(b.val, 8);
1844
1845 __m512i prod0 = _mm512_madd_epi16(even_a, even_b);
1846 __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b);
1847 return v_int32x16(_mm512_add_epi32(prod0, prod1));
1848}
1849 inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
1850{ return v_dotprod_expand(a, b) + c; }
1851
1852 // 16 >> 64
1853 inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
1854{
1855 __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
1856 __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
1857 __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
1858 __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);
1859
1860 __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
1861 __m512i p13 = _mm512_srli_epi64(mul0, 32);
1862 __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
1863 __m512i p57 = _mm512_srli_epi64(mul1, 32);
1864
1865 __m512i p15_ = _mm512_add_epi64(p02, p13);
1866 __m512i p9d_ = _mm512_add_epi64(p46, p57);
1867
1868 return v_uint64x8(_mm512_add_epi64(
1869 _mm512_unpacklo_epi64(p15_, p9d_),
1870 _mm512_unpackhi_epi64(p15_, p9d_)
1871 ));
1872}
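// mullo/mulhi recover the low and high 16 bits of every 16x16 -> 32-bit product, and the
// unpacklo/unpackhi pair reassembles them into full 32-bit products.  The even and odd 32-bit
// lanes are then widened to 64 bits (blend with zero, logical shift) and summed pairwise; the
// closing unpack+add makes each output lane cover four consecutive input elements, a reordering
// the _fast variant further below simply skips.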
1873 inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
1874{ return v_dotprod_expand(a, b) + c; }
1875
1876 inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
1877{
1878 __m512i prod = _mm512_madd_epi16(a.val, b.val);
1879 __m512i even = _mm512_srai_epi64(_mm512_bslli_epi128(prod, 4), 32);
1880 __m512i odd = _mm512_srai_epi64(prod, 32);
1881 return v_int64x8(_mm512_add_epi64(even, odd));
1882}
1883 inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
1884{ return v_dotprod_expand(a, b) + c; }
1885
1886 // 32 >> 64f
1887 inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
1888{ return v_cvt_f64(v_dotprod(a, b)); }
1889 inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
1890{ return v_dotprod_expand(a, b) + c; }
1891
1893
1894 // 16 >> 32
1895 inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b)
1896{ return v_dotprod(a, b); }
1897 inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
1898{ return v_dotprod(a, b, c); }
1899
1900 // 32 >> 64
1901 inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b)
1902{ return v_dotprod(a, b); }
1903 inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
1904{ return v_dotprod(a, b, c); }
1905
1906 // 8 >> 32
1907 inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b)
1908{ return v_dotprod_expand(a, b); }
1909 inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
1910{ return v_dotprod_expand(a, b, c); }
1911
1912 inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b)
1913{ return v_dotprod_expand(a, b); }
1914 inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
1915{ return v_dotprod_expand(a, b, c); }
1916
1917 // 16 >> 64
1918 inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b)
1919{
1920 __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
1921 __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
1922 __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
1923 __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);
1924
1925 __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
1926 __m512i p13 = _mm512_srli_epi64(mul0, 32);
1927 __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
1928 __m512i p57 = _mm512_srli_epi64(mul1, 32);
1929
1930 __m512i p15_ = _mm512_add_epi64(p02, p13);
1931 __m512i p9d_ = _mm512_add_epi64(p46, p57);
1932 return v_uint64x8(_mm512_add_epi64(p15_, p9d_));
1933}
1934 inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
1935{ return v_dotprod_expand_fast(a, b) + c; }
1936
1937 inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b)
1938{ return v_dotprod_expand(a, b); }
1939 inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
1940{ return v_dotprod_expand(a, b, c); }
1941
1942 // 32 >> 64f
1943 inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b)
1944{ return v_dotprod_expand(a, b); }
1945 inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
1946{ return v_dotprod_expand(a, b) + c; }
1947
1948
1949 #define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
1950 v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
1951
1952 inline v_float32x16 v_matmul(const v_float32x16& v,
1953 const v_float32x16& m0, const v_float32x16& m1,
1954 const v_float32x16& m2, const v_float32x16& m3)
1955{
1956 v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
1957 v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
1958 v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
1959 v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3);
1960 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
1961}
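// OPENCV_HAL_AVX512_SPLAT2_PS broadcasts component i of every 4-float group within its 128-bit
// lane, so v_matmul performs four independent 4x4 matrix-vector products at once:
// result = v0*m0 + v1*m1 + v2*m2 + v3*m3, fused into three FMAs and one multiply.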
1962
1963 inline v_float32x16 v_matmuladd(const v_float32x16& v,
1964 const v_float32x16& m0, const v_float32x16& m1,
1965 const v_float32x16& m2, const v_float32x16& a)
1966{
1967 v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
1968 v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
1969 v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
1970 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
1971}
1972
1973 #define OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
1974 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1975 const _Tpvec& a2, const _Tpvec& a3, \
1976 _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1977 { \
1978 __m512i t0 = cast_from(_mm512_unpacklo_##suffix(a0.val, a1.val)); \
1979 __m512i t1 = cast_from(_mm512_unpacklo_##suffix(a2.val, a3.val)); \
1980 __m512i t2 = cast_from(_mm512_unpackhi_##suffix(a0.val, a1.val)); \
1981 __m512i t3 = cast_from(_mm512_unpackhi_##suffix(a2.val, a3.val)); \
1982 b0.val = cast_to(_mm512_unpacklo_epi64(t0, t1)); \
1983 b1.val = cast_to(_mm512_unpackhi_epi64(t0, t1)); \
1984 b2.val = cast_to(_mm512_unpacklo_epi64(t2, t3)); \
1985 b3.val = cast_to(_mm512_unpackhi_epi64(t2, t3)); \
1986 }
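// Classic 4x4 transpose: the 32-bit unpacklo/unpackhi interleave rows pairwise, and the
// following 64-bit unpacks finish the transpose independently inside every 128-bit lane,
// i.e. four 4x4 blocks are transposed at once.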
1987
1988OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_uint32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1989OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_int32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1990OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_float32x16, ps, _mm512_castps_si512, _mm512_castsi512_ps)
1991
1992
1993
1994 /* Expand */
1995 #define OPENCV_HAL_IMPL_AVX512_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
1996 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1997 { \
1998 b0.val = intrin(_v512_extract_low(a.val)); \
1999 b1.val = intrin(_v512_extract_high(a.val)); \
2000 } \
2001 inline _Tpwvec v_expand_low(const _Tpvec& a) \
2002 { return _Tpwvec(intrin(_v512_extract_low(a.val))); } \
2003 inline _Tpwvec v_expand_high(const _Tpvec& a) \
2004 { return _Tpwvec(intrin(_v512_extract_high(a.val))); } \
2005 inline _Tpwvec v512_load_expand(const _Tp* ptr) \
2006 { \
2007 __m256i a = _mm256_loadu_si256((const __m256i*)ptr); \
2008 return _Tpwvec(intrin(a)); \
2009 }
2010
2011OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint8x64, v_uint16x32, uchar, _mm512_cvtepu8_epi16)
2012OPENCV_HAL_IMPL_AVX512_EXPAND(v_int8x64, v_int16x32, schar, _mm512_cvtepi8_epi16)
2013OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint16x32, v_uint32x16, ushort, _mm512_cvtepu16_epi32)
2014OPENCV_HAL_IMPL_AVX512_EXPAND(v_int16x32, v_int32x16, short, _mm512_cvtepi16_epi32)
2015OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint32x16, v_uint64x8, unsigned, _mm512_cvtepu32_epi64)
2016OPENCV_HAL_IMPL_AVX512_EXPAND(v_int32x16, v_int64x8, int, _mm512_cvtepi32_epi64)
2017
2018 #define OPENCV_HAL_IMPL_AVX512_EXPAND_Q(_Tpvec, _Tp, intrin) \
2019 inline _Tpvec v512_load_expand_q(const _Tp* ptr) \
2020 { \
2021 __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
2022 return _Tpvec(intrin(a)); \
2023 }
2024
2025OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_uint32x16, uchar, _mm512_cvtepu8_epi32)
2026OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16, schar, _mm512_cvtepi8_epi32)
2027
2028 /* pack */
2029 // 16
2030 inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b)
2031{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
2032
2033 inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b)
2034{
2035 const __m512i t = _mm512_set1_epi16(255);
2036 return v_uint8x64(_v512_combine(_mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, t)), _mm512_cvtepi16_epi8(_mm512_min_epu16(b.val, t))));
2037}
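// _mm512_packus_epi16 interprets its input as signed, so u16 values >= 0x8000 would saturate
// to zero; clamping with min_epu16(.., 255) first and then truncating with cvtepi16_epi8 gives
// the correct unsigned saturation instead.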
2038
2039 inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b)
2040{
2041 return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val)));
2042}
2043
2044 inline void v_pack_store(schar* ptr, const v_int16x32& a)
2045{ v_store_low(ptr, v_pack(a, a)); }
2046
2047 inline void v_pack_store(uchar* ptr, const v_uint16x32& a)
2048{
2049 const __m512i m = _mm512_set1_epi16(255);
2050 _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, m)));
2051}
2052
2053 inline void v_pack_u_store(uchar* ptr, const v_int16x32& a)
2054{ v_store_low(ptr, v_pack_u(a, a)); }
2055
2056 template<int n> inline
2057v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
2058{
2059 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
2060 v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
2061 return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
2062 v_reinterpret_as_s16((b + delta) >> n));
2063}
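// The delta of 2^(n-1) turns the truncating shift into a rounding one: (a + 2^(n-1)) >> n is
// round(a / 2^n) with halves rounded up.  A minimal scalar sketch (hypothetical helper,
// illustration only):
static inline unsigned short _v512_example_rshr_round(unsigned short a, int n)
{
    return (unsigned short)((a + (1u << (n - 1))) >> n);   // e.g. n = 2: 5 -> 1, 6 -> 2
}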
2064
2065 template<int n> inline
2066 void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
2067{
2068 v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
2069 v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
2070}
2071
2072 template<int n> inline
2073v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
2074{
2075 v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2076 return v_pack_u((a + delta) >> n, (b + delta) >> n);
2077}
2078
2079 template<int n> inline
2080 void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
2081{
2082 v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2083 v_pack_u_store(ptr, (a + delta) >> n);
2084}
2085
2086 template<int n> inline
2087v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
2088{
2089 v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2090 return v_pack((a + delta) >> n, (b + delta) >> n);
2091}
2092
2093 template<int n> inline
2094 void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
2095{
2096 v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2097 v_pack_store(ptr, (a + delta) >> n);
2098}
2099
2100 // 32
2101 inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b)
2102{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); }
2103
2104 inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
2105{
2106 const __m512i m = _mm512_set1_epi32(65535);
2107 return v_uint16x32(_v512_combine(_mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)), _mm512_cvtepi32_epi16(_mm512_min_epu32(b.val, m))));
2108}
2109
2110 inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b)
2111{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }
2112
2113 inline void v_pack_store(short* ptr, const v_int32x16& a)
2114{ v_store_low(ptr, v_pack(a, a)); }
2115
2116 inline void v_pack_store(ushort* ptr, const v_uint32x16& a)
2117{
2118 const __m512i m = _mm512_set1_epi32(65535);
2119 _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)));
2120}
2121
2122 inline void v_pack_u_store(ushort* ptr, const v_int32x16& a)
2123{ v_store_low(ptr, v_pack_u(a, a)); }
2124
2125
2126 template<int n> inline
2127v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
2128{
2129 v_uint32x16 delta = v512_setall_u32(1 << (n-1));
2130 return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
2131 v_reinterpret_as_s32((b + delta) >> n));
2132}
2133
2134 template<int n> inline
2135 void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
2136{
2137 v_uint32x16 delta = v512_setall_u32(1 << (n-1));
2138 v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
2139}
2140
2141 template<int n> inline
2142v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
2143{
2144 v_int32x16 delta = v512_setall_s32(1 << (n-1));
2145 return v_pack_u((a + delta) >> n, (b + delta) >> n);
2146}
2147
2148 template<int n> inline
2149 void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
2150{
2151 v_int32x16 delta = v512_setall_s32(1 << (n-1));
2152 v_pack_u_store(ptr, (a + delta) >> n);
2153}
2154
2155 template<int n> inline
2156v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
2157{
2158 v_int32x16 delta = v512_setall_s32(1 << (n-1));
2159 return v_pack((a + delta) >> n, (b + delta) >> n);
2160}
2161
2162 template<int n> inline
2163 void v_rshr_pack_store(short* ptr, const v_int32x16& a)
2164{
2165 v_int32x16 delta = v512_setall_s32(1 << (n-1));
2166 v_pack_store(ptr, (a + delta) >> n);
2167}
2168
2169 // 64
2170 // Non-saturating pack
2171 inline v_uint32x16 v_pack(const v_uint64x8& a, const v_uint64x8& b)
2172{ return v_uint32x16(_v512_combine(_mm512_cvtepi64_epi32(a.val), _mm512_cvtepi64_epi32(b.val))); }
2173
2174 inline v_int32x16 v_pack(const v_int64x8& a, const v_int64x8& b)
2175{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
2176
2177 inline void v_pack_store(unsigned* ptr, const v_uint64x8& a)
2178{ _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi64_epi32(a.val)); }
2179
2180 inline void v_pack_store(int* ptr, const v_int64x8& b)
2181{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
2182
2183 template<int n> inline
2184v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
2185{
2186 v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
2187 return v_pack((a + delta) >> n, (b + delta) >> n);
2188}
2189
2190 template<int n> inline
2191 void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
2192{
2193 v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
2194 v_pack_store(ptr, (a + delta) >> n);
2195}
2196
2197 template<int n> inline
2198v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
2199{
2200 v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
2201 return v_pack((a + delta) >> n, (b + delta) >> n);
2202}
2203
2204 template<int n> inline
2205 void v_rshr_pack_store(int* ptr, const v_int64x8& a)
2206{
2207 v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
2208 v_pack_store(ptr, (a + delta) >> n);
2209}
2210
2211 // pack boolean
2212 inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b)
2213{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
2214
2215 inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
2216 const v_uint32x16& c, const v_uint32x16& d)
2217{
2218 __m512i ab = _mm512_packs_epi32(a.val, b.val);
2219 __m512i cd = _mm512_packs_epi32(c.val, d.val);
2220
2221 return v_uint8x64(_mm512_permutexvar_epi32(_v512_set_epu32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), _mm512_packs_epi16(ab, cd)));
2222}
2223
2224 inline v_uint8x64 v_pack_b(const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
2225 const v_uint64x8& d, const v_uint64x8& e, const v_uint64x8& f,
2226 const v_uint64x8& g, const v_uint64x8& h)
2227{
2228 __m512i ab = _mm512_packs_epi32(a.val, b.val);
2229 __m512i cd = _mm512_packs_epi32(c.val, d.val);
2230 __m512i ef = _mm512_packs_epi32(e.val, f.val);
2231 __m512i gh = _mm512_packs_epi32(g.val, h.val);
2232
2233 __m512i abcd = _mm512_packs_epi32(ab, cd);
2234 __m512i efgh = _mm512_packs_epi32(ef, gh);
2235
2236 return v_uint8x64(_mm512_permutexvar_epi16(_v512_set_epu16(31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4,
2237 27, 19, 11, 3, 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0), _mm512_packs_epi16(abcd, efgh)));
2238}
2239
2240 /* Recombine */
2241 // these are provided above, together with the load and store operations
2242
2243 /* Extract */
2244 #define OPENCV_HAL_IMPL_AVX512_EXTRACT(_Tpvec) \
2245 template<int s> \
2246 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2247 { return v_rotate_right<s>(a, b); }
2248
2249OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint8x64)
2250OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int8x64)
2251OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint16x32)
2252OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int16x32)
2253OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint32x16)
2254OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int32x16)
2255OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint64x8)
2256OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8)
2257OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16)
2258OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8)
2259
2260 #define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \
2261 template<int i> inline _Tp v_extract_n(_Tpvec v) { return v_rotate_right<i>(v).get0(); }
2262
2263OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar)
2264OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar)
2265OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort)
2266OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short)
2267OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint)
2268OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int)
2269OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64)
2270OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64)
2271OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float)
2272OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double)
2273
2274 template<int i>
2275 inline v_uint32x16 v_broadcast_element(v_uint32x16 a)
2276{
2277 static const __m512i perm = _mm512_set1_epi32((char)i);
2278 return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
2279}
2280
2281 template<int i>
2282 inline v_int32x16 v_broadcast_element(const v_int32x16 &a)
2283{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2284
2285 template<int i>
2286 inline v_float32x16 v_broadcast_element(const v_float32x16 &a)
2287{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2288
2289
2291
2292 inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b )
2293{
2294 __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2295 __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2296 #if CV_AVX_512VBMI
2297 __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
2298 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
2299 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2300 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2301 __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
2302 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
2303 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2304 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2305 a = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask0, ab1));
2306 b = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask1, ab1));
2307 #else
2308 __m512i mask0 = _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200);
2309 __m512i a0b0 = _mm512_shuffle_epi8(ab0, mask0);
2310 __m512i a1b1 = _mm512_shuffle_epi8(ab1, mask0);
2311 __m512i mask1 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
2312 __m512i mask2 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
2313 a = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask1, a1b1));
2314 b = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask2, a1b1));
2315 #endif
2316}
2317
2318 inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b )
2319{
2320 __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2321 __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2322 __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2323 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2324 __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2325 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2326 a = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask0, ab1));
2327 b = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask1, ab1));
2328}
2329
2330 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b )
2331{
2332 __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2333 __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2334 __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2335 __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2336 a = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask0, ab1));
2337 b = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask1, ab1));
2338}
2339
2340 inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b )
2341{
2342 __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2343 __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
2344 __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
2345 __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
2346 a = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask0, ab1));
2347 b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1));
2348}
2349
2350 inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c )
2351{
2352 __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2353 __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2354 __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));
2355
2356 #if CV_AVX_512VBMI2
2357 __m512i mask0 = _v512_set_epu8(126, 123, 120, 117, 114, 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81,
2358 78, 75, 72, 69, 66, 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33,
2359 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 62, 59, 56, 53, 50,
2360 47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2);
2361 __m512i r0b01 = _mm512_permutex2var_epi8(bgr0, mask0, bgr1);
2362 __m512i b1g12 = _mm512_permutex2var_epi8(bgr1, mask0, bgr2);
2363 __m512i r12b2 = _mm512_permutex2var_epi8(bgr1,
2364 _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
2365 77, 74, 71, 68, 65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97,
2366 94, 91, 88, 85, 82, 79, 76, 73, 70, 67, 64, 61, 58, 55, 52, 49,
2367 46, 43, 40, 37, 34, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1), bgr2);
2368 a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01));
2369 b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0));
2370 c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2));
2371 #elif CV_AVX_512VBMI
2372 __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0);
2373 __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1);
2374 __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2);
2375 a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
2376 77, 74, 71, 68, 65, 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48,
2377 46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24,
2378 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0), bgr2));
2379 b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40,
2380 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, 23, 21, 20, 18, 17,
2381 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 126, 123, 120, 117, 114,
2382 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, 78, 75, 72, 69, 66), bgr0));
2383 c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18,
2384 15, 12, 9, 6, 3, 0, 125, 122, 119, 116, 113, 110, 107, 104, 101, 98,
2385 95, 92, 89, 86, 83, 80, 77, 74, 71, 68, 65, 62, 59, 56, 53, 50,
2386 47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2), bgr1));
2387 #else
2388 __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
2389 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
2390 __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
2391 __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
2392 __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);
2393
2394 __m512i b0g0 = _mm512_mask_blend_epi32(0xf800, b01g1, r12b2);
2395 __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
2396 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0);
2397 __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11);
2398 a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1));
2399 c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1));
2400 b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)));
2401 #endif
2402}
2403
2404 inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c )
2405{
2406 __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2407 __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2408 __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2409
2410 __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
2411 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
2412 __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
2413 __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
2414 __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);
2415
2416 a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2));
2417 b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
2418 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0));
2419 c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11));
2420}
2421
2422 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c )
2423{
2424 __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2425 __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2426 __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2427
2428 __m512i mask0 = _v512_set_epu32(29, 26, 23, 20, 17, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
2429 __m512i b01r1 = _mm512_permutex2var_epi32(bgr0, mask0, bgr1);
2430 __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2);
2431 __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0);
2432
2433 a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2));
2434 b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11));
2435 c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0));
2436}
2437
2438 inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c )
2439{
2440 __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2441 __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
2442 __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2443
2444 __m512i mask0 = _v512_set_epu64(13, 10, 15, 12, 9, 6, 3, 0);
2445 __m512i b01g1 = _mm512_permutex2var_epi64(bgr0, mask0, bgr1);
2446 __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2);
2447 __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0);
2448
2449 a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2));
2450 c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6));
2451 b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0));
2452}
2453
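The three-channel loaders split packed BGR-style data into per-channel vectors; the VBMI/VBMI2 paths above only change how the shuffle masks are built, not the result. A minimal sketch under the same assumptions as the first one (illustrative names):

// Deinterleave 64 packed BGR pixels (192 bytes) into three planes.
inline void split_bgr_sketch(const uchar* bgr, uchar* b_plane, uchar* g_plane, uchar* r_plane)
{
    cv::v_uint8x64 b, g, r;
    cv::v_load_deinterleave(bgr, b, g, r);
    cv::v_store(b_plane, b);
    cv::v_store(g_plane, g);
    cv::v_store(r_plane, r);
}
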
2454 inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d )
2455{
2456 __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2457 __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2458 __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));
2459 __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 192));
2460
2461 #if CV_AVX_512VBMI
2462 __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
2463 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
2464 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2465 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2466 __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
2467 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
2468 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2469 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2470
2471 __m512i br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1);
2472 __m512i ga01 = _mm512_permutex2var_epi8(bgra0, mask1, bgra1);
2473 __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3);
2474 __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3);
2475
2476 a = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23));
2477 c = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23));
2478 b = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23));
2479 d = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23));
2480 #else
2481 __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
2482 __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask);
2483 __m512i b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask);
2484 __m512i b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask);
2485 __m512i b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask);
2486
2487 __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2488 __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2489
2490 __m512i br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1);
2491 __m512i ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1);
2492 __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3);
2493 __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3);
2494
2495 a = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23));
2496 c = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23));
2497 b = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23));
2498 d = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23));
2499 #endif
2500}
2501
2502 inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d )
2503{
2504 __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2505 __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2506 __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2507 __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 96));
2508
2509 __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2510 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2511 __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2512 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2513
2514 __m512i br01 = _mm512_permutex2var_epi16(bgra0, mask0, bgra1);
2515 __m512i ga01 = _mm512_permutex2var_epi16(bgra0, mask1, bgra1);
2516 __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3);
2517 __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3);
2518
2519 a = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23));
2520 c = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23));
2521 b = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23));
2522 d = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23));
2523}
2524
2525 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d )
2526{
2527 __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2528 __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2529 __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2530 __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 48));
2531
2532 __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2533 __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2534
2535 __m512i br01 = _mm512_permutex2var_epi32(bgra0, mask0, bgra1);
2536 __m512i ga01 = _mm512_permutex2var_epi32(bgra0, mask1, bgra1);
2537 __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3);
2538 __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3);
2539
2540 a = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23));
2541 c = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23));
2542 b = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23));
2543 d = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23));
2544}
2545
2546 inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d )
2547{
2548 __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2549 __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
2550 __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2551 __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 24));
2552
2553 __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
2554 __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
2555
2556 __m512i br01 = _mm512_permutex2var_epi64(bgra0, mask0, bgra1);
2557 __m512i ga01 = _mm512_permutex2var_epi64(bgra0, mask1, bgra1);
2558 __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3);
2559 __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3);
2560
2561 a = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23));
2562 c = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23));
2563 b = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23));
2564 d = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23));
2565}
2566
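The four-channel loaders apply the same even/odd split twice: first into B/R and G/A pairs, then into the four channels. A sketch that combines them with the four-channel store defined further below to swap B and R (illustrative, same assumptions):

// Convert 64 BGRA pixels (256 bytes) to RGBA by swapping the B and R planes.
inline void bgra_to_rgba_sketch(const uchar* src, uchar* dst)
{
    cv::v_uint8x64 b, g, r, a;
    cv::v_load_deinterleave(src, b, g, r, a);
    cv::v_store_interleave(dst, r, g, b, a);
}
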
2568
2569 inline void v_store_interleave( uchar* ptr, const v_uint8x64& x, const v_uint8x64& y,
2570 hal::StoreMode mode=hal::STORE_UNALIGNED )
2571{
2572 v_uint8x64 low, high;
2573 v_zip(x, y, low, high);
2574 if( mode == hal::STORE_ALIGNED_NOCACHE )
2575 {
2576 _mm512_stream_si512((__m512i*)ptr, low.val);
2577 _mm512_stream_si512((__m512i*)(ptr + 64), high.val);
2578 }
2579 else if( mode == hal::STORE_ALIGNED )
2580 {
2581 _mm512_store_si512((__m512i*)ptr, low.val);
2582 _mm512_store_si512((__m512i*)(ptr + 64), high.val);
2583 }
2584 else
2585 {
2586 _mm512_storeu_si512((__m512i*)ptr, low.val);
2587 _mm512_storeu_si512((__m512i*)(ptr + 64), high.val);
2588 }
2589}
2590
2591 inline void v_store_interleave( ushort* ptr, const v_uint16x32& x, const v_uint16x32& y,
2592 hal::StoreMode mode=hal::STORE_UNALIGNED )
2593{
2594 v_uint16x32 low, high;
2595 v_zip(x, y, low, high);
2596 if( mode == hal::STORE_ALIGNED_NOCACHE )
2597 {
2598 _mm512_stream_si512((__m512i*)ptr, low.val);
2599 _mm512_stream_si512((__m512i*)(ptr + 32), high.val);
2600 }
2601 else if( mode == hal::STORE_ALIGNED )
2602 {
2603 _mm512_store_si512((__m512i*)ptr, low.val);
2604 _mm512_store_si512((__m512i*)(ptr + 32), high.val);
2605 }
2606 else
2607 {
2608 _mm512_storeu_si512((__m512i*)ptr, low.val);
2609 _mm512_storeu_si512((__m512i*)(ptr + 32), high.val);
2610 }
2611}
2612
2613 inline void v_store_interleave( unsigned* ptr, const v_uint32x16& x, const v_uint32x16& y,
2614 hal::StoreMode mode=hal::STORE_UNALIGNED )
2615{
2616 v_uint32x16 low, high;
2617 v_zip(x, y, low, high);
2618 if( mode == hal::STORE_ALIGNED_NOCACHE )
2619 {
2620 _mm512_stream_si512((__m512i*)ptr, low.val);
2621 _mm512_stream_si512((__m512i*)(ptr + 16), high.val);
2622 }
2623 else if( mode == hal::STORE_ALIGNED )
2624 {
2625 _mm512_store_si512((__m512i*)ptr, low.val);
2626 _mm512_store_si512((__m512i*)(ptr + 16), high.val);
2627 }
2628 else
2629 {
2630 _mm512_storeu_si512((__m512i*)ptr, low.val);
2631 _mm512_storeu_si512((__m512i*)(ptr + 16), high.val);
2632 }
2633}
2634
2635 inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64x8& y,
2636 hal::StoreMode mode=hal::STORE_UNALIGNED )
2637{
2638 v_uint64x8 low, high;
2639 v_zip(x, y, low, high);
2640 if( mode == hal::STORE_ALIGNED_NOCACHE )
2641 {
2642 _mm512_stream_si512((__m512i*)ptr, low.val);
2643 _mm512_stream_si512((__m512i*)(ptr + 8), high.val);
2644 }
2645 else if( mode == hal::STORE_ALIGNED )
2646 {
2647 _mm512_store_si512((__m512i*)ptr, low.val);
2648 _mm512_store_si512((__m512i*)(ptr + 8), high.val);
2649 }
2650 else
2651 {
2652 _mm512_storeu_si512((__m512i*)ptr, low.val);
2653 _mm512_storeu_si512((__m512i*)(ptr + 8), high.val);
2654 }
2655}
2656
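The two-channel stores are built on v_zip plus one of three store flavors selected by hal::StoreMode (streaming, aligned, unaligned). A usage sketch (illustrative names, same assumptions):

// Interleave 32 x and 32 y ushort samples into one x0,y0,x1,y1,... buffer.
inline void merge_xy_sketch(const ushort* x, const ushort* y, ushort* xy)
{
    cv::v_uint16x32 vx = cv::v512_load(x);
    cv::v_uint16x32 vy = cv::v512_load(y);
    cv::v_store_interleave(xy, vx, vy);   // defaults to STORE_UNALIGNED
}
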
2657 inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c,
2658 hal::StoreMode mode=hal::STORE_UNALIGNED )
2659{
2660 #if CV_AVX_512VBMI
2661 __m512i mask0 = _v512_set_epu8(127, 84, 20, 126, 83, 19, 125, 82, 18, 124, 81, 17, 123, 80, 16, 122,
2662 79, 15, 121, 78, 14, 120, 77, 13, 119, 76, 12, 118, 75, 11, 117, 74,
2663 10, 116, 73, 9, 115, 72, 8, 114, 71, 7, 113, 70, 6, 112, 69, 5,
2664 111, 68, 4, 110, 67, 3, 109, 66, 2, 108, 65, 1, 107, 64, 0, 106);
2665 __m512i mask1 = _v512_set_epu8( 21, 42, 105, 20, 41, 104, 19, 40, 103, 18, 39, 102, 17, 38, 101, 16,
2666 37, 100, 15, 36, 99, 14, 35, 98, 13, 34, 97, 12, 33, 96, 11, 32,
2667 95, 10, 31, 94, 9, 30, 93, 8, 29, 92, 7, 28, 91, 6, 27, 90,
2668 5, 26, 89, 4, 25, 88, 3, 24, 87, 2, 23, 86, 1, 22, 85, 0);
2669 __m512i mask2 = _v512_set_epu8(106, 127, 63, 105, 126, 62, 104, 125, 61, 103, 124, 60, 102, 123, 59, 101,
2670 122, 58, 100, 121, 57, 99, 120, 56, 98, 119, 55, 97, 118, 54, 96, 117,
2671 53, 95, 116, 52, 94, 115, 51, 93, 114, 50, 92, 113, 49, 91, 112, 48,
2672 90, 111, 47, 89, 110, 46, 88, 109, 45, 87, 108, 44, 86, 107, 43, 85);
2673 __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val);
2674 __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val);
2675 __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val);
2676
2677 __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1);
2678 __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2);
2679 __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0);
2680 #else
2681 __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001));
2682 __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0);
2683 __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val);
2684 __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val);
2685
2686 __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
2687 5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
2688 __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
2689 47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
2690 __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
2691 26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
2692 __m512i b0g0b2 = _mm512_permutex2var_epi16(b0g0, mask0, r0b1);
2693 __m512i r1b1r0 = _mm512_permutex2var_epi16(b0g0, mask1, g1r1);
2694 __m512i g2r2g1 = _mm512_permutex2var_epi16(r0b1, mask2, g1r1);
2695
2696 __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
2697 __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
2698 __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
2699 #endif
2700
2701 if( mode == hal::STORE_ALIGNED_NOCACHE )
2702 {
2703 _mm512_stream_si512((__m512i*)ptr, bgr0);
2704 _mm512_stream_si512((__m512i*)(ptr + 64), bgr1);
2705 _mm512_stream_si512((__m512i*)(ptr + 128), bgr2);
2706 }
2707 else if( mode == hal::STORE_ALIGNED )
2708 {
2709 _mm512_store_si512((__m512i*)ptr, bgr0);
2710 _mm512_store_si512((__m512i*)(ptr + 64), bgr1);
2711 _mm512_store_si512((__m512i*)(ptr + 128), bgr2);
2712 }
2713 else
2714 {
2715 _mm512_storeu_si512((__m512i*)ptr, bgr0);
2716 _mm512_storeu_si512((__m512i*)(ptr + 64), bgr1);
2717 _mm512_storeu_si512((__m512i*)(ptr + 128), bgr2);
2718 }
2719}
2720
2721 inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c,
2722 hal::StoreMode mode=hal::STORE_UNALIGNED )
2723{
2724 __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
2725 5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
2726 __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
2727 47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
2728 __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
2729 26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
2730 __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val);
2731 __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val);
2732 __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val);
2733
2734 __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
2735 __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
2736 __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
2737
2738 if( mode == hal::STORE_ALIGNED_NOCACHE )
2739 {
2740 _mm512_stream_si512((__m512i*)ptr, bgr0);
2741 _mm512_stream_si512((__m512i*)(ptr + 32), bgr1);
2742 _mm512_stream_si512((__m512i*)(ptr + 64), bgr2);
2743 }
2744 else if( mode == hal::STORE_ALIGNED )
2745 {
2746 _mm512_store_si512((__m512i*)ptr, bgr0);
2747 _mm512_store_si512((__m512i*)(ptr + 32), bgr1);
2748 _mm512_store_si512((__m512i*)(ptr + 64), bgr2);
2749 }
2750 else
2751 {
2752 _mm512_storeu_si512((__m512i*)ptr, bgr0);
2753 _mm512_storeu_si512((__m512i*)(ptr + 32), bgr1);
2754 _mm512_storeu_si512((__m512i*)(ptr + 64), bgr2);
2755 }
2756}
2757
2758 inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c,
2759 hal::StoreMode mode=hal::STORE_UNALIGNED )
2760{
2761 __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21);
2762 __m512i mask1 = _v512_set_epu32(31, 10, 25, 30, 9, 24, 29, 8, 23, 28, 7, 22, 27, 6, 21, 26);
2763 __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val);
2764 __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val);
2765
2766 __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val);
2767 __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2);
2768 __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1);
2769
2770 if( mode == hal::STORE_ALIGNED_NOCACHE )
2771 {
2772 _mm512_stream_si512((__m512i*)ptr, bgr0);
2773 _mm512_stream_si512((__m512i*)(ptr + 16), bgr1);
2774 _mm512_stream_si512((__m512i*)(ptr + 32), bgr2);
2775 }
2776 else if( mode == hal::STORE_ALIGNED )
2777 {
2778 _mm512_store_si512((__m512i*)ptr, bgr0);
2779 _mm512_store_si512((__m512i*)(ptr + 16), bgr1);
2780 _mm512_store_si512((__m512i*)(ptr + 32), bgr2);
2781 }
2782 else
2783 {
2784 _mm512_storeu_si512((__m512i*)ptr, bgr0);
2785 _mm512_storeu_si512((__m512i*)(ptr + 16), bgr1);
2786 _mm512_storeu_si512((__m512i*)(ptr + 32), bgr2);
2787 }
2788}
2789
2790 inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
2791 hal::StoreMode mode=hal::STORE_UNALIGNED )
2792{
2793 __m512i mask0 = _v512_set_epu64( 5, 12, 7, 4, 11, 6, 3, 10);
2794 __m512i mask1 = _v512_set_epu64(15, 7, 4, 14, 6, 3, 13, 5);
2795 __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val);
2796 __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val);
2797
2798 __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val);
2799 __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2);
2800 __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1);
2801
2802 if( mode == hal::STORE_ALIGNED_NOCACHE )
2803 {
2804 _mm512_stream_si512((__m512i*)ptr, bgr0);
2805 _mm512_stream_si512((__m512i*)(ptr + 8), bgr1);
2806 _mm512_stream_si512((__m512i*)(ptr + 16), bgr2);
2807 }
2808 else if( mode == hal::STORE_ALIGNED )
2809 {
2810 _mm512_store_si512((__m512i*)ptr, bgr0);
2811 _mm512_store_si512((__m512i*)(ptr + 8), bgr1);
2812 _mm512_store_si512((__m512i*)(ptr + 16), bgr2);
2813 }
2814 else
2815 {
2816 _mm512_storeu_si512((__m512i*)ptr, bgr0);
2817 _mm512_storeu_si512((__m512i*)(ptr + 8), bgr1);
2818 _mm512_storeu_si512((__m512i*)(ptr + 16), bgr2);
2819 }
2820}
2821
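The three-channel stores pack per-channel vectors back into BGR order. STORE_ALIGNED_NOCACHE uses non-temporal (streaming) stores, so the destination must be 64-byte aligned. A sketch (illustrative, same assumptions):

// Pack three uchar planes into interleaved BGR with non-temporal stores
// (bgr must be 64-byte aligned for this mode).
inline void merge_bgr_sketch(const uchar* b_plane, const uchar* g_plane, const uchar* r_plane, uchar* bgr)
{
    cv::v_uint8x64 b = cv::v512_load(b_plane);
    cv::v_uint8x64 g = cv::v512_load(g_plane);
    cv::v_uint8x64 r = cv::v512_load(r_plane);
    cv::v_store_interleave(bgr, b, g, r, cv::hal::STORE_ALIGNED_NOCACHE);
}
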
2822 inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b,
2823 const v_uint8x64& c, const v_uint8x64& d,
2824 hal::StoreMode mode=hal::STORE_UNALIGNED )
2825{
2826 v_uint8x64 br01, br23, ga01, ga23;
2827 v_zip(a, c, br01, br23);
2828 v_zip(b, d, ga01, ga23);
2829 v_uint8x64 bgra0, bgra1, bgra2, bgra3;
2830 v_zip(br01, ga01, bgra0, bgra1);
2831 v_zip(br23, ga23, bgra2, bgra3);
2832
2833 if( mode == hal::STORE_ALIGNED_NOCACHE )
2834 {
2835 _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2836 _mm512_stream_si512((__m512i*)(ptr + 64), bgra1.val);
2837 _mm512_stream_si512((__m512i*)(ptr + 128), bgra2.val);
2838 _mm512_stream_si512((__m512i*)(ptr + 192), bgra3.val);
2839 }
2840 else if( mode == hal::STORE_ALIGNED )
2841 {
2842 _mm512_store_si512((__m512i*)ptr, bgra0.val);
2843 _mm512_store_si512((__m512i*)(ptr + 64), bgra1.val);
2844 _mm512_store_si512((__m512i*)(ptr + 128), bgra2.val);
2845 _mm512_store_si512((__m512i*)(ptr + 192), bgra3.val);
2846 }
2847 else
2848 {
2849 _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2850 _mm512_storeu_si512((__m512i*)(ptr + 64), bgra1.val);
2851 _mm512_storeu_si512((__m512i*)(ptr + 128), bgra2.val);
2852 _mm512_storeu_si512((__m512i*)(ptr + 192), bgra3.val);
2853 }
2854}
2855
2856 inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b,
2857 const v_uint16x32& c, const v_uint16x32& d,
2858 hal::StoreMode mode=hal::STORE_UNALIGNED )
2859{
2860 v_uint16x32 br01, br23, ga01, ga23;
2861 v_zip(a, c, br01, br23);
2862 v_zip(b, d, ga01, ga23);
2863 v_uint16x32 bgra0, bgra1, bgra2, bgra3;
2864 v_zip(br01, ga01, bgra0, bgra1);
2865 v_zip(br23, ga23, bgra2, bgra3);
2866
2867 if( mode == hal::STORE_ALIGNED_NOCACHE )
2868 {
2869 _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2870 _mm512_stream_si512((__m512i*)(ptr + 32), bgra1.val);
2871 _mm512_stream_si512((__m512i*)(ptr + 64), bgra2.val);
2872 _mm512_stream_si512((__m512i*)(ptr + 96), bgra3.val);
2873 }
2874 else if( mode == hal::STORE_ALIGNED )
2875 {
2876 _mm512_store_si512((__m512i*)ptr, bgra0.val);
2877 _mm512_store_si512((__m512i*)(ptr + 32), bgra1.val);
2878 _mm512_store_si512((__m512i*)(ptr + 64), bgra2.val);
2879 _mm512_store_si512((__m512i*)(ptr + 96), bgra3.val);
2880 }
2881 else
2882 {
2883 _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2884 _mm512_storeu_si512((__m512i*)(ptr + 32), bgra1.val);
2885 _mm512_storeu_si512((__m512i*)(ptr + 64), bgra2.val);
2886 _mm512_storeu_si512((__m512i*)(ptr + 96), bgra3.val);
2887 }
2888}
2889
2890 inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b,
2891 const v_uint32x16& c, const v_uint32x16& d,
2892 hal::StoreMode mode=hal::STORE_UNALIGNED )
2893{
2894 v_uint32x16 br01, br23, ga01, ga23;
2895 v_zip(a, c, br01, br23);
2896 v_zip(b, d, ga01, ga23);
2897 v_uint32x16 bgra0, bgra1, bgra2, bgra3;
2898 v_zip(br01, ga01, bgra0, bgra1);
2899 v_zip(br23, ga23, bgra2, bgra3);
2900
2901 if( mode == hal::STORE_ALIGNED_NOCACHE )
2902 {
2903 _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2904 _mm512_stream_si512((__m512i*)(ptr + 16), bgra1.val);
2905 _mm512_stream_si512((__m512i*)(ptr + 32), bgra2.val);
2906 _mm512_stream_si512((__m512i*)(ptr + 48), bgra3.val);
2907 }
2908 else if( mode == hal::STORE_ALIGNED )
2909 {
2910 _mm512_store_si512((__m512i*)ptr, bgra0.val);
2911 _mm512_store_si512((__m512i*)(ptr + 16), bgra1.val);
2912 _mm512_store_si512((__m512i*)(ptr + 32), bgra2.val);
2913 _mm512_store_si512((__m512i*)(ptr + 48), bgra3.val);
2914 }
2915 else
2916 {
2917 _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2918 _mm512_storeu_si512((__m512i*)(ptr + 16), bgra1.val);
2919 _mm512_storeu_si512((__m512i*)(ptr + 32), bgra2.val);
2920 _mm512_storeu_si512((__m512i*)(ptr + 48), bgra3.val);
2921 }
2922}
2923
2924 inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b,
2925 const v_uint64x8& c, const v_uint64x8& d,
2926 hal::StoreMode mode=hal::STORE_UNALIGNED )
2927{
2928 v_uint64x8 br01, br23, ga01, ga23;
2929 v_zip(a, c, br01, br23);
2930 v_zip(b, d, ga01, ga23);
2931 v_uint64x8 bgra0, bgra1, bgra2, bgra3;
2932 v_zip(br01, ga01, bgra0, bgra1);
2933 v_zip(br23, ga23, bgra2, bgra3);
2934
2935 if( mode == hal::STORE_ALIGNED_NOCACHE )
2936 {
2937 _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2938 _mm512_stream_si512((__m512i*)(ptr + 8), bgra1.val);
2939 _mm512_stream_si512((__m512i*)(ptr + 16), bgra2.val);
2940 _mm512_stream_si512((__m512i*)(ptr + 24), bgra3.val);
2941 }
2942 else if( mode == hal::STORE_ALIGNED )
2943 {
2944 _mm512_store_si512((__m512i*)ptr, bgra0.val);
2945 _mm512_store_si512((__m512i*)(ptr + 8), bgra1.val);
2946 _mm512_store_si512((__m512i*)(ptr + 16), bgra2.val);
2947 _mm512_store_si512((__m512i*)(ptr + 24), bgra3.val);
2948 }
2949 else
2950 {
2951 _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2952 _mm512_storeu_si512((__m512i*)(ptr + 8), bgra1.val);
2953 _mm512_storeu_si512((__m512i*)(ptr + 16), bgra2.val);
2954 _mm512_storeu_si512((__m512i*)(ptr + 24), bgra3.val);
2955 }
2956}
2957
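The four-channel stores need only two rounds of v_zip, so no permutation masks are required. A sketch that attaches a constant alpha plane (illustrative, same assumptions):

// Interleave three uchar planes plus a constant alpha of 255 into BGRA.
inline void merge_bgra_sketch(const uchar* b_plane, const uchar* g_plane, const uchar* r_plane, uchar* bgra)
{
    cv::v_uint8x64 b = cv::v512_load(b_plane);
    cv::v_uint8x64 g = cv::v512_load(g_plane);
    cv::v_uint8x64 r = cv::v512_load(r_plane);
    cv::v_uint8x64 a = cv::v512_setall_u8(255);
    cv::v_store_interleave(bgra, b, g, r, a);
}
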
2958 #define OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2959 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2960 { \
2961 _Tpvec1 a1, b1; \
2962 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2963 a0 = v_reinterpret_as_##suffix0(a1); \
2964 b0 = v_reinterpret_as_##suffix0(b1); \
2965 } \
2966 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2967 { \
2968 _Tpvec1 a1, b1, c1; \
2969 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2970 a0 = v_reinterpret_as_##suffix0(a1); \
2971 b0 = v_reinterpret_as_##suffix0(b1); \
2972 c0 = v_reinterpret_as_##suffix0(c1); \
2973 } \
2974 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2975 { \
2976 _Tpvec1 a1, b1, c1, d1; \
2977 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2978 a0 = v_reinterpret_as_##suffix0(a1); \
2979 b0 = v_reinterpret_as_##suffix0(b1); \
2980 c0 = v_reinterpret_as_##suffix0(c1); \
2981 d0 = v_reinterpret_as_##suffix0(d1); \
2982 } \
2983 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2984 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
2985 { \
2986 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2987 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2988 v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
2989 } \
2990 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
2991 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
2992 { \
2993 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2994 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2995 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2996 v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
2997 } \
2998 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2999 const _Tpvec0& c0, const _Tpvec0& d0, \
3000 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3001 { \
3002 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3003 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3004 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3005 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
3006 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
3007 }
3008
3009OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int8x64, schar, s8, v_uint8x64, uchar, u8)
3010OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int16x32, short, s16, v_uint16x32, ushort, u16)
3011OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int32x16, int, s32, v_uint32x16, unsigned, u32)
3012OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float32x16, float, f32, v_uint32x16, unsigned, u32)
3013OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int64x8, int64, s64, v_uint64x8, uint64, u64)
3014OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8, uint64, u64)
3015
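The macro above forwards the signed, float and double overloads to the unsigned implementations through bit-exact reinterpret casts, so the same entry points work for every lane type. For example (illustrative, same assumptions):

// Split 16 interleaved (x, y) float pairs into two vectors.
inline void split_points_sketch(const float* xy, cv::v_float32x16& x, cv::v_float32x16& y)
{
    cv::v_load_deinterleave(xy, x, y);
}
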
3016
3017
3018
3019 inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_movepi8_mask(a.val); }
3020 inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3021 inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3022 inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3023
3024 inline int64 v_signmask(const v_uint8x64& a) { return v_signmask(v_reinterpret_as_s8(a)); }
3025 inline int v_signmask(const v_uint16x32& a) { return v_signmask(v_reinterpret_as_s16(a)); }
3026 inline int v_signmask(const v_uint32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); }
3027 inline int v_signmask(const v_uint64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); }
3028 inline int v_signmask(const v_float32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); }
3029 inline int v_signmask(const v_float64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); }
3030
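v_signmask packs one bit per lane (bit i set when lane i is negative); note that the 8-bit variants return all 64 bits as int64 while the others fit in an int. A sketch that counts negative lanes (illustrative):

// Count negative lanes of a v_int32x16 via the packed sign mask.
inline int count_negative_sketch(const cv::v_int32x16& a)
{
    int m = cv::v_signmask(a);      // bit i set if lane i < 0
    int n = 0;
    for (; m != 0; m &= m - 1)      // clear lowest set bit (Kernighan)
        ++n;
    return n;
}
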
3032 inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3033 inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); }
3034 inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3035 inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3036 inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3037 inline bool v_check_any(const v_int32x16& a) { return (bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3038 inline bool v_check_all(const v_int64x8& a) { return !(bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3039 inline bool v_check_any(const v_int64x8& a) { return (bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3040
3041 inline bool v_check_all(const v_float32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); }
3042 inline bool v_check_any(const v_float32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
3043 inline bool v_check_all(const v_float64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); }
3044 inline bool v_check_any(const v_float64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
3045 inline bool v_check_all(const v_uint8x64& a) { return v_check_all(v_reinterpret_as_s8(a)); }
3046 inline bool v_check_all(const v_uint16x32& a) { return v_check_all(v_reinterpret_as_s16(a)); }
3047 inline bool v_check_all(const v_uint32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); }
3048 inline bool v_check_all(const v_uint64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); }
3049 inline bool v_check_any(const v_uint8x64& a) { return v_check_any(v_reinterpret_as_s8(a)); }
3050 inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret_as_s16(a)); }
3051 inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
3052 inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
3053
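v_check_all / v_check_any treat a lane as "true" when its sign bit is set, which is exactly what the comparison operators produce (all-ones lanes). A typical early-exit sketch (illustrative):

// True when every lane of v is strictly below the threshold.
inline bool all_below_sketch(const cv::v_float32x16& v, float thr)
{
    return cv::v_check_all(v < cv::v512_setall_f32(thr));
}
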
3054 inline int v_scan_forward(const v_int8x64& a)
3055{
3056 int64 mask = _mm512_movepi8_mask(a.val);
3057 int mask32 = (int)mask;
3058 return mask != 0 ? mask32 != 0 ? trailingZeros32(mask32) : 32 + trailingZeros32((int)(mask >> 32)) : 0;
3059}
3060 inline int v_scan_forward(const v_uint8x64& a) { return v_scan_forward(v_reinterpret_as_s8(a)); }
3061 inline int v_scan_forward(const v_int16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
3062 inline int v_scan_forward(const v_uint16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
3063 inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
3064 inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
3065 inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
3066 inline int v_scan_forward(const v_int64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
3067 inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
3068 inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
3069
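v_scan_forward returns the index of the first lane whose sign bit is set (the wider types reuse the 16-bit sign mask and divide the bit position by the lane-width ratio). Combined with a comparison it locates the first matching element (illustrative):

// Index of the first lane of v that is strictly below thr, or -1 if none.
inline int find_first_below_sketch(const cv::v_int32x16& v, int thr)
{
    cv::v_int32x16 hit = v < cv::v512_setall_s32(thr);  // all-ones lanes where true
    return cv::v_check_any(hit) ? cv::v_scan_forward(hit) : -1;
}
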
3070 inline void v512_cleanup() { _mm256_zeroall(); }
3071
3072CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3073
3075
3076} // cv::
3077
3078 #endif // OPENCV_HAL_INTRIN_AVX512_HPP