#ifndef OPENCV_HAL_INTRIN_AVX512_HPP
#define OPENCV_HAL_INTRIN_AVX512_HPP

#if defined(_MSC_VER) && (_MSC_VER < 1920)
# pragma warning(disable:4146)
# pragma warning(disable:4309)
# pragma warning(disable:4310)
#endif

#define CVT_ROUND_MODES_IMPLEMENTED 0

#define CV_SIMD512_64F 1
#define CV_SIMD512_FP16 0
#define _v512_set_epu64(a7, a6, a5, a4, a3, a2, a1, a0) _mm512_set_epi64((int64)(a7),(int64)(a6),(int64)(a5),(int64)(a4),(int64)(a3),(int64)(a2),(int64)(a1),(int64)(a0))
#define _v512_set_epu32(a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
        _mm512_set_epi64(((int64)(a15)<<32)|(int64)(a14), ((int64)(a13)<<32)|(int64)(a12), ((int64)(a11)<<32)|(int64)(a10), ((int64)( a9)<<32)|(int64)( a8), \
                         ((int64)( a7)<<32)|(int64)( a6), ((int64)( a5)<<32)|(int64)( a4), ((int64)( a3)<<32)|(int64)( a2), ((int64)( a1)<<32)|(int64)( a0))
#define _v512_set_epu16(a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
                        a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
        _v512_set_epu32(((unsigned)(a31)<<16)|(unsigned)(a30), ((unsigned)(a29)<<16)|(unsigned)(a28), ((unsigned)(a27)<<16)|(unsigned)(a26), ((unsigned)(a25)<<16)|(unsigned)(a24), \
                        ((unsigned)(a23)<<16)|(unsigned)(a22), ((unsigned)(a21)<<16)|(unsigned)(a20), ((unsigned)(a19)<<16)|(unsigned)(a18), ((unsigned)(a17)<<16)|(unsigned)(a16), \
                        ((unsigned)(a15)<<16)|(unsigned)(a14), ((unsigned)(a13)<<16)|(unsigned)(a12), ((unsigned)(a11)<<16)|(unsigned)(a10), ((unsigned)( a9)<<16)|(unsigned)( a8), \
                        ((unsigned)( a7)<<16)|(unsigned)( a6), ((unsigned)( a5)<<16)|(unsigned)( a4), ((unsigned)( a3)<<16)|(unsigned)( a2), ((unsigned)( a1)<<16)|(unsigned)( a0))
#define _v512_set_epu8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
                       a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
                       a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
                       a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
        _v512_set_epu32(((unsigned)(a63)<<24)|((unsigned)(a62)<<16)|((unsigned)(a61)<<8)|(unsigned)(a60),((unsigned)(a59)<<24)|((unsigned)(a58)<<16)|((unsigned)(a57)<<8)|(unsigned)(a56), \
                        ((unsigned)(a55)<<24)|((unsigned)(a54)<<16)|((unsigned)(a53)<<8)|(unsigned)(a52),((unsigned)(a51)<<24)|((unsigned)(a50)<<16)|((unsigned)(a49)<<8)|(unsigned)(a48), \
                        ((unsigned)(a47)<<24)|((unsigned)(a46)<<16)|((unsigned)(a45)<<8)|(unsigned)(a44),((unsigned)(a43)<<24)|((unsigned)(a42)<<16)|((unsigned)(a41)<<8)|(unsigned)(a40), \
                        ((unsigned)(a39)<<24)|((unsigned)(a38)<<16)|((unsigned)(a37)<<8)|(unsigned)(a36),((unsigned)(a35)<<24)|((unsigned)(a34)<<16)|((unsigned)(a33)<<8)|(unsigned)(a32), \
                        ((unsigned)(a31)<<24)|((unsigned)(a30)<<16)|((unsigned)(a29)<<8)|(unsigned)(a28),((unsigned)(a27)<<24)|((unsigned)(a26)<<16)|((unsigned)(a25)<<8)|(unsigned)(a24), \
                        ((unsigned)(a23)<<24)|((unsigned)(a22)<<16)|((unsigned)(a21)<<8)|(unsigned)(a20),((unsigned)(a19)<<24)|((unsigned)(a18)<<16)|((unsigned)(a17)<<8)|(unsigned)(a16), \
                        ((unsigned)(a15)<<24)|((unsigned)(a14)<<16)|((unsigned)(a13)<<8)|(unsigned)(a12),((unsigned)(a11)<<24)|((unsigned)(a10)<<16)|((unsigned)( a9)<<8)|(unsigned)( a8), \
                        ((unsigned)( a7)<<24)|((unsigned)( a6)<<16)|((unsigned)( a5)<<8)|(unsigned)( a4),((unsigned)( a3)<<24)|((unsigned)( a2)<<16)|((unsigned)( a1)<<8)|(unsigned)( a0))
#define _v512_set_epi8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
                       a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
                       a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
                       a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
        _v512_set_epu8((uchar)(a63), (uchar)(a62), (uchar)(a61), (uchar)(a60), (uchar)(a59), (uchar)(a58), (uchar)(a57), (uchar)(a56), \
                       (uchar)(a55), (uchar)(a54), (uchar)(a53), (uchar)(a52), (uchar)(a51), (uchar)(a50), (uchar)(a49), (uchar)(a48), \
                       (uchar)(a47), (uchar)(a46), (uchar)(a45), (uchar)(a44), (uchar)(a43), (uchar)(a42), (uchar)(a41), (uchar)(a40), \
                       (uchar)(a39), (uchar)(a38), (uchar)(a37), (uchar)(a36), (uchar)(a35), (uchar)(a34), (uchar)(a33), (uchar)(a32), \
                       (uchar)(a31), (uchar)(a30), (uchar)(a29), (uchar)(a28), (uchar)(a27), (uchar)(a26), (uchar)(a25), (uchar)(a24), \
                       (uchar)(a23), (uchar)(a22), (uchar)(a21), (uchar)(a20), (uchar)(a19), (uchar)(a18), (uchar)(a17), (uchar)(a16), \
                       (uchar)(a15), (uchar)(a14), (uchar)(a13), (uchar)(a12), (uchar)(a11), (uchar)(a10), (uchar)( a9), (uchar)( a8), \
                       (uchar)( a7), (uchar)( a6), (uchar)( a5), (uchar)( a4), (uchar)( a3), (uchar)( a2), (uchar)( a1), (uchar)( a0))
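
// Note: the _v512_set_* helpers above take their arguments most-significant-lane
// first (a63..a0 / a7..a0), mirroring the _mm512_set_* intrinsics, so lane 0 of the
// result is the last argument. An illustrative sketch (not part of the original header):
//
//   __m512i ramp64 = _v512_set_epu64(7, 6, 5, 4, 3, 2, 1, 0);   // 64-bit lane i holds value i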
#ifndef _mm512_cvtpd_pslo
#ifdef _mm512_zextsi256_si512
#define _mm512_cvtpd_pslo(a) _mm512_zextps256_ps512(_mm512_cvtpd_ps(a))
#else
#define _mm512_cvtpd_pslo(a) _mm512_castps256_ps512(_mm512_cvtpd_ps(a))
#endif
#endif
inline __m512i _v512_combine(const __m256i& lo, const __m256i& hi)
{ return _mm512_inserti32x8(_mm512_castsi256_si512(lo), hi, 1); }

inline __m512 _v512_combine(const __m256& lo, const __m256& hi)
{ return _mm512_insertf32x8(_mm512_castps256_ps512(lo), hi, 1); }

inline __m512d _v512_combine(const __m256d& lo, const __m256d& hi)
{ return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1); }

inline int _v_cvtsi512_si32(const __m512i& a)
{ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); }

inline __m256i _v512_extract_high(const __m512i& v)
{ return _mm512_extracti32x8_epi32(v, 1); }

inline __m256 _v512_extract_high(const __m512& v)
{ return _mm512_extractf32x8_ps(v, 1); }

inline __m256d _v512_extract_high(const __m512d& v)
{ return _mm512_extractf64x4_pd(v, 1); }

inline __m256i _v512_extract_low(const __m512i& v)
{ return _mm512_castsi512_si256(v); }

inline __m256 _v512_extract_low(const __m512& v)
{ return _mm512_castps512_ps256(v); }

inline __m256d _v512_extract_low(const __m512d& v)
{ return _mm512_castpd512_pd256(v); }

inline __m512i _v512_insert(const __m512i& a, const __m256i& b)
{ return _mm512_inserti32x8(a, b, 0); }

inline __m512 _v512_insert(const __m512& a, const __m256& b)
{ return _mm512_insertf32x8(a, b, 0); }

inline __m512d _v512_insert(const __m512d& a, const __m256d& b)
{ return _mm512_insertf64x4(a, b, 0); }
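
// The helpers above split a 512-bit register into its 256-bit halves and back:
// _v512_extract_low/_v512_extract_high return the low/high half, _v512_combine
// concatenates two halves, and _v512_insert overwrites the low half.
// A minimal usage sketch (illustrative only, not in the original header):
//
//   __m512i v  = _mm512_set1_epi32(7);
//   __m256i lo = _v512_extract_low(v);      // lanes 0..7
//   __m256i hi = _v512_extract_high(v);     // lanes 8..15
//   __m512i rt = _v512_combine(lo, hi);     // round-trips back to v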
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
struct v_uint8x64
{
    typedef uchar lane_type;
    enum { nlanes = 64 };
    __m512i val;

    explicit v_uint8x64(__m512i v) : val(v) {}
    v_uint8x64(uchar v0, uchar v1, uchar v2, uchar v3,
               uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11,
               uchar v12, uchar v13, uchar v14, uchar v15,
               uchar v16, uchar v17, uchar v18, uchar v19,
               uchar v20, uchar v21, uchar v22, uchar v23,
               uchar v24, uchar v25, uchar v26, uchar v27,
               uchar v28, uchar v29, uchar v30, uchar v31,
               uchar v32, uchar v33, uchar v34, uchar v35,
               uchar v36, uchar v37, uchar v38, uchar v39,
               uchar v40, uchar v41, uchar v42, uchar v43,
               uchar v44, uchar v45, uchar v46, uchar v47,
               uchar v48, uchar v49, uchar v50, uchar v51,
               uchar v52, uchar v53, uchar v54, uchar v55,
               uchar v56, uchar v57, uchar v58, uchar v59,
               uchar v60, uchar v61, uchar v62, uchar v63)
    {
        val = _v512_set_epu8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
                             v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
                             v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
                             v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
    }
    v_uint8x64() {}

    static inline v_uint8x64 zero() { return v_uint8x64(_mm512_setzero_si512()); }

    uchar get0() const { return (uchar)_v_cvtsi512_si32(val); }
};
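
// Usage sketch for the 64-lane unsigned 8-bit vector defined above (illustrative
// only; it assumes the rest of the universal-intrinsics API declared later in
// this header):
//
//   v_uint8x64 z = v_uint8x64::zero();   // all 64 lanes are 0
//   uchar first  = z.get0();             // reads lane 0 -> 0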
struct v_int8x64
{
    typedef schar lane_type;
    enum { nlanes = 64 };
    __m512i val;

    explicit v_int8x64(__m512i v) : val(v) {}
    v_int8x64(schar v0, schar v1, schar v2, schar v3,
              schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11,
              schar v12, schar v13, schar v14, schar v15,
              schar v16, schar v17, schar v18, schar v19,
              schar v20, schar v21, schar v22, schar v23,
              schar v24, schar v25, schar v26, schar v27,
              schar v28, schar v29, schar v30, schar v31,
              schar v32, schar v33, schar v34, schar v35,
              schar v36, schar v37, schar v38, schar v39,
              schar v40, schar v41, schar v42, schar v43,
              schar v44, schar v45, schar v46, schar v47,
              schar v48, schar v49, schar v50, schar v51,
              schar v52, schar v53, schar v54, schar v55,
              schar v56, schar v57, schar v58, schar v59,
              schar v60, schar v61, schar v62, schar v63)
    {
        val = _v512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
                             v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
                             v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
                             v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
    }
    v_int8x64() {}

    static inline v_int8x64 zero() { return v_int8x64(_mm512_setzero_si512()); }

    schar get0() const { return (schar)_v_cvtsi512_si32(val); }
};
struct v_uint16x32
{
    typedef ushort lane_type;
    enum { nlanes = 32 };
    __m512i val;

    explicit v_uint16x32(__m512i v) : val(v) {}
    v_uint16x32(ushort v0, ushort v1, ushort v2, ushort v3,
                ushort v4, ushort v5, ushort v6, ushort v7,
                ushort v8, ushort v9, ushort v10, ushort v11,
                ushort v12, ushort v13, ushort v14, ushort v15,
                ushort v16, ushort v17, ushort v18, ushort v19,
                ushort v20, ushort v21, ushort v22, ushort v23,
                ushort v24, ushort v25, ushort v26, ushort v27,
                ushort v28, ushort v29, ushort v30, ushort v31)
    {
        val = _v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
                              v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
    }
    v_uint16x32() {}

    static inline v_uint16x32 zero() { return v_uint16x32(_mm512_setzero_si512()); }

    ushort get0() const { return (ushort)_v_cvtsi512_si32(val); }
};
struct v_int16x32
{
    typedef short lane_type;
    enum { nlanes = 32 };
    __m512i val;

    explicit v_int16x32(__m512i v) : val(v) {}
    v_int16x32(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7,
               short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15,
               short v16, short v17, short v18, short v19, short v20, short v21, short v22, short v23,
               short v24, short v25, short v26, short v27, short v28, short v29, short v30, short v31)
    {
        val = _v512_set_epu16((ushort)v31, (ushort)v30, (ushort)v29, (ushort)v28, (ushort)v27, (ushort)v26, (ushort)v25, (ushort)v24,
                              (ushort)v23, (ushort)v22, (ushort)v21, (ushort)v20, (ushort)v19, (ushort)v18, (ushort)v17, (ushort)v16,
                              (ushort)v15, (ushort)v14, (ushort)v13, (ushort)v12, (ushort)v11, (ushort)v10, (ushort)v9 , (ushort)v8,
                              (ushort)v7 , (ushort)v6 , (ushort)v5 , (ushort)v4 , (ushort)v3 , (ushort)v2 , (ushort)v1 , (ushort)v0);
    }
    v_int16x32() {}

    static inline v_int16x32 zero() { return v_int16x32(_mm512_setzero_si512()); }

    short get0() const { return (short)_v_cvtsi512_si32(val); }
};
struct v_uint32x16
{
    typedef unsigned lane_type;
    enum { nlanes = 16 };
    __m512i val;

    explicit v_uint32x16(__m512i v) : val(v) {}
    v_uint32x16(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
                unsigned v4, unsigned v5, unsigned v6, unsigned v7,
                unsigned v8, unsigned v9, unsigned v10, unsigned v11,
                unsigned v12, unsigned v13, unsigned v14, unsigned v15)
    {
        val = _mm512_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3, (int)v4, (int)v5, (int)v6, (int)v7,
                                (int)v8, (int)v9, (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15);
    }
    v_uint32x16() {}

    static inline v_uint32x16 zero() { return v_uint32x16(_mm512_setzero_si512()); }

    unsigned get0() const { return (unsigned)_v_cvtsi512_si32(val); }
};
struct v_int32x16
{
    typedef int lane_type;
    enum { nlanes = 16 };
    __m512i val;

    explicit v_int32x16(__m512i v) : val(v) {}
    v_int32x16(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
               int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15)
    {
        val = _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
    }
    v_int32x16() {}

    static inline v_int32x16 zero() { return v_int32x16(_mm512_setzero_si512()); }

    int get0() const { return _v_cvtsi512_si32(val); }
};
struct v_float32x16
{
    typedef float lane_type;
    enum { nlanes = 16 };
    __m512 val;

    explicit v_float32x16(__m512 v) : val(v) {}
    v_float32x16(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7,
                 float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15)
    {
        val = _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
    }
    v_float32x16() {}

    static inline v_float32x16 zero() { return v_float32x16(_mm512_setzero_ps()); }

    float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); }
};
struct v_uint64x8
{
    typedef uint64 lane_type;
    enum { nlanes = 8 };
    __m512i val;

    explicit v_uint64x8(__m512i v) : val(v) {}
    v_uint64x8(uint64 v0, uint64 v1, uint64 v2, uint64 v3, uint64 v4, uint64 v5, uint64 v6, uint64 v7)
    { val = _mm512_setr_epi64((int64)v0, (int64)v1, (int64)v2, (int64)v3, (int64)v4, (int64)v5, (int64)v6, (int64)v7); }
    v_uint64x8() {}

    static inline v_uint64x8 zero() { return v_uint64x8(_mm512_setzero_si512()); }

    uint64 get0() const
    {
#if defined __x86_64__ || defined _M_X64
        return (uint64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
#else
        int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
        int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
#endif
    }
};
struct v_int64x8
{
    typedef int64 lane_type;
    enum { nlanes = 8 };
    __m512i val;

    explicit v_int64x8(__m512i v) : val(v) {}
    v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7)
    { val = _mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7); }
    v_int64x8() {}

    static inline v_int64x8 zero() { return v_int64x8(_mm512_setzero_si512()); }

    int64 get0() const
    {
#if defined __x86_64__ || defined _M_X64
        return (int64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
#else
        int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
        int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
#endif
    }
};
struct v_float64x8
{
    typedef double lane_type;
    enum { nlanes = 8 };
    __m512d val;

    explicit v_float64x8(__m512d v) : val(v) {}
    v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7)
    { val = _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); }
    v_float64x8() {}

    static inline v_float64x8 zero() { return v_float64x8(_mm512_setzero_pd()); }

    double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); }
};
#define OPENCV_HAL_IMPL_AVX512_LOADSTORE(_Tpvec, _Tp) \
    inline _Tpvec v512_load(const _Tp* ptr) \
    { return _Tpvec(_mm512_loadu_si512((const __m512i*)ptr)); } \
    inline _Tpvec v512_load_aligned(const _Tp* ptr) \
    { return _Tpvec(_mm512_load_si512((const __m512i*)ptr)); } \
    inline _Tpvec v512_load_low(const _Tp* ptr) \
    { \
        __m256i v256 = _mm256_loadu_si256((const __m256i*)ptr); \
        return _Tpvec(_mm512_castsi256_si512(v256)); \
    } \
    inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    { \
        __m256i vlo = _mm256_loadu_si256((const __m256i*)ptr0); \
        __m256i vhi = _mm256_loadu_si256((const __m256i*)ptr1); \
        return _Tpvec(_v512_combine(vlo, vhi)); \
    } \
    inline void v_store(_Tp* ptr, const _Tpvec& a) \
    { _mm512_storeu_si512((__m512i*)ptr, a.val); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { _mm512_store_si512((__m512i*)ptr, a.val); } \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
    { _mm512_stream_si512((__m512i*)ptr, a.val); } \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    { \
        if( mode == hal::STORE_UNALIGNED ) \
            _mm512_storeu_si512((__m512i*)ptr, a.val); \
        else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
            _mm512_stream_si512((__m512i*)ptr, a.val); \
        else \
            _mm512_store_si512((__m512i*)ptr, a.val); \
    } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_low(a.val)); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_high(a.val)); }

OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint8x64,  uchar)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int8x64,   schar)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint16x32, ushort)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int16x32,  short)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint32x16, unsigned)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int32x16,  int)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint64x8,  uint64)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int64x8,   int64)
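
// The macro above instantiates the whole load/store family for each integer
// vector type: v512_load/v512_load_aligned, v512_load_low/v512_load_halves,
// v_store/v_store_aligned/v_store_aligned_nocache, plus the low/high half stores.
// Aligned variants require 64-byte alignment. A hedged usage sketch (the buffer
// names are hypothetical, not part of the header):
//
//   uchar CV_DECL_ALIGNED(64) buf[64]; uchar out[64];
//   v_uint8x64 v = v512_load_aligned(buf);   // 64 uchar lanes
//   v_store(out, v);                         // unaligned store is always safe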
#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
    inline _Tpvec v512_load(const _Tp* ptr) \
    { return _Tpvec(_mm512_loadu_##suffix(ptr)); } \
    inline _Tpvec v512_load_aligned(const _Tp* ptr) \
    { return _Tpvec(_mm512_load_##suffix(ptr)); } \
    inline _Tpvec v512_load_low(const _Tp* ptr) \
    { \
        return _Tpvec(_mm512_cast##suffix##256_##suffix##512 \
                     (_mm256_loadu_##suffix(ptr))); \
    } \
    inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    { \
        halfreg vlo = _mm256_loadu_##suffix(ptr0); \
        halfreg vhi = _mm256_loadu_##suffix(ptr1); \
        return _Tpvec(_v512_combine(vlo, vhi)); \
    } \
    inline void v_store(_Tp* ptr, const _Tpvec& a) \
    { _mm512_storeu_##suffix(ptr, a.val); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { _mm512_store_##suffix(ptr, a.val); } \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
    { _mm512_stream_##suffix(ptr, a.val); } \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    { \
        if( mode == hal::STORE_UNALIGNED ) \
            _mm512_storeu_##suffix(ptr, a.val); \
        else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
            _mm512_stream_##suffix(ptr, a.val); \
        else \
            _mm512_store_##suffix(ptr, a.val); \
    } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { _mm256_storeu_##suffix(ptr, _v512_extract_low(a.val)); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    { _mm256_storeu_##suffix(ptr, _v512_extract_high(a.val)); }

OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float32x16, float,  ps, __m256)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8,  double, pd, __m256d)
#define OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, _Tpvecf, suffix, cast) \
    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
    { return _Tpvec(cast(a.val)); }

#define OPENCV_HAL_IMPL_AVX512_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
    inline _Tpvec v512_setzero_##suffix() \
    { return _Tpvec(_mm512_setzero_si512()); } \
    inline _Tpvec v512_setall_##suffix(_Tp v) \
    { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64,    suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8,    suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float32x16, suffix, _mm512_castps_si512) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float64x8,  suffix, _mm512_castpd_si512)

OPENCV_HAL_IMPL_AVX512_INIT(v_uint8x64,  uchar,    u8,  epi8,  char)
OPENCV_HAL_IMPL_AVX512_INIT(v_int8x64,   schar,    s8,  epi8,  char)
OPENCV_HAL_IMPL_AVX512_INIT(v_uint16x32, ushort,   u16, epi16, short)
OPENCV_HAL_IMPL_AVX512_INIT(v_int16x32,  short,    s16, epi16, short)
OPENCV_HAL_IMPL_AVX512_INIT(v_uint32x16, unsigned, u32, epi32, int)
OPENCV_HAL_IMPL_AVX512_INIT(v_int32x16,  int,      s32, epi32, int)
OPENCV_HAL_IMPL_AVX512_INIT(v_uint64x8,  uint64,   u64, epi64, int64)
OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8,   int64,    s64, epi64, int64)
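
// v512_setzero_*/v512_setall_* create constants, and the CAST instantiations
// provide bit-exact v_reinterpret_as_* conversions between all integer types
// (and from the float types via _mm512_castps_si512/_mm512_castpd_si512).
// Illustrative sketch (not in the original header):
//
//   v_int32x16  ones  = v512_setall_s32(1);
//   v_uint32x16 uones = v_reinterpret_as_u32(ones);   // same bits, unsigned view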
#define OPENCV_HAL_IMPL_AVX512_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
    inline _Tpvec v512_setzero_##suffix() \
    { return _Tpvec(_mm512_setzero_##zsuffix()); } \
    inline _Tpvec v512_setall_##suffix(_Tp v) \
    { return _Tpvec(_mm512_set1_##zsuffix(v)); } \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64,  suffix, cast) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64,   suffix, cast) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32,  suffix, cast) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, cast) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16,  suffix, cast) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8,  suffix, cast) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8,   suffix, cast)

OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float32x16, float,  f32, ps, _mm512_castsi512_ps)
OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float64x8,  double, f64, pd, _mm512_castsi512_pd)
inline v_float32x16 v_reinterpret_as_f32(const v_float32x16& a)
{ return a; }
inline v_float32x16 v_reinterpret_as_f32(const v_float64x8& a)
{ return v_float32x16(_mm512_castpd_ps(a.val)); }

inline v_float64x8 v_reinterpret_as_f64(const v_float64x8& a)
{ return a; }
inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a)
{ return v_float64x8(_mm512_castps_pd(a.val)); }
inline v_float32x16 v512_load_expand(const float16_t* ptr)
{
    return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr)));
}

inline void v_pack_store(float16_t* ptr, const v_float32x16& a)
{
    __m256i ah = _mm512_cvtps_ph(a.val, 0);
    _mm256_storeu_si256((__m256i*)ptr, ah);
}
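
// FP16 helpers: v512_load_expand widens 16 half-precision values to a
// v_float32x16, and v_pack_store rounds back down (rounding mode 0 =
// round-to-nearest-even). Illustrative round-trip sketch (the array names are
// hypothetical, not part of the header):
//
//   float16_t h[16]; float16_t h2[16];
//   v_float32x16 f = v512_load_expand(h);
//   v_pack_store(h2, f);                   // h2 matches h up to half precision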
inline void v_zip(const v_int8x64& a, const v_int8x64& b, v_int8x64& ab0, v_int8x64& ab1)
{
#if CV_AVX_512VBMI
    __m512i mask0 = _v512_set_epu8( 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24,
                                    87, 23, 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16,
                                    79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, 73,  9, 72,  8,
                                    71,  7, 70,  6, 69,  5, 68,  4, 67,  3, 66,  2, 65,  1, 64,  0);
    ab0 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu8(127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
                                   119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
                                   111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
                                   103, 39, 102, 38, 101, 37, 100, 36,  99, 35,  98, 34,  97, 33,  96, 32);
    ab1 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask1, b.val));
#else
    __m512i low  = _mm512_unpacklo_epi8(a.val, b.val);
    __m512i high = _mm512_unpackhi_epi8(a.val, b.val);
    ab0 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(11, 10, 3, 2,  9,  8, 1, 0), high));
    ab1 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(15, 14, 7, 6, 13, 12, 5, 4), high));
#endif
}
inline void v_zip(const v_int16x32& a, const v_int16x32& b, v_int16x32& ab0, v_int16x32& ab1)
{
    __m512i mask0 = _v512_set_epu16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41,  9, 40,  8,
                                    39,  7, 38,  6, 37,  5, 36,  4, 35,  3, 34,  2, 33,  1, 32,  0);
    ab0 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24,
                                    55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16);
    ab1 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask1, b.val));
}
inline void v_zip(const v_int32x16& a, const v_int32x16& b, v_int32x16& ab0, v_int32x16& ab1)
{
    __m512i mask0 = _v512_set_epu32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    ab0 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    ab1 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask1, b.val));
}
inline void v_zip(const v_int64x8& a, const v_int64x8& b, v_int64x8& ab0, v_int64x8& ab1)
{
    __m512i mask0 = _v512_set_epu64(11, 3, 10, 2, 9, 1, 8, 0);
    ab0 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu64(15, 7, 14, 6, 13, 5, 12, 4);
    ab1 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask1, b.val));
}

inline void v_zip(const v_uint8x64& a, const v_uint8x64& b, v_uint8x64& ab0, v_uint8x64& ab1)
{
    v_int8x64 i0, i1;
    v_zip(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b), i0, i1);
    ab0 = v_reinterpret_as_u8(i0);
    ab1 = v_reinterpret_as_u8(i1);
}
inline void v_zip(const v_uint16x32& a, const v_uint16x32& b, v_uint16x32& ab0, v_uint16x32& ab1)
{
    v_int16x32 i0, i1;
    v_zip(v_reinterpret_as_s16(a), v_reinterpret_as_s16(b), i0, i1);
    ab0 = v_reinterpret_as_u16(i0);
    ab1 = v_reinterpret_as_u16(i1);
}
inline void v_zip(const v_uint32x16& a, const v_uint32x16& b, v_uint32x16& ab0, v_uint32x16& ab1)
{
    v_int32x16 i0, i1;
    v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
    ab0 = v_reinterpret_as_u32(i0);
    ab1 = v_reinterpret_as_u32(i1);
}
inline void v_zip(const v_uint64x8& a, const v_uint64x8& b, v_uint64x8& ab0, v_uint64x8& ab1)
{
    v_int64x8 i0, i1;
    v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
    ab0 = v_reinterpret_as_u64(i0);
    ab1 = v_reinterpret_as_u64(i1);
}
inline void v_zip(const v_float32x16& a, const v_float32x16& b, v_float32x16& ab0, v_float32x16& ab1)
{
    v_int32x16 i0, i1;
    v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
    ab0 = v_reinterpret_as_f32(i0);
    ab1 = v_reinterpret_as_f32(i1);
}
inline void v_zip(const v_float64x8& a, const v_float64x8& b, v_float64x8& ab0, v_float64x8& ab1)
{
    v_int64x8 i0, i1;
    v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
    ab0 = v_reinterpret_as_f64(i0);
    ab1 = v_reinterpret_as_f64(i1);
}
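
// v_zip interleaves two vectors lane by lane: ab0 receives the interleaved low
// halves (a0,b0,a1,b1,...), ab1 the interleaved high halves. The unsigned and
// float overloads simply forward to the signed ones through bit reinterpretation.
// Illustrative sketch (not in the original header):
//
//   v_int32x16 a = v512_setall_s32(1), b = v512_setall_s32(2), lo, hi;
//   v_zip(a, b, lo, hi);   // lo = (1,2,1,2,...), hi = (1,2,1,2,...)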
#define OPENCV_HAL_IMPL_AVX512_COMBINE(_Tpvec, suffix) \
    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val))); } \
    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_v512_insert(b.val, _v512_extract_high(a.val))); } \
    inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
                            _Tpvec& c, _Tpvec& d) \
    { \
        c.val = _v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val)); \
        d.val = _v512_insert(b.val, _v512_extract_high(a.val)); \
    }

OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint8x64,   epi8)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_int8x64,    epi8)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint16x32,  epi16)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_int16x32,   epi16)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint32x16,  epi32)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_int32x16,   epi32)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint64x8,   epi64)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_int64x8,    epi64)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_float64x8,  pd)
#define OPENCV_HAL_IMPL_AVX512_BIN_FUNC(func, _Tpvec, intrin) \
    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint8x64,  _mm512_add_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int8x64,   _mm512_add_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint16x32, _mm512_add_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int16x32,  _mm512_add_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint8x64,  _mm512_sub_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int8x64,   _mm512_sub_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint16x32, _mm512_sub_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int16x32,  _mm512_sub_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_uint16x32, _mm512_mullo_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_int16x32,  _mm512_mullo_epi16)
inline v_uint8x64 v_mul_wrap(const v_uint8x64& a, const v_uint8x64& b)
{
    __m512i ad = _mm512_srai_epi16(a.val, 8);
    __m512i bd = _mm512_srai_epi16(b.val, 8);
    __m512i p0 = _mm512_mullo_epi16(a.val, b.val);
    __m512i p1 = _mm512_slli_epi16(_mm512_mullo_epi16(ad, bd), 8);
    return v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, p0, p1));
}
inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b)
{
    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}
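
// The 8-bit v_mul_wrap above has no direct _mm512_mullo_epi8, so it computes the
// even and odd byte lanes as 16-bit products and blends them with the 0xAAAA...
// odd-lane mask; results wrap around modulo 256. Illustrative sketch:
//
//   v_uint8x64 x = v512_setall_u8(200), y = v512_setall_u8(2);
//   v_uint8x64 p = v_mul_wrap(x, y);   // every lane is 400 % 256 = 144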
#define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(intrin(a.val, b.val)); } \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    { a.val = intrin(a.val, b.val); return a; }

OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16,  _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16,  _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8,  _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8,  _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8,   _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8,   _mm512_sub_epi64)

OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16,  _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8,  _mm512_mullo_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8,   _mm512_mullo_epi64)

OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64,  _mm512_adds_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64,  _mm512_subs_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64,   _mm512_adds_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64,   _mm512_subs_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32,  _mm512_adds_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32,  _mm512_subs_epi16)

OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8,  _mm512_add_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8,  _mm512_sub_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8,  _mm512_mul_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8,  _mm512_div_pd)
inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b)
{
    v_uint16x32 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b)
{
    v_int16x32 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
{
    __m512i pl = _mm512_mullo_epi16(a.val, b.val);
    __m512i ph = _mm512_mulhi_epu16(a.val, b.val);
    __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
    __m512i p1 = _mm512_unpackhi_epi16(pl, ph);

    const __m512i m = _mm512_set1_epi32(65535);
    return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m)));
}
inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
{
    __m512i pl = _mm512_mullo_epi16(a.val, b.val);
    __m512i ph = _mm512_mulhi_epi16(a.val, b.val);
    __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
    __m512i p1 = _mm512_unpackhi_epi16(pl, ph);
    return v_int16x32(_mm512_packs_epi32(p0, p1));
}

inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b)
{ a = a * b; return a; }
inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b)
{ a = a * b; return a; }
inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b)
{ a = a * b; return a; }
inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b)
{ a = a * b; return a; }

inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); }
inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); }
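
// Unlike v_mul_wrap, the 8-bit and 16-bit operator* above saturate: the 16-bit
// (or 32-bit) intermediate products are clamped to the destination range before
// packing back down. Illustrative sketch (not in the original header):
//
//   v_uint16x32 x = v512_setall_u16(60000), y = v512_setall_u16(2);
//   v_uint16x32 p = x * y;   // lanes saturate to 65535 instead of wrapping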
inline void v_mul_expand(const v_uint8x64& a, const v_uint8x64& b,
                         v_uint16x32& c, v_uint16x32& d)
{
    v_uint16x32 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int8x64& a, const v_int8x64& b,
                         v_int16x32& c, v_int16x32& d)
{
    v_int16x32 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int16x32& a, const v_int16x32& b,
                         v_int32x16& c, v_int32x16& d)
{
    v_int16x32 v0, v1;
    v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);

    c = v_reinterpret_as_s32(v0);
    d = v_reinterpret_as_s32(v1);
}

inline void v_mul_expand(const v_uint16x32& a, const v_uint16x32& b,
                         v_uint32x16& c, v_uint32x16& d)
{
    v_uint16x32 v0, v1;
    v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);

    c = v_reinterpret_as_u32(v0);
    d = v_reinterpret_as_u32(v1);
}

inline void v_mul_expand(const v_uint32x16& a, const v_uint32x16& b,
                         v_uint64x8& c, v_uint64x8& d)
{
    v_zip(v_uint64x8(_mm512_mul_epu32(a.val, b.val)),
          v_uint64x8(_mm512_mul_epu32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
}

inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b,
                         v_int64x8& c, v_int64x8& d)
{
    v_zip(v_int64x8(_mm512_mul_epi32(a.val, b.val)),
          v_int64x8(_mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
}
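
// v_mul_expand returns the full-width products: c holds the products of the
// lower-indexed lanes, d those of the higher-indexed lanes, each in a vector
// with lanes twice as wide. Illustrative sketch (not in the original header):
//
//   v_uint32x16 a = v512_setall_u32(0xFFFFFFFF), b = v512_setall_u32(2);
//   v_uint64x8 lo, hi;
//   v_mul_expand(a, b, lo, hi);   // every 64-bit lane holds 0x1FFFFFFFE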
#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
    inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
    { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
    inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
    { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
    inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
    { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
    inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
    { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpuvec v_shl(const _Tpuvec& a) \
    { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpsvec v_shl(const _Tpsvec& a) \
    { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpuvec v_shr(const _Tpuvec& a) \
    { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpsvec v_shr(const _Tpsvec& a) \
    { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); }

OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint16x32, v_int16x32, epi16)
OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint32x16, v_int32x16, epi32)
OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8,  v_int64x8,  epi64)
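
// Element-wise shifts: the operator<< / operator>> forms take a runtime count,
// while v_shl<imm>/v_shr<imm> take the count as a template parameter. Right
// shift is logical (srli) for unsigned types and arithmetic (srai) for signed.
// Illustrative sketch:
//
//   v_int32x16 x = v512_setall_s32(-16);
//   v_int32x16 y = v_shr<2>(x);      // arithmetic shift -> every lane is -4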
#define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \
    OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \
    OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \
    OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); }

OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64,   si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int8x64,    si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint16x32,  si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int16x32,   si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint32x16,  si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int32x16,   si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint64x8,   si512, _mm512_set1_epi64(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int64x8,    si512, _mm512_set1_epi64(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float32x16, ps,    _mm512_castsi512_ps(_mm512_set1_epi32(-1)))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float64x8,  pd,    _mm512_castsi512_pd(_mm512_set1_epi32(-1)))
#define OPENCV_HAL_IMPL_AVX512_SELECT(_Tpvec, suffix, zsuf) \
    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_mm512_mask_blend_##suffix(_mm512_cmp_##suffix##_mask(mask.val, _mm512_setzero_##zsuf(), _MM_CMPINT_EQ), a.val, b.val)); }

OPENCV_HAL_IMPL_AVX512_SELECT(v_uint8x64,   epi8,  si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_int8x64,    epi8,  si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_uint16x32,  epi16, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_int16x32,   epi16, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_uint32x16,  epi32, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_int32x16,   epi32, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_uint64x8,   epi64, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_int64x8,    epi64, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_float32x16, ps,    ps)
OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8,  pd,    pd)
#define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); }

#define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(<,  _MM_CMPINT_LT,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(>,  _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)

OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64,  epu8,  epi8,  (char)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64,   epi8,  epi8,  (char)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint16x32, epu16, epi16, (short)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int16x32,  epi16, epi16, (short)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint32x16, epu32, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int32x16,  epi32, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8,  epu64, epi64, (int64)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8,   epi64, epi64, (int64)-1)
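
// Integer comparisons are implemented with AVX-512 mask registers: the k-mask
// from _mm512_cmp_*_mask is expanded back into a vector whose lanes are all-ones
// (tval = -1 of the lane type) where the predicate holds and zero elsewhere, so
// the result can feed v_select directly. Illustrative sketch:
//
//   v_int32x16 a = v512_setall_s32(1), b = v512_setall_s32(2);
//   v_int32x16 m = (a < b);               // every lane is 0xFFFFFFFF
//   v_int32x16 r = v_select(m, a, b);     // picks a where the mask is set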
#define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); }

#define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(<,  _CMP_LT_OQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(>,  _CMP_GT_OQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ,  _Tpvec, sufcmp, sufset, tval)

OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8,  pd, epi64, (int64)-1)

inline v_float32x16 v_not_nan(const v_float32x16& a)
{ return v_float32x16(_mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.val, a.val, _CMP_ORD_Q), (int)-1))); }
inline v_float64x8 v_not_nan(const v_float64x8& a)
{ return v_float64x8(_mm512_castsi512_pd(_mm512_maskz_set1_epi64(_mm512_cmp_pd_mask(a.val, a.val, _CMP_ORD_Q), (int64)-1))); }
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint8x64,   _mm512_min_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint8x64,   _mm512_max_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int8x64,    _mm512_min_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int8x64,    _mm512_max_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint16x32,  _mm512_min_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint16x32,  _mm512_max_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int16x32,   _mm512_min_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int16x32,   _mm512_max_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint32x16,  _mm512_min_epu32)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint32x16,  _mm512_max_epu32)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int32x16,   _mm512_min_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int32x16,   _mm512_max_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint64x8,   _mm512_min_epu64)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint64x8,   _mm512_max_epu64)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int64x8,    _mm512_min_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int64x8,    _mm512_max_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float32x16, _mm512_min_ps)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float32x16, _mm512_max_ps)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float64x8,  _mm512_min_pd)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float64x8,  _mm512_max_pd)
template<bool prec, int imm4, bool part, int imm32>
struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
template<int imm4, int imm32>
struct _v_rotate_right<true, imm4, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
{
    return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32    ),  imm4   *8),
                                     _mm512_slli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 + 1), (4-imm4)*8)));
}};
template<int imm4>
struct _v_rotate_right<true, imm4, false, 15> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
{
    return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, 15),  imm4   *8),
                                     _mm512_slli_epi32(                    b.val,             (4-imm4)*8)));
}};
template<int imm4, int imm32>
struct _v_rotate_right<true, imm4, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
{
    return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16),  imm4   *8),
                                     _mm512_slli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 15), (4-imm4)*8)));
}};
template<int imm4>
struct _v_rotate_right<true, imm4, true, 31> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
{ return v_int8x64(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, 15), imm4*8)); }};
template<int imm32>
struct _v_rotate_right<false, 0, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
{ return v_int8x64(_mm512_alignr_epi32(b.val, a.val, imm32)); }};
template<>
struct _v_rotate_right<false, 0, false, 0> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64&) { return a; }};
template<int imm32>
struct _v_rotate_right<false, 0, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
{ return v_int8x64(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16)); }};
template<>
struct _v_rotate_right<false, 0, true, 16> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) { return b; }};
template<>
struct _v_rotate_right<false, 0, true, 32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
template<int imm> inline v_int8x64 v_rotate_right(const v_int8x64& a, const v_int8x64& b)
{
    return imm >= 128 ? v_int8x64() :
#if CV_AVX_512VBMI
    v_int8x64(_mm512_permutex2var_epi8(a.val,
        _v512_set_epu8(0x3f + imm, 0x3e + imm, 0x3d + imm, 0x3c + imm, 0x3b + imm, 0x3a + imm, 0x39 + imm, 0x38 + imm,
                       0x37 + imm, 0x36 + imm, 0x35 + imm, 0x34 + imm, 0x33 + imm, 0x32 + imm, 0x31 + imm, 0x30 + imm,
                       0x2f + imm, 0x2e + imm, 0x2d + imm, 0x2c + imm, 0x2b + imm, 0x2a + imm, 0x29 + imm, 0x28 + imm,
                       0x27 + imm, 0x26 + imm, 0x25 + imm, 0x24 + imm, 0x23 + imm, 0x22 + imm, 0x21 + imm, 0x20 + imm,
                       0x1f + imm, 0x1e + imm, 0x1d + imm, 0x1c + imm, 0x1b + imm, 0x1a + imm, 0x19 + imm, 0x18 + imm,
                       0x17 + imm, 0x16 + imm, 0x15 + imm, 0x14 + imm, 0x13 + imm, 0x12 + imm, 0x11 + imm, 0x10 + imm,
                       0x0f + imm, 0x0e + imm, 0x0d + imm, 0x0c + imm, 0x0b + imm, 0x0a + imm, 0x09 + imm, 0x08 + imm,
                       0x07 + imm, 0x06 + imm, 0x05 + imm, 0x04 + imm, 0x03 + imm, 0x02 + imm, 0x01 + imm, 0x00 + imm), b.val));
#else
    _v_rotate_right<imm%4!=0, imm%4, (imm/4 > 15), imm/4>::eval(a, b);
#endif
}
template<int imm>
inline v_int8x64 v_rotate_left(const v_int8x64& a, const v_int8x64& b)
{
    if (imm == 0) return a;
    if (imm == 64) return b;
    if (imm >= 128) return v_int8x64();
#if CV_AVX_512VBMI
    return v_int8x64(_mm512_permutex2var_epi8(b.val,
        _v512_set_epi8(0x7f - imm,0x7e - imm,0x7d - imm,0x7c - imm,0x7b - imm,0x7a - imm,0x79 - imm,0x78 - imm,
                       0x77 - imm,0x76 - imm,0x75 - imm,0x74 - imm,0x73 - imm,0x72 - imm,0x71 - imm,0x70 - imm,
                       0x6f - imm,0x6e - imm,0x6d - imm,0x6c - imm,0x6b - imm,0x6a - imm,0x69 - imm,0x68 - imm,
                       0x67 - imm,0x66 - imm,0x65 - imm,0x64 - imm,0x63 - imm,0x62 - imm,0x61 - imm,0x60 - imm,
                       0x5f - imm,0x5e - imm,0x5d - imm,0x5c - imm,0x5b - imm,0x5a - imm,0x59 - imm,0x58 - imm,
                       0x57 - imm,0x56 - imm,0x55 - imm,0x54 - imm,0x53 - imm,0x52 - imm,0x51 - imm,0x50 - imm,
                       0x4f - imm,0x4e - imm,0x4d - imm,0x4c - imm,0x4b - imm,0x4a - imm,0x49 - imm,0x48 - imm,
                       0x47 - imm,0x46 - imm,0x45 - imm,0x44 - imm,0x43 - imm,0x42 - imm,0x41 - imm,0x40 - imm), a.val));
#else
    return imm < 64 ? v_rotate_right<64 - imm>(b, a) : v_rotate_right<128 - imm>(v512_setzero_s8(), b);
#endif
}
template<int imm>
inline v_int8x64 v_rotate_right(const v_int8x64& a)
{
    if (imm == 0) return a;
    if (imm >= 64) return v_int8x64();
#if CV_AVX_512VBMI
    return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF >> imm,
        _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm,
                       0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm,
                       0x2f + imm,0x2e + imm,0x2d + imm,0x2c + imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm,
                       0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm,
                       0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm,
                       0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm,
                       0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm,
                       0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), a.val));
#else
    return v_rotate_right<imm>(a, v512_setzero_s8());
#endif
}
template<int imm>
inline v_int8x64 v_rotate_left(const v_int8x64& a)
{
    if (imm == 0) return a;
    if (imm >= 64) return v_int8x64();
#if CV_AVX_512VBMI
    return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF << imm,
        _v512_set_epi8(0x3f - imm,0x3e - imm,0x3d - imm,0x3c - imm,0x3b - imm,0x3a - imm,0x39 - imm,0x38 - imm,
                       0x37 - imm,0x36 - imm,0x35 - imm,0x34 - imm,0x33 - imm,0x32 - imm,0x31 - imm,0x30 - imm,
                       0x2f - imm,0x2e - imm,0x2d - imm,0x2c - imm,0x2b - imm,0x2a - imm,0x29 - imm,0x28 - imm,
                       0x27 - imm,0x26 - imm,0x25 - imm,0x24 - imm,0x23 - imm,0x22 - imm,0x21 - imm,0x20 - imm,
                       0x1f - imm,0x1e - imm,0x1d - imm,0x1c - imm,0x1b - imm,0x1a - imm,0x19 - imm,0x18 - imm,
                       0x17 - imm,0x16 - imm,0x15 - imm,0x14 - imm,0x13 - imm,0x12 - imm,0x11 - imm,0x10 - imm,
                       0x0f - imm,0x0e - imm,0x0d - imm,0x0c - imm,0x0b - imm,0x0a - imm,0x09 - imm,0x08 - imm,
                       0x07 - imm,0x06 - imm,0x05 - imm,0x04 - imm,0x03 - imm,0x02 - imm,0x01 - imm,0x00 - imm), a.val));
#else
    return v_rotate_right<64 - imm>(v512_setzero_s8(), a);
#endif
}
#define OPENCV_HAL_IMPL_AVX512_ROTATE_PM(_Tpvec, suffix) \
template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \
template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \
template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); } \
template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); }

#define OPENCV_HAL_IMPL_AVX512_ROTATE_EC(_Tpvec, suffix) \
template<int imm> \
inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \
    enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
    if (imm == 0) return a; \
    if (imm == _Tpvec::nlanes) return b; \
    if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
    return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << (imm))&MASK, a.val)); \
} \
template<int imm> \
inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \
    enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
    if (imm == 0) return a; \
    if (imm == _Tpvec::nlanes) return b; \
    if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
    return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << (imm))&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \
} \
template<int imm> \
inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    if (imm == 0) return a; \
    if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
    return _Tpvec(_mm512_maskz_expand_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
} \
template<int imm> \
inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    if (imm == 0) return a; \
    if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
    return _Tpvec(_mm512_maskz_compress_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
}

OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint8x64,   u8)
OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint16x32,  u16)
OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_int16x32,   s16)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint32x16,  epi32)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int32x16,   epi32)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint64x8,   epi64)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8,    epi64)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8,  pd)
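
// v_rotate_left/v_rotate_right shift whole lanes across the register (filling
// with zeros, or with lanes of the second argument in the two-vector forms).
// 8/16-bit types go through the byte implementation above; 32/64-bit and float
// types use the compress/expand mask trick. Illustrative sketch:
//
//   v_int32x16 a = v512_setall_s32(1), b = v512_setall_s32(2);
//   v_int32x16 r = v_rotate_right<4>(a, b);  // lanes 4..15 of a, then 4 lanes of b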
inline v_uint8x64 v_reverse(const v_uint8x64 &a)
{
#if CV_AVX_512VBMI
    static const __m512i perm = _mm512_set_epi32(
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f,
            0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f,
            0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
    return v_uint8x64(_mm512_permutexvar_epi8(perm, a.val));
#else
    static const __m512i shuf = _mm512_set_epi32(
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
    static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
    __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
    return v_uint8x64(_mm512_permutexvar_epi64(perm, vec));
#endif
}

inline v_int8x64 v_reverse(const v_int8x64 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x32 v_reverse(const v_uint16x32 &a)
{
#if CV_AVX_512VBMI
    static const __m512i perm = _mm512_set_epi32(
            0x00000001, 0x00020003, 0x00040005, 0x00060007,
            0x00080009, 0x000a000b, 0x000c000d, 0x000e000f,
            0x00100011, 0x00120013, 0x00140015, 0x00160017,
            0x00180019, 0x001a001b, 0x001c001d, 0x001e001f);
    return v_uint16x32(_mm512_permutexvar_epi16(perm, a.val));
#else
    static const __m512i shuf = _mm512_set_epi32(
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
    static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
    __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
    return v_uint16x32(_mm512_permutexvar_epi64(perm, vec));
#endif
}

inline v_int16x32 v_reverse(const v_int16x32 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x16 v_reverse(const v_uint32x16 &a)
{
    static const __m512i perm = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
}

inline v_int32x16 v_reverse(const v_int32x16 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x16 v_reverse(const v_float32x16 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x8 v_reverse(const v_uint64x8 &a)
{
    static const __m512i perm = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
    return v_uint64x8(_mm512_permutexvar_epi64(perm, a.val));
}

inline v_int64x8 v_reverse(const v_int64x8 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x8 v_reverse(const v_float64x8 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
#define OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64(a, b) a + b
#define OPENCV_HAL_IMPL_AVX512_REDUCE_8(sctype, func, _Tpvec, ifunc, scop) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
      sctype CV_DECL_ALIGNED(64) idx[2]; \
      _mm_store_si128((__m128i*)idx, _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1))); \
      return scop(idx[0], idx[1]); }
OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, min, v_uint64x8, min_epu64, min)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, max, v_uint64x8, max_epu64, max)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, sum, v_uint64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  min, v_int64x8,  min_epi64, min)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  max, v_int64x8,  max_epi64, max)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  sum, v_int64x8,  add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
#define OPENCV_HAL_IMPL_AVX512_REDUCE_8F(func, ifunc, scop) \
    inline double v_reduce_##func(const v_float64x8& a) \
    { __m256d half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
      double CV_DECL_ALIGNED(64) idx[2]; \
      _mm_store_pd(idx, _mm_##ifunc(_mm256_castpd256_pd128(half), _mm256_extractf128_pd(half, 1))); \
      return scop(idx[0], idx[1]); }
OPENCV_HAL_IMPL_AVX512_REDUCE_8F(min, min_pd, min)
OPENCV_HAL_IMPL_AVX512_REDUCE_8F(max, max_pd, max)
OPENCV_HAL_IMPL_AVX512_REDUCE_8F(sum, add_pd, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
#define OPENCV_HAL_IMPL_AVX512_REDUCE_16(sctype, func, _Tpvec, ifunc) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
      __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
      return (sctype)_mm_cvtsi128_si32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, min, v_uint32x16, min_epu32)
OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, max, v_uint32x16, max_epu32)
OPENCV_HAL_IMPL_AVX512_REDUCE_16(int,  min, v_int32x16,  min_epi32)
OPENCV_HAL_IMPL_AVX512_REDUCE_16(int,  max, v_int32x16,  max_epi32)
#define OPENCV_HAL_IMPL_AVX512_REDUCE_16F(func, ifunc) \
    inline float v_reduce_##func(const v_float32x16& a) \
    { __m256 half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
      __m128 quarter = _mm_##ifunc(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); \
      quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 3, 2))); \
      quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 0, 1))); \
      return _mm_cvtss_f32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_16F(min, min_ps)
OPENCV_HAL_IMPL_AVX512_REDUCE_16F(max, max_ps)
inline float v_reduce_sum(const v_float32x16& a)
{
    __m256 half = _mm256_add_ps(_v512_extract_low(a.val), _v512_extract_high(a.val));
    __m128 quarter = _mm_add_ps(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1));
    quarter = _mm_hadd_ps(quarter, quarter);
    return _mm_cvtss_f32(_mm_hadd_ps(quarter, quarter));
}

inline int v_reduce_sum(const v_int32x16& a)
{
    __m256i half = _mm256_add_epi32(_v512_extract_low(a.val), _v512_extract_high(a.val));
    __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
    quarter = _mm_hadd_epi32(quarter, quarter);
    return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter));
}
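
// The reduction helpers fold a 512-bit vector down through its 256- and 128-bit
// halves before the final scalar step. Illustrative sketch (not in the original
// header):
//
//   v_float32x16 v = v512_setall_f32(1.f);
//   float s = v_reduce_sum(v);    // 16.0f
//   float m = v_reduce_max(v);    // 1.0f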
#define OPENCV_HAL_IMPL_AVX512_REDUCE_32(sctype, func, _Tpvec, ifunc) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
      __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \
      return (sctype)_mm_cvtsi128_si32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, min, v_uint16x32, min_epu16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, max, v_uint16x32, max_epu16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(short,  min, v_int16x32,  min_epi16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(short,  max, v_int16x32,  max_epi16)
#define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
      __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 1)); \
      return (sctype)_mm_cvtsi128_si32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, min, v_uint8x64, min_epu8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, max, v_uint8x64, max_epu8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, min, v_int8x64, min_epi8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, max, v_int8x64, max_epi8)
#define OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(sctype, _Tpvec, suffix) \
    inline sctype v_reduce_sum(const _Tpvec& a) \
    { __m512i a16 = _mm512_add_epi16(_mm512_cvt##suffix##_epi16(_v512_extract_low(a.val)), \
                                     _mm512_cvt##suffix##_epi16(_v512_extract_high(a.val))); \
      a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(_v512_extract_low(a16), _v512_extract_high(a16))); \
      __m256i a8 = _mm256_add_epi32(_v512_extract_low(a16), _v512_extract_high(a16)); \
      __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1)); \
      a4 = _mm_hadd_epi32(a4, a4); \
      return (sctype)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4)); }
OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(uint, v_uint8x64, epu8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(int, v_int8x64, epi8)
inline v_float32x16 v_reduce_sum4(const v_float32x16& a, const v_float32x16& b,
                                  const v_float32x16& c, const v_float32x16& d)
{
    __m256 abl = _mm256_hadd_ps(_v512_extract_low(a.val), _v512_extract_low(b.val));
    __m256 abh = _mm256_hadd_ps(_v512_extract_high(a.val), _v512_extract_high(b.val));
    __m256 cdl = _mm256_hadd_ps(_v512_extract_low(c.val), _v512_extract_low(d.val));
    __m256 cdh = _mm256_hadd_ps(_v512_extract_high(c.val), _v512_extract_high(d.val));
    return v_float32x16(_v512_combine(_mm256_hadd_ps(abl, cdl), _mm256_hadd_ps(abh, cdh)));
}
inline unsigned v_reduce_sad(const v_uint8x64& a, const v_uint8x64& b)
{
    __m512i val = _mm512_sad_epu8(a.val, b.val);
    __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
    __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b)
{
    __m512i val = _mm512_set1_epi8(-128);
    val = _mm512_sad_epu8(_mm512_add_epi8(a.val, val), _mm512_add_epi8(b.val, val));
    __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
    __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b)
{ return v_reduce_sum(v_add_wrap(a - b, b - a)); }
inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b)
{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); }
inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b)
{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); }
inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b)
{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); }
inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b)
{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); }
inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b)
{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); }
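// Illustrative usage sketch (not part of the original header). Assuming the
// v512_load helper declared elsewhere in this file, a sum of absolute
// differences over two uchar buffers could be accumulated with the reducers
// above; example_sad64 is a hypothetical helper shown only for exposition:
//
//   inline unsigned example_sad64(const uchar* p, const uchar* q, int nblocks)
//   {
//       unsigned s = 0;
//       for (int i = 0; i < nblocks * 64; i += 64)
//           s += v_reduce_sad(v512_load(p + i), v512_load(q + i));
//       return s;
//   }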
inline v_uint8x64 v_popcount(const v_int8x64& a)
{
#if CV_AVX_512BITALG
    return v_uint8x64(_mm512_popcnt_epi8(a.val));
#elif CV_AVX_512VBMI
    __m512i _popcnt_table0 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
                                            4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
    __m512i _popcnt_table1 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
                                            6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
                                            6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1);
    return v_uint8x64(_mm512_sub_epi8(_mm512_permutex2var_epi8(_popcnt_table0, a.val, _popcnt_table1), _mm512_movm_epi8(_mm512_movepi8_mask(a.val))));
#else
    __m512i _popcnt_table = _mm512_set4_epi32(0x04030302, 0x03020201, 0x03020201, 0x02010100);
    __m512i _popcnt_mask = _mm512_set1_epi8(0x0F);

    return v_uint8x64(_mm512_add_epi8(_mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(                  a.val    , _popcnt_mask)),
                                      _mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(_mm512_srli_epi16(a.val, 4), _popcnt_mask))));
#endif
}
inline v_uint16x32 v_popcount(const v_int16x32& a)
{
#if CV_AVX_512BITALG
    return v_uint16x32(_mm512_popcnt_epi16(a.val));
#elif CV_AVX_512VPOPCNTDQ
    __m512i zero = _mm512_setzero_si512();
    return v_uint16x32(_mm512_packs_epi32(_mm512_popcnt_epi32(_mm512_unpacklo_epi16(a.val, zero)),
                                          _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero))));
#else
    v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
    p += v_rotate_right<1>(p);
    return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff);
#endif
}
inline v_uint32x16 v_popcount(const v_int32x16& a)
{
#if CV_AVX_512VPOPCNTDQ
    return v_uint32x16(_mm512_popcnt_epi32(a.val));
#else
    v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
    p += v_rotate_right<1>(p);
    p += v_rotate_right<2>(p);
    return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff);
#endif
}
inline v_uint64x8 v_popcount(const v_int64x8& a)
{
#if CV_AVX_512VPOPCNTDQ
    return v_uint64x8(_mm512_popcnt_epi64(a.val));
#else
    return v_uint64x8(_mm512_sad_epu8(v_popcount(v_reinterpret_as_s8(a)).val, _mm512_setzero_si512()));
#endif
}
inline v_uint8x64 v_popcount(const v_uint8x64& a) { return v_popcount(v_reinterpret_as_s8(a)); }
inline v_uint16x32 v_popcount(const v_uint16x32& a) { return v_popcount(v_reinterpret_as_s16(a)); }
inline v_uint32x16 v_popcount(const v_uint32x16& a) { return v_popcount(v_reinterpret_as_s32(a)); }
inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinterpret_as_s64(a)); }
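// Illustrative sketch (not part of the original header): counting set bits in
// one 64-byte block with the v_popcount overloads above; example_popcount_u8
// is a hypothetical helper shown only to demonstrate the API.
//
//   inline unsigned example_popcount_u8(const uchar* p)   // reads 64 bytes
//   {
//       v_uint8x64 v = v512_load(p);
//       return v_reduce_sum(v_popcount(v));
//   }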
#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \
    inline _Tpvec v_sqrt(const _Tpvec& x) \
    { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \
    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
    { return v_fma(a, a, b * b); } \
    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
    { return v_sqrt(v_fma(a, a, b * b)); }

OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd)
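// Illustrative sketch (not part of the original header): the macro above gives
// the float32/float64 types v_fma, v_muladd, v_sqrt, v_sqr_magnitude and
// v_magnitude. A hypothetical axpy-style update over 16 floats might read:
//
//   inline void example_axpy16(float* y, const float* x, float a)
//   {
//       v_float32x16 va = v512_setall_f32(a);
//       v_store(y, v_fma(va, v512_load(x), v512_load(y)));   // y = a*x + y
//   }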
inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
{ return a * b + c; }
inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
{ return v_fma(a, b, c); }
inline v_float32x16 v_invsqrt(const v_float32x16& x)
{
#if CV_AVX_512ER
    return v_float32x16(_mm512_rsqrt28_ps(x.val));
#else
    v_float32x16 half = x * v512_setall_f32(0.5);
    v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val));
    // Newton-Raphson refinement of the 14-bit estimate
    t *= v512_setall_f32(1.5) - ((t * t) * half);
    return t;
#endif
}
inline v_float64x8 v_invsqrt(const v_float64x8& x)
{
#if CV_AVX_512ER
    return v_float64x8(_mm512_rsqrt28_pd(x.val));
#else
    return v512_setall_f64(1.) / v_sqrt(x);
#endif
}
#define OPENCV_HAL_IMPL_AVX512_ABS(_Tpvec, _Tpuvec, suffix) \
    inline _Tpuvec v_abs(const _Tpvec& x) \
    { return _Tpuvec(_mm512_abs_##suffix(x.val)); }

OPENCV_HAL_IMPL_AVX512_ABS(v_int8x64, v_uint8x64, epi8)
OPENCV_HAL_IMPL_AVX512_ABS(v_int16x32, v_uint16x32, epi16)
OPENCV_HAL_IMPL_AVX512_ABS(v_int32x16, v_uint32x16, epi32)
OPENCV_HAL_IMPL_AVX512_ABS(v_int64x8, v_uint64x8, epi64)
inline v_float32x16 v_abs(const v_float32x16& x)
{
#ifdef _mm512_abs_pd
    return v_float32x16(_mm512_abs_ps(x.val));
#else
    return v_float32x16(_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x.val),
        _v512_set_epu64(0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF,
                        0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF))));
#endif
}
inline v_float64x8 v_abs(const v_float64x8& x)
{
#ifdef _mm512_abs_pd
    #if defined __GNUC__ && (__GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ <= 3) || (__GNUC__ == 8 && __GNUC_MINOR__ <= 2))
        return v_float64x8(_mm512_abs_pd(_mm512_castpd_ps(x.val)));
    #else
        return v_float64x8(_mm512_abs_pd(x.val));
    #endif
#else
    return v_float64x8(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(x.val),
        _v512_set_epu64(0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF,
                        0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF))));
#endif
}
inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b)
{ return v_max(a, b) - v_min(a, b); }
inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b)
{
    v_int8x64 d = v_sub_wrap(a, b);
    v_int8x64 m = a < b;
    return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b)
{
    v_int32x16 d = a - b;
    v_int32x16 m = a < b;
    return v_reinterpret_as_u32((d ^ m) - m);
}
inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b)
{ return v_abs(a - b); }
inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b)
{ return v_abs(a - b); }
inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b)
{
    v_int8x64 d = a - b;
    v_int8x64 m = a < b;
    return (d ^ m) - m;
}
inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b)
{ return v_max(a, b) - v_min(a, b); }
inline v_int32x16 v_round(const v_float32x16& a)
{ return v_int32x16(_mm512_cvtps_epi32(a.val)); }

inline v_int32x16 v_round(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(a.val))); }

inline v_int32x16 v_round(const v_float64x8& a, const v_float64x8& b)
{ return v_int32x16(_v512_combine(_mm512_cvtpd_epi32(a.val), _mm512_cvtpd_epi32(b.val))); }

inline v_int32x16 v_trunc(const v_float32x16& a)
{ return v_int32x16(_mm512_cvttps_epi32(a.val)); }

inline v_int32x16 v_trunc(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvttpd_epi32(a.val))); }

#if CVT_ROUND_MODES_IMPLEMENTED
inline v_int32x16 v_floor(const v_float32x16& a)
{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); }

inline v_int32x16 v_floor(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); }

inline v_int32x16 v_ceil(const v_float32x16& a)
{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }

inline v_int32x16 v_ceil(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); }
#else
inline v_int32x16 v_floor(const v_float32x16& a)
{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 1))); }

inline v_int32x16 v_floor(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 1)))); }

inline v_int32x16 v_ceil(const v_float32x16& a)
{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 2))); }

inline v_int32x16 v_ceil(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 2)))); }
#endif
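// Illustrative sketch (not part of the original header): v_round follows the
// current rounding mode (round-to-nearest-even by default), while v_floor,
// v_ceil and v_trunc force the direction. A hypothetical conversion helper:
//
//   inline void example_round16(int* dst, const float* src)
//   {
//       v_store(dst, v_round(v512_load(src)));   // 16 floats -> 16 ints
//   }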
inline v_float32x16 v_cvt_f32(const v_int32x16& a)
{ return v_float32x16(_mm512_cvtepi32_ps(a.val)); }

inline v_float32x16 v_cvt_f32(const v_float64x8& a)
{ return v_float32x16(_mm512_cvtpd_pslo(a.val)); }

inline v_float32x16 v_cvt_f32(const v_float64x8& a, const v_float64x8& b)
{ return v_float32x16(_v512_combine(_mm512_cvtpd_ps(a.val), _mm512_cvtpd_ps(b.val))); }

inline v_float64x8 v_cvt_f64(const v_int32x16& a)
{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_low(a.val))); }

inline v_float64x8 v_cvt_f64_high(const v_int32x16& a)
{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_high(a.val))); }

inline v_float64x8 v_cvt_f64(const v_float32x16& a)
{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_low(a.val))); }

inline v_float64x8 v_cvt_f64_high(const v_float32x16& a)
{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); }

inline v_float64x8 v_cvt_f64(const v_int64x8& v)
{
#if CV_AVX_512DQ
    return v_float64x8(_mm512_cvtepi64_pd(v.val));
#else
    // constants encoded as doubles
    __m512i magic_i_lo   = _mm512_set1_epi64(0x4330000000000000); // 2^52
    __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63
    __m512i magic_i_all  = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52
    __m512d magic_d_all  = _mm512_castsi512_pd(magic_i_all);

    // blend the 32 low bits of v with the low magic constant
    __m512i v_lo = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val);
    // extract the 32 high bits of v
    __m512i v_hi = _mm512_srli_epi64(v.val, 32);
    // flip the msb of v_hi and blend in the high magic constant
    v_hi = _mm512_xor_si512(v_hi, magic_i_hi32);
    // compute in double precision and recombine
    __m512d v_hi_dbl = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all);
    __m512d result = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo));
    return v_float64x8(result);
#endif
}
inline v_int8x64 v512_lut(const schar* tab, const int* idx)
{
    __m128i p0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 1));
    __m128i p1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
    __m128i p2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 2), (const int *)tab, 1));
    __m128i p3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 3), (const int *)tab, 1));
    return v_int8x64(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(p0), p1, 1), p2, 2), p3, 3));
}
inline v_int8x64 v512_lut_pairs(const schar* tab, const int* idx)
{
    __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 1));
    __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
    return v_int8x64(_v512_combine(p0, p1));
}
inline v_int8x64 v512_lut_quads(const schar* tab, const int* idx)
{
    return v_int8x64(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 1));
}
inline v_uint8x64 v512_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut((const schar *)tab, idx)); }
inline v_uint8x64 v512_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x64 v512_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_quads((const schar *)tab, idx)); }
inline v_int16x32 v512_lut(const short* tab, const int* idx)
{
    __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 2));
    __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 2));
    return v_int16x32(_v512_combine(p0, p1));
}
inline v_int16x32 v512_lut_pairs(const short* tab, const int* idx)
{
    return v_int16x32(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 2));
}
inline v_int16x32 v512_lut_quads(const short* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 2));
#else
    return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 2));
#endif
}
inline v_uint16x32 v512_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut((const short *)tab, idx)); }
inline v_uint16x32 v512_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_pairs((const short *)tab, idx)); }
inline v_uint16x32 v512_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_quads((const short *)tab, idx)); }
inline v_int32x16 v512_lut(const int* tab, const int* idx)
{
    return v_int32x16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
}
inline v_int32x16 v512_lut_pairs(const int* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 4));
#else
    return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 4));
#endif
}
inline v_int32x16 v512_lut_quads(const int* tab, const int* idx)
{
    return v_int32x16(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
                          _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
                          _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
                          _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
                          _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
}
inline v_uint32x16 v512_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut((const int *)tab, idx)); }
inline v_uint32x16 v512_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_pairs((const int *)tab, idx)); }
inline v_uint32x16 v512_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_quads((const int *)tab, idx)); }
inline v_int64x8 v512_lut(const int64* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 8));
#else
    return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), tab , 8));
#endif
}
inline v_int64x8 v512_lut_pairs(const int64* tab, const int* idx)
{
    return v_int64x8(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
                         _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
                         _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
                         _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
                         _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
}
inline v_uint64x8 v512_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut((const int64 *)tab, idx)); }
inline v_uint64x8 v512_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut_pairs((const int64 *)tab, idx)); }
inline v_float32x16 v512_lut(const float* tab, const int* idx)
{
    return v_float32x16(_mm512_i32gather_ps(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
}
inline v_float32x16 v512_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_pairs((const int *)tab, idx)); }
inline v_float32x16 v512_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_quads((const int *)tab, idx)); }

inline v_float64x8 v512_lut(const double* tab, const int* idx)
{
    return v_float64x8(_mm512_i32gather_pd(_mm256_loadu_si256((const __m256i*)idx), tab, 8));
}
inline v_float64x8 v512_lut_pairs(const double* tab, const int* idx)
{
    return v_float64x8(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_castpd128_pd512(
                           _mm_loadu_pd(tab + idx[0])),
                           _mm_loadu_pd(tab + idx[1]), 1),
                           _mm_loadu_pd(tab + idx[2]), 2),
                           _mm_loadu_pd(tab + idx[3]), 3));
}
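// Illustrative sketch (not part of the original header): the v512_lut family
// gathers single elements, pairs or quads from a table at integer indices.
// Hypothetical example gathering 16 floats:
//
//   inline v_float32x16 example_gather16(const float* tab, const int* idx16)
//   {
//       return v512_lut(tab, idx16);   // tab[idx16[0]], ..., tab[idx16[15]]
//   }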
inline v_int32x16 v_lut(const int* tab, const v_int32x16& idxvec)
{
    return v_int32x16(_mm512_i32gather_epi32(idxvec.val, tab, 4));
}

inline v_uint32x16 v_lut(const unsigned* tab, const v_int32x16& idxvec)
{
    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
}

inline v_float32x16 v_lut(const float* tab, const v_int32x16& idxvec)
{
    return v_float32x16(_mm512_i32gather_ps(idxvec.val, tab, 4));
}

inline v_float64x8 v_lut(const double* tab, const v_int32x16& idxvec)
{
    return v_float64x8(_mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8));
}

inline void v_lut_deinterleave(const float* tab, const v_int32x16& idxvec, v_float32x16& x, v_float32x16& y)
{
    x.val = _mm512_i32gather_ps(idxvec.val, tab, 4);
    y.val = _mm512_i32gather_ps(idxvec.val, &tab[1], 4);
}

inline void v_lut_deinterleave(const double* tab, const v_int32x16& idxvec, v_float64x8& x, v_float64x8& y)
{
    x.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8);
    y.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), &tab[1], 8);
}
inline v_int8x64 v_interleave_pairs(const v_int8x64& vec)
{
    return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0d0e0c, 0x0b090a08, 0x07050604, 0x03010200)));
}
inline v_uint8x64 v_interleave_pairs(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x64 v_interleave_quads(const v_int8x64& vec)
{
    return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0b0e0a, 0x0d090c08, 0x07030602, 0x05010400)));
}
inline v_uint8x64 v_interleave_quads(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }

inline v_int16x32 v_interleave_pairs(const v_int16x32& vec)
{
    return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100)));
}
inline v_uint16x32 v_interleave_pairs(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x32 v_interleave_quads(const v_int16x32& vec)
{
    return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0706, 0x0d0c0504, 0x0b0a0302, 0x09080100)));
}
inline v_uint16x32 v_interleave_quads(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }

inline v_int32x16 v_interleave_pairs(const v_int32x16& vec)
{
    return v_int32x16(_mm512_shuffle_epi32(vec.val, _MM_PERM_ACBD));
}
inline v_uint32x16 v_interleave_pairs(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x16 v_interleave_pairs(const v_float32x16& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_int8x64 v_pack_triplets(const v_int8x64& vec)
{
    return v_int8x64(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
                                                              0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000),
                     _mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0xffffff0f, 0x0e0d0c0a, 0x09080605, 0x04020100))));
}
inline v_uint8x64 v_pack_triplets(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x32 v_pack_triplets(const v_int16x32& vec)
{
    return v_int16x32(_mm512_permutexvar_epi16(_v512_set_epu64(0x001f001f001f001f, 0x001f001f001f001f, 0x001e001d001c001a, 0x0019001800160015,
                                                               0x0014001200110010, 0x000e000d000c000a, 0x0009000800060005, 0x0004000200010000), vec.val));
}
inline v_uint16x32 v_pack_triplets(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }

inline v_int32x16 v_pack_triplets(const v_int32x16& vec)
{
    return v_int32x16(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
                                                               0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
}
inline v_uint32x16 v_pack_triplets(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
{
    return v_float32x16(_mm512_permutexvar_ps(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
                                                              0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
}
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
{ return v_dotprod(a, b) + c; }

inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
{
    __m512i even = _mm512_mul_epi32(a.val, b.val);
    __m512i odd = _mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32));
    return v_int64x8(_mm512_add_epi64(even, odd));
}
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
{ return v_dotprod(a, b) + c; }
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
{
    __m512i even_a = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, _mm512_setzero_si512());
    __m512i odd_a  = _mm512_srli_epi16(a.val, 8);

    __m512i even_b = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, _mm512_setzero_si512());
    __m512i odd_b  = _mm512_srli_epi16(b.val, 8);

    __m512i prod0  = _mm512_madd_epi16(even_a, even_b);
    __m512i prod1  = _mm512_madd_epi16(odd_a, odd_b);
    return v_uint32x16(_mm512_add_epi32(prod0, prod1));
}
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
{
    __m512i even_a = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8);
    __m512i odd_a  = _mm512_srai_epi16(a.val, 8);

    __m512i even_b = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8);
    __m512i odd_b  = _mm512_srai_epi16(b.val, 8);

    __m512i prod0  = _mm512_madd_epi16(even_a, even_b);
    __m512i prod1  = _mm512_madd_epi16(odd_a, odd_b);
    return v_int32x16(_mm512_add_epi32(prod0, prod1));
}
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
{
    __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
    __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
    __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
    __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);

    __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
    __m512i p13 = _mm512_srli_epi64(mul0, 32);
    __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
    __m512i p57 = _mm512_srli_epi64(mul1, 32);

    __m512i p15_ = _mm512_add_epi64(p02, p13);
    __m512i p9d_ = _mm512_add_epi64(p46, p57);

    return v_uint64x8(_mm512_add_epi64(
        _mm512_unpacklo_epi64(p15_, p9d_),
        _mm512_unpackhi_epi64(p15_, p9d_)
    ));
}
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
{
    __m512i prod = _mm512_madd_epi16(a.val, b.val);
    __m512i even = _mm512_srai_epi64(_mm512_bslli_epi128(prod, 4), 32);
    __m512i odd  = _mm512_srai_epi64(prod, 32);
    return v_int64x8(_mm512_add_epi64(even, odd));
}
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b)
{ return v_dotprod(a, b); }
inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
{ return v_dotprod(a, b, c); }

inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b)
{ return v_dotprod(a, b); }
inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
{ return v_dotprod(a, b, c); }

inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
{ return v_dotprod_expand(a, b, c); }
inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
{ return v_dotprod_expand(a, b, c); }

inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b)
{
    __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
    __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
    __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
    __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);

    __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
    __m512i p13 = _mm512_srli_epi64(mul0, 32);
    __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
    __m512i p57 = _mm512_srli_epi64(mul1, 32);

    __m512i p15_ = _mm512_add_epi64(p02, p13);
    __m512i p9d_ = _mm512_add_epi64(p46, p57);
    return v_uint64x8(_mm512_add_epi64(p15_, p9d_));
}
inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
{ return v_dotprod_expand_fast(a, b) + c; }

inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
{ return v_dotprod_expand(a, b, c); }

inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
{ return v_dotprod_expand(a, b, c); }
#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
    v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))

inline v_float32x16 v_matmul(const v_float32x16& v,
                             const v_float32x16& m0, const v_float32x16& m1,
                             const v_float32x16& m2, const v_float32x16& m3)
{
    v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
    v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
    v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
    v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3);
    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
}

inline v_float32x16 v_matmuladd(const v_float32x16& v,
                                const v_float32x16& m0, const v_float32x16& m1,
                                const v_float32x16& m2, const v_float32x16& a)
{
    v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
    v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
    v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
}
#define OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                               const _Tpvec& a2, const _Tpvec& a3, \
                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
    { \
        __m512i t0 = cast_from(_mm512_unpacklo_##suffix(a0.val, a1.val)); \
        __m512i t1 = cast_from(_mm512_unpacklo_##suffix(a2.val, a3.val)); \
        __m512i t2 = cast_from(_mm512_unpackhi_##suffix(a0.val, a1.val)); \
        __m512i t3 = cast_from(_mm512_unpackhi_##suffix(a2.val, a3.val)); \
        b0.val = cast_to(_mm512_unpacklo_epi64(t0, t1)); \
        b1.val = cast_to(_mm512_unpackhi_epi64(t0, t1)); \
        b2.val = cast_to(_mm512_unpacklo_epi64(t2, t3)); \
        b3.val = cast_to(_mm512_unpackhi_epi64(t2, t3)); \
    }

OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_uint32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_int32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_float32x16, ps, _mm512_castps_si512, _mm512_castsi512_ps)
#define OPENCV_HAL_IMPL_AVX512_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
    { \
        b0.val = intrin(_v512_extract_low(a.val)); \
        b1.val = intrin(_v512_extract_high(a.val)); \
    } \
    inline _Tpwvec v_expand_low(const _Tpvec& a) \
    { return _Tpwvec(intrin(_v512_extract_low(a.val))); } \
    inline _Tpwvec v_expand_high(const _Tpvec& a) \
    { return _Tpwvec(intrin(_v512_extract_high(a.val))); } \
    inline _Tpwvec v512_load_expand(const _Tp* ptr) \
    { \
        __m256i a = _mm256_loadu_si256((const __m256i*)ptr); \
        return _Tpwvec(intrin(a)); \
    }

OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint8x64, v_uint16x32, uchar, _mm512_cvtepu8_epi16)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_int8x64, v_int16x32, schar, _mm512_cvtepi8_epi16)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint16x32, v_uint32x16, ushort, _mm512_cvtepu16_epi32)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_int16x32, v_int32x16, short, _mm512_cvtepi16_epi32)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint32x16, v_uint64x8, unsigned, _mm512_cvtepu32_epi64)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_int32x16, v_int64x8, int, _mm512_cvtepi32_epi64)

#define OPENCV_HAL_IMPL_AVX512_EXPAND_Q(_Tpvec, _Tp, intrin) \
    inline _Tpvec v512_load_expand_q(const _Tp* ptr) \
    { \
        __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
        return _Tpvec(intrin(a)); \
    }

OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_uint32x16, uchar, _mm512_cvtepu8_epi32)
OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16, schar, _mm512_cvtepi8_epi32)
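// Illustrative sketch (not part of the original header): v_expand widens one
// register into two, while v512_load_expand loads the narrow elements and
// widens them in one step. Hypothetical example:
//
//   inline void example_expand_u8(const uchar* p, v_uint16x32& lo, v_uint16x32& hi)
//   {
//       v_expand(v512_load(p), lo, hi);   // 64 uchars -> 2 x 32 ushorts
//   }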
inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b)
{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }

inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b)
{
    const __m512i t = _mm512_set1_epi16(255);
    return v_uint8x64(_v512_combine(_mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, t)), _mm512_cvtepi16_epi8(_mm512_min_epu16(b.val, t))));
}

inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b)
{
    return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val)));
}

inline void v_pack_store(schar* ptr, const v_int16x32& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(uchar* ptr, const v_uint16x32& a)
{
    const __m512i m = _mm512_set1_epi16(255);
    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, m)));
}

inline void v_pack_u_store(uchar* ptr, const v_int16x32& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
template<int n> inline
v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
{
    v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
    return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
                    v_reinterpret_as_s16((b + delta) >> n));
}

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
{
    v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
    v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
}

template<int n> inline
v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    return v_pack_u((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    v_pack_u_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    v_pack_store(ptr, (a + delta) >> n);
}
inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b)
{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); }

inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
{
    const __m512i m = _mm512_set1_epi32(65535);
    return v_uint16x32(_v512_combine(_mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)), _mm512_cvtepi32_epi16(_mm512_min_epu32(b.val, m))));
}

inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b)
{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }

inline void v_pack_store(short* ptr, const v_int32x16& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(ushort* ptr, const v_uint32x16& a)
{
    const __m512i m = _mm512_set1_epi32(65535);
    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)));
}

inline void v_pack_u_store(ushort* ptr, const v_int32x16& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
template<int n> inline
v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
{
    v_uint32x16 delta = v512_setall_u32(1 << (n-1));
    return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
                    v_reinterpret_as_s32((b + delta) >> n));
}

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
{
    v_uint32x16 delta = v512_setall_u32(1 << (n-1));
    v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
}

template<int n> inline
v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    return v_pack_u((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    v_pack_u_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x16& a)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}
inline v_uint32x16 v_pack(const v_uint64x8& a, const v_uint64x8& b)
{ return v_uint32x16(_v512_combine(_mm512_cvtepi64_epi32(a.val), _mm512_cvtepi64_epi32(b.val))); }

inline v_int32x16 v_pack(const v_int64x8& a, const v_int64x8& b)
{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }

inline void v_pack_store(unsigned* ptr, const v_uint64x8& a)
{ _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi64_epi32(a.val)); }

inline void v_pack_store(int* ptr, const v_int64x8& b)
{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
template<int n> inline
v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
{
    v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
{
    v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
{
    v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x8& a)
{
    v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}
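// Illustrative sketch (not part of the original header): v_rshr_pack<n> adds
// the rounding constant 1 << (n-1), shifts right by n and saturates to the
// narrower type -- the usual fixed-point downscale. Hypothetical use:
//
//   inline v_uint8x64 example_downscale(const v_uint16x32& a, const v_uint16x32& b)
//   {
//       return v_rshr_pack<8>(a, b);   // (x + 128) >> 8, saturated to uchar
//   }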
inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b)
{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }

inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
                           const v_uint32x16& c, const v_uint32x16& d)
{
    __m512i ab = _mm512_packs_epi32(a.val, b.val);
    __m512i cd = _mm512_packs_epi32(c.val, d.val);

    return v_uint8x64(_mm512_permutexvar_epi32(_v512_set_epu32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), _mm512_packs_epi16(ab, cd)));
}

inline v_uint8x64 v_pack_b(const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
                           const v_uint64x8& d, const v_uint64x8& e, const v_uint64x8& f,
                           const v_uint64x8& g, const v_uint64x8& h)
{
    __m512i ab = _mm512_packs_epi32(a.val, b.val);
    __m512i cd = _mm512_packs_epi32(c.val, d.val);
    __m512i ef = _mm512_packs_epi32(e.val, f.val);
    __m512i gh = _mm512_packs_epi32(g.val, h.val);

    __m512i abcd = _mm512_packs_epi32(ab, cd);
    __m512i efgh = _mm512_packs_epi32(ef, gh);

    return v_uint8x64(_mm512_permutexvar_epi16(_v512_set_epu16(31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4,
                                               27, 19, 11, 3, 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0), _mm512_packs_epi16(abcd, efgh)));
}
#define OPENCV_HAL_IMPL_AVX512_EXTRACT(_Tpvec) \
    template<int s> \
    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
    { return v_rotate_right<s>(a, b); }

OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint8x64)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int8x64)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint16x32)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int16x32)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint32x16)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int32x16)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint64x8)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8)
#define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \
    template<int i> inline _Tp v_extract_n(_Tpvec v) { return v_rotate_right<i>(v).get0(); }

OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double)
template<int i>
inline v_uint32x16 v_broadcast_element(const v_uint32x16& a)
{
    static const __m512i perm = _mm512_set1_epi32((char)i);
    return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
}

template<int i>
inline v_int32x16 v_broadcast_element(const v_int32x16& a)
{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }

template<int i>
inline v_float32x16 v_broadcast_element(const v_float32x16& a)
{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
#if CV_AVX_512VBMI
    __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
                                    94,  92,  90,  88,  86,  84,  82,  80,  78,  76,  74,  72,  70,  68, 66, 64,
                                    62,  60,  58,  56,  54,  52,  50,  48,  46,  44,  42,  40,  38,  36, 34, 32,
                                    30,  28,  26,  24,  22,  20,  18,  16,  14,  12,  10,   8,   6,   4,  2,  0);
    __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
                                    95,  93,  91,  89,  87,  85,  83,  81,  79,  77,  75,  73,  71,  69, 67, 65,
                                    63,  61,  59,  57,  55,  53,  51,  49,  47,  45,  43,  41,  39,  37, 35, 33,
                                    31,  29,  27,  25,  23,  21,  19,  17,  15,  13,  11,   9,   7,   5,  3,  1);
    a = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask0, ab1));
    b = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask1, ab1));
#else
    __m512i mask0 = _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200);
    __m512i a0b0 = _mm512_shuffle_epi8(ab0, mask0);
    __m512i a1b1 = _mm512_shuffle_epi8(ab1, mask0);
    __m512i mask1 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask2 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
    a = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask1, a1b1));
    b = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask2, a1b1));
#endif
}
inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10,  8,  6,  4,  2,  0);
    __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11,  9,  7,  5,  3,  1);
    a = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask0, ab1));
    b = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask1, ab1));
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
    a = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask0, ab1));
    b = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask1, ab1));
}
inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
    __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
    a = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask0, ab1));
    b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1));
}
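// Illustrative sketch (not part of the original header): the two-channel
// v_load_deinterleave overloads above split interleaved pairs (for instance a
// UV plane) into separate registers. Hypothetical example for 64 uchar pairs:
//
//   inline void example_split_uv(const uchar* uv, v_uint8x64& u, v_uint8x64& v)
//   {
//       v_load_deinterleave(uv, u, v);   // uv = u0 v0 u1 v1 ...
//   }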
inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));

#if CV_AVX_512VBMI2
    __m512i mask0 = _v512_set_epu8(126, 123, 120, 117, 114, 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81,
                                    78,  75,  72,  69,  66,  63,  60,  57,  54, 51, 48, 45, 42, 39, 36, 33,
                                    30,  27,  24,  21,  18,  15,  12,   9,   6,  3,  0, 62, 59, 56, 53, 50,
                                    47,  44,  41,  38,  35,  32,  29,  26,  23, 20, 17, 14, 11,  8,  5,  2);
    __m512i r0b01 = _mm512_permutex2var_epi8(bgr0, mask0, bgr1);
    __m512i b1g12 = _mm512_permutex2var_epi8(bgr1, mask0, bgr2);
    __m512i r12b2 = _mm512_permutex2var_epi8(bgr1,
                    _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101,  98,  95,  92,  89,  86,  83,  80,
                                    77,  74,  71,  68,  65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100,  97,
                                    94,  91,  88,  85,  82,  79,  76,  73,  70,  67,  64,  61,  58,  55,  52,  49,
                                    46,  43,  40,  37,  34,  31,  28,  25,  22,  19,  16,  13,  10,   7,   4,   1), bgr2);
    a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01));
    b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0));
    c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2));
#elif CV_AVX_512VBMI
    __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0);
    __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1);
    __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2);
    a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
                                                                    77,  74,  71,  68,  65,  63,  61,  60,  58, 57, 55, 54, 52, 51, 49, 48,
                                                                    46,  45,  43,  42,  40,  39,  37,  36,  34, 33, 31, 30, 28, 27, 25, 24,
                                                                    23,  21,  20,  18,  17,  15,  14,  12,  11,  9,  8,  6,  5,  3,  2,  0), bgr2));
    b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63,  61,  60,  58,  57,  55,  54,  52,  51,  49,  48,  46,  45,  43,  42,  40,
                                                                    39,  37,  36,  34,  33,  31,  30,  28,  27,  25,  24,  23,  21,  20,  18,  17,
                                                                    15,  14,  12,  11,   9,   8,   6,   5,   3,   2,   0, 126, 123, 120, 117, 114,
                                                                   111, 108, 105, 102,  99,  96,  93,  90,  87,  84,  81,  78,  75,  72,  69,  66), bgr0));
    c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63,  60,  57,  54,  51,  48,  45,  42,  39,  36,  33,  30,  27,  24,  21,  18,
                                                                    15,  12,   9,   6,   3,   0, 125, 122, 119, 116, 113, 110, 107, 104, 101,  98,
                                                                    95,  92,  89,  86,  83,  80,  77,  74,  71,  68,  65,  62,  59,  56,  53,  50,
                                                                    47,  44,  41,  38,  35,  32,  29,  26,  23,  20,  17,  14,  11,   8,   5,   2), bgr1));
#else
    __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
                                    45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12,  9,  6,  3,  0);
    __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
    __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
    __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);

    __m512i b0g0 = _mm512_mask_blend_epi32(0xf800, b01g1, r12b2);
    __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
                                                                   14, 11,  8,  5,  2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0);
    __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11);
    a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1));
    c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1));
    b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)));
#endif
}
inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));

    __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
                                    45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12,  9,  6,  3,  0);
    __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
    __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
    __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);

    a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2));
    b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
                                                                    14, 11,  8,  5,  2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0));
    c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11));
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));

    __m512i mask0 = _v512_set_epu32(29, 26, 23, 20, 17, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
    __m512i b01r1 = _mm512_permutex2var_epi32(bgr0, mask0, bgr1);
    __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2);
    __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0);

    a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2));
    b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11));
    c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0));
}
inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));

    __m512i mask0 = _v512_set_epu64(13, 10, 15, 12, 9, 6, 3, 0);
    __m512i b01g1 = _mm512_permutex2var_epi64(bgr0, mask0, bgr1);
    __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2);
    __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0);

    a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2));
    c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6));
    b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0));
}
inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 192));

#if CV_AVX_512VBMI
    __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
                                    94,  92,  90,  88,  86,  84,  82,  80,  78,  76,  74,  72,  70,  68, 66, 64,
                                    62,  60,  58,  56,  54,  52,  50,  48,  46,  44,  42,  40,  38,  36, 34, 32,
                                    30,  28,  26,  24,  22,  20,  18,  16,  14,  12,  10,   8,   6,   4,  2,  0);
    __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
                                    95,  93,  91,  89,  87,  85,  83,  81,  79,  77,  75,  73,  71,  69, 67, 65,
                                    63,  61,  59,  57,  55,  53,  51,  49,  47,  45,  43,  41,  39,  37, 35, 33,
                                    31,  29,  27,  25,  23,  21,  19,  17,  15,  13,  11,   9,   7,   5,  3,  1);

    __m512i br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi8(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3);

    a = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23));
    c = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23));
    b = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23));
    d = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23));
#else
    __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
    __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask);
    __m512i b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask);
    __m512i b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask);
    __m512i b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask);

    __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);

    __m512i br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1);
    __m512i ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1);
    __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3);
    __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3);

    a = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23));
    c = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23));
    b = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23));
    d = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23));
#endif
}
inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 96));

    __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10,  8,  6,  4,  2,  0);
    __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11,  9,  7,  5,  3,  1);

    __m512i br01 = _mm512_permutex2var_epi16(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi16(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3);

    a = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23));
    c = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23));
    b = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23));
    d = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23));
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 48));

    __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);

    __m512i br01 = _mm512_permutex2var_epi32(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi32(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3);

    a = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23));
    c = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23));
    b = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23));
    d = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23));
}
inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 24));

    __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);

    __m512i br01 = _mm512_permutex2var_epi64(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi64(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3);

    a = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23));
    c = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23));
    b = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23));
    d = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23));
}
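
// Illustrative usage (editor's sketch, not part of the universal intrinsics API): split one
// 512-bit block of interleaved BGRA bytes into channel planes. The pointer names and the
// assumption that 256 interleaved / 64 planar bytes are accessible are for this example only.
inline void v512_example_split_bgra(const uchar* bgra, uchar* bp, uchar* gp, uchar* rp, uchar* ap)
{
    v_uint8x64 b, g, r, a;
    v_load_deinterleave(bgra, b, g, r, a);   // reads 256 bytes laid out as B,G,R,A,B,G,R,A,...
    v_store(bp, b);                          // each plane receives 64 consecutive channel bytes
    v_store(gp, g);
    v_store(rp, r);
    v_store(ap, a);
}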
inline void v_store_interleave( uchar* ptr, const v_uint8x64& x, const v_uint8x64& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint8x64 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 64), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 64), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 64), high.val);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x32& x, const v_uint16x32& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint16x32 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 32), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 32), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 32), high.val);
    }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x16& x, const v_uint32x16& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint32x16 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 16), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 16), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 16), high.val);
    }
}
inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64x8& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint64x8 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 8), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 8), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 8), high.val);
    }
}
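
// Illustrative usage (editor's sketch): interleave two planes of 32 ushort values into one
// x,y,x,y,... buffer of 64 elements, requesting the non-temporal path; this assumes the
// destination is 64-byte aligned and will not be re-read soon. Pointer names are hypothetical.
inline void v512_example_interleave_xy(const ushort* xs, const ushort* ys, ushort* dst)
{
    v_uint16x32 x = v512_load(xs);
    v_uint16x32 y = v512_load(ys);
    v_store_interleave(dst, x, y, hal::STORE_ALIGNED_NOCACHE);  // streams 128 bytes past the cache
}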
inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
#if CV_AVX_512VBMI
    // Single-pass byte interleave using VBMI two-source byte permutes (vpermt2b).
    __m512i mask0 = _v512_set_epu8(127, 84, 20, 126, 83, 19, 125, 82, 18, 124, 81, 17, 123, 80, 16, 122,
                                    79, 15, 121, 78, 14, 120, 77, 13, 119, 76, 12, 118, 75, 11, 117, 74,
                                    10, 116, 73,  9, 115, 72,  8, 114, 71,  7, 113, 70,  6, 112, 69,  5,
                                   111, 68,  4, 110, 67,  3, 109, 66,  2, 108, 65,  1, 107, 64,  0, 106);
    __m512i mask1 = _v512_set_epu8( 21, 42, 105, 20, 41, 104, 19, 40, 103, 18, 39, 102, 17, 38, 101, 16,
                                    37, 100, 15, 36, 99, 14, 35, 98, 13, 34, 97, 12, 33, 96, 11, 32,
                                    95, 10, 31, 94,  9, 30, 93,  8, 29, 92,  7, 28, 91,  6, 27, 90,
                                     5, 26, 89,  4, 25, 88,  3, 24, 87,  2, 23, 86,  1, 22, 85,  0);
    __m512i mask2 = _v512_set_epu8(106, 127, 63, 105, 126, 62, 104, 125, 61, 103, 124, 60, 102, 123, 59, 101,
                                   122, 58, 100, 121, 57, 99, 120, 56, 98, 119, 55, 97, 118, 54, 96, 117,
                                    53, 95, 116, 52, 94, 115, 51, 93, 114, 50, 92, 113, 49, 91, 112, 48,
                                    90, 111, 47, 89, 110, 46, 88, 109, 45, 87, 108, 44, 86, 107, 43, 85);
    __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val);
    __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val);
    __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val);

    __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1);
    __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2);
    __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0);
#else
    // Fallback without VBMI: blend bytes pairwise, then finish with 16-bit cross-lane permutes.
    __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001));
    __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0);
    __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val);
    __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val);

    __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
                                     5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
    __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
                                    47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
    __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
                                    26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
    __m512i b0g0b2 = _mm512_permutex2var_epi16(b0g0, mask0, r0b1);
    __m512i r1b1r0 = _mm512_permutex2var_epi16(b0g0, mask1, g1r1);
    __m512i g2r2g1 = _mm512_permutex2var_epi16(r0b1, mask2, g1r1);

    __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
    __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
    __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
#endif

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 128), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 64), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 128), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 128), bgr2);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
                                     5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
    __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
                                    47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
    __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
                                    26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
    __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val);
    __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val);
    __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val);

    __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
    __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
    __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 32), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 64), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgr2);
    }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21);
    __m512i mask1 = _v512_set_epu32(31, 10, 25, 30, 9, 24, 29, 8, 23, 28, 7, 22, 27, 6, 21, 26);
    __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val);
    __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val);

    __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val);
    __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2);
    __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 16), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 32), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgr2);
    }
}
inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m512i mask0 = _v512_set_epu64( 5, 12, 7, 4, 11, 6, 3, 10);
    __m512i mask1 = _v512_set_epu64(15, 7, 4, 14, 6, 3, 13, 5);
    __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val);
    __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val);

    __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val);
    __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2);
    __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 8), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 8), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 16), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 8), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgr2);
    }
}
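
// Illustrative usage (editor's sketch): pack three 32-bit channel planes into an interleaved
// B,G,R,B,G,R,... buffer of 48 unsigned values, letting the default unaligned store path run.
// Pointer names are assumptions for the example.
inline void v512_example_pack_bgr32(const unsigned* bp, const unsigned* gp, const unsigned* rp, unsigned* bgr)
{
    v_store_interleave(bgr, v512_load(bp), v512_load(gp), v512_load(rp));
}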
inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b,
                                const v_uint8x64& c, const v_uint8x64& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint8x64 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint8x64 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 128), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 192), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 64), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 128), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 192), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 128), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 192), bgra3.val);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b,
                                const v_uint16x32& c, const v_uint16x32& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint16x32 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint16x32 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 96), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 32), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 64), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 96), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 96), bgra3.val);
    }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b,
                                const v_uint32x16& c, const v_uint32x16& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint32x16 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint32x16 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 48), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 16), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 32), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 48), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 48), bgra3.val);
    }
}
inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b,
                                const v_uint64x8& c, const v_uint64x8& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint64x8 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint64x8 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 8), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 24), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 8), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 16), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 24), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 8), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 24), bgra3.val);
    }
}
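
// Illustrative note (editor's sketch): the 4-channel stores above are just two v_zip passes,
// so a deinterleave followed by an interleave of the same registers is a byte-exact round
// trip; src and dst are assumed to cover 256 accessible bytes each.
inline void v512_example_copy_bgra(const uchar* src, uchar* dst)
{
    v_uint8x64 b, g, r, a;
    v_load_deinterleave(src, b, g, r, a);
    v_store_interleave(dst, b, g, r, a);   // reproduces the original interleaved layout
}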
#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
    _Tpvec1 a1, b1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
    _Tpvec1 a1, b1, c1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
    _Tpvec1 a1, b1, c1, d1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, const _Tpvec0& d0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}

OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int8x64, schar, s8, v_uint8x64, uchar, u8)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int16x32, short, s16, v_uint16x32, ushort, u16)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int32x16, int, s32, v_uint32x16, unsigned, u32)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float32x16, float, f32, v_uint32x16, unsigned, u32)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int64x8, int64, s64, v_uint64x8, uint64, u64)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8, uint64, u64)
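
// Illustrative usage (editor's sketch): the wrapper macro above routes signed and floating
// element types through the unsigned kernels via v_reinterpret_as_*, so the signed call below
// compiles exactly like its uchar counterpart. Pointer and function names are hypothetical.
inline void v512_example_split_s16(const short* interleaved, v_int16x32& x, v_int16x32& y)
{
    v_load_deinterleave(interleaved, x, y);  // expands from OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE
}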
inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_movepi8_mask(a.val); }
inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
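
// Illustrative usage (editor's sketch): v_signmask packs one bit per lane (the sign bit), so
// counting negative lanes of a v_int32x16 reduces to a population count of a 16-bit mask.
inline int v512_example_count_negative(const v_int32x16& a)
{
    return _mm_popcnt_u32((unsigned)v_signmask(a));  // at most 16 bits can be set
}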
inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); }
inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int32x16& a) { return (bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline bool v_check_all(const v_int64x8& a) { return !(bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int64x8& a) { return (bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
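
// Illustrative usage (editor's sketch): the checks read only the per-lane sign bit, which is
// how universal-intrinsic comparison results encode "true", so they can also be applied
// directly to raw signed lanes as done here. Function names are hypothetical.
inline bool v512_example_all_negative(const v_int16x32& a) { return v_check_all(a); }
inline bool v512_example_any_negative(const v_int16x32& a) { return v_check_any(a); }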
inline int v_scan_forward(const v_int8x64& a)
{
    int64 mask = _mm512_movepi8_mask(a.val);
    int mask32 = (int)mask;
    return mask != 0 ? mask32 != 0 ? trailingZeros32(mask32) : 32 + trailingZeros32((int)(mask >> 32)) : 0;
}
inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
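
// Illustrative usage (editor's sketch): v_scan_forward reports the index of the first lane
// whose sign bit is set; guard it with v_check_any when the mask may be empty.
inline int v512_example_first_negative_lane(const v_int32x16& a)
{
    return v_check_any(a) ? v_scan_forward(a) : -1;  // -1 when no lane is negative
}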
inline void v512_cleanup() { _mm256_zeroall(); }

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END