#ifndef OPENCV_HAL_INTRIN_AVX512_HPP
#define OPENCV_HAL_INTRIN_AVX512_HPP

#if defined(_MSC_VER) && (_MSC_VER < 1920)
# pragma warning(disable:4146)
# pragma warning(disable:4309)
# pragma warning(disable:4310)
#endif

#define CVT_ROUND_MODES_IMPLEMENTED 0

#define CV_SIMD512_64F 1
#define CV_SIMD512_FP16 0
#define _v512_set_epu64(a7, a6, a5, a4, a3, a2, a1, a0) _mm512_set_epi64((int64)(a7),(int64)(a6),(int64)(a5),(int64)(a4),(int64)(a3),(int64)(a2),(int64)(a1),(int64)(a0))
#define _v512_set_epu32(a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
        _mm512_set_epi64(((int64)(a15)<<32)|(int64)(a14), ((int64)(a13)<<32)|(int64)(a12), ((int64)(a11)<<32)|(int64)(a10), ((int64)( a9)<<32)|(int64)( a8), \
                         ((int64)( a7)<<32)|(int64)( a6), ((int64)( a5)<<32)|(int64)( a4), ((int64)( a3)<<32)|(int64)( a2), ((int64)( a1)<<32)|(int64)( a0))
#define _v512_set_epu16(a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
                        a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
        _v512_set_epu32(((unsigned)(a31)<<16)|(unsigned)(a30), ((unsigned)(a29)<<16)|(unsigned)(a28), ((unsigned)(a27)<<16)|(unsigned)(a26), ((unsigned)(a25)<<16)|(unsigned)(a24), \
                        ((unsigned)(a23)<<16)|(unsigned)(a22), ((unsigned)(a21)<<16)|(unsigned)(a20), ((unsigned)(a19)<<16)|(unsigned)(a18), ((unsigned)(a17)<<16)|(unsigned)(a16), \
                        ((unsigned)(a15)<<16)|(unsigned)(a14), ((unsigned)(a13)<<16)|(unsigned)(a12), ((unsigned)(a11)<<16)|(unsigned)(a10), ((unsigned)( a9)<<16)|(unsigned)( a8), \
                        ((unsigned)( a7)<<16)|(unsigned)( a6), ((unsigned)( a5)<<16)|(unsigned)( a4), ((unsigned)( a3)<<16)|(unsigned)( a2), ((unsigned)( a1)<<16)|(unsigned)( a0))
#define _v512_set_epu8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
                       a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
                       a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
                       a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
        _v512_set_epu32(((unsigned)(a63)<<24)|((unsigned)(a62)<<16)|((unsigned)(a61)<<8)|(unsigned)(a60),((unsigned)(a59)<<24)|((unsigned)(a58)<<16)|((unsigned)(a57)<<8)|(unsigned)(a56), \
                        ((unsigned)(a55)<<24)|((unsigned)(a54)<<16)|((unsigned)(a53)<<8)|(unsigned)(a52),((unsigned)(a51)<<24)|((unsigned)(a50)<<16)|((unsigned)(a49)<<8)|(unsigned)(a48), \
                        ((unsigned)(a47)<<24)|((unsigned)(a46)<<16)|((unsigned)(a45)<<8)|(unsigned)(a44),((unsigned)(a43)<<24)|((unsigned)(a42)<<16)|((unsigned)(a41)<<8)|(unsigned)(a40), \
                        ((unsigned)(a39)<<24)|((unsigned)(a38)<<16)|((unsigned)(a37)<<8)|(unsigned)(a36),((unsigned)(a35)<<24)|((unsigned)(a34)<<16)|((unsigned)(a33)<<8)|(unsigned)(a32), \
                        ((unsigned)(a31)<<24)|((unsigned)(a30)<<16)|((unsigned)(a29)<<8)|(unsigned)(a28),((unsigned)(a27)<<24)|((unsigned)(a26)<<16)|((unsigned)(a25)<<8)|(unsigned)(a24), \
                        ((unsigned)(a23)<<24)|((unsigned)(a22)<<16)|((unsigned)(a21)<<8)|(unsigned)(a20),((unsigned)(a19)<<24)|((unsigned)(a18)<<16)|((unsigned)(a17)<<8)|(unsigned)(a16), \
                        ((unsigned)(a15)<<24)|((unsigned)(a14)<<16)|((unsigned)(a13)<<8)|(unsigned)(a12),((unsigned)(a11)<<24)|((unsigned)(a10)<<16)|((unsigned)( a9)<<8)|(unsigned)( a8), \
                        ((unsigned)( a7)<<24)|((unsigned)( a6)<<16)|((unsigned)( a5)<<8)|(unsigned)( a4),((unsigned)( a3)<<24)|((unsigned)( a2)<<16)|((unsigned)( a1)<<8)|(unsigned)( a0))
#define _v512_set_epi8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
                       a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
                       a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
                       a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
        _v512_set_epu8((uchar)(a63), (uchar)(a62), (uchar)(a61), (uchar)(a60), (uchar)(a59), (uchar)(a58), (uchar)(a57), (uchar)(a56), \
                       (uchar)(a55), (uchar)(a54), (uchar)(a53), (uchar)(a52), (uchar)(a51), (uchar)(a50), (uchar)(a49), (uchar)(a48), \
                       (uchar)(a47), (uchar)(a46), (uchar)(a45), (uchar)(a44), (uchar)(a43), (uchar)(a42), (uchar)(a41), (uchar)(a40), \
                       (uchar)(a39), (uchar)(a38), (uchar)(a37), (uchar)(a36), (uchar)(a35), (uchar)(a34), (uchar)(a33), (uchar)(a32), \
                       (uchar)(a31), (uchar)(a30), (uchar)(a29), (uchar)(a28), (uchar)(a27), (uchar)(a26), (uchar)(a25), (uchar)(a24), \
                       (uchar)(a23), (uchar)(a22), (uchar)(a21), (uchar)(a20), (uchar)(a19), (uchar)(a18), (uchar)(a17), (uchar)(a16), \
                       (uchar)(a15), (uchar)(a14), (uchar)(a13), (uchar)(a12), (uchar)(a11), (uchar)(a10), (uchar)( a9), (uchar)( a8), \
                       (uchar)( a7), (uchar)( a6), (uchar)( a5), (uchar)( a4), (uchar)( a3), (uchar)( a2), (uchar)( a1), (uchar)( a0))
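
// Note: the _v512_set_* helpers above take their arguments most-significant-lane
// first (a63..a0 / a7..a0), mirroring the _mm512_set_* intrinsics, so lane 0 of the
// result is the last argument. An illustrative sketch (not part of the original header):
//
//   __m512i ramp64 = _v512_set_epu64(7, 6, 5, 4, 3, 2, 1, 0);   // 64-bit lane i holds value i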
#ifndef _mm512_cvtpd_pslo
#ifdef _mm512_zextsi256_si512
#define _mm512_cvtpd_pslo(a) _mm512_zextps256_ps512(_mm512_cvtpd_ps(a))
#else
#define _mm512_cvtpd_pslo(a) _mm512_castps256_ps512(_mm512_cvtpd_ps(a))
#endif
#endif
inline __m512i _v512_combine(const __m256i& lo, const __m256i& hi)
{ return _mm512_inserti32x8(_mm512_castsi256_si512(lo), hi, 1); }

inline __m512 _v512_combine(const __m256& lo, const __m256& hi)
{ return _mm512_insertf32x8(_mm512_castps256_ps512(lo), hi, 1); }

inline __m512d _v512_combine(const __m256d& lo, const __m256d& hi)
{ return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1); }

inline int _v_cvtsi512_si32(const __m512i& a)
{ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); }

inline __m256i _v512_extract_high(const __m512i& v)
{ return _mm512_extracti32x8_epi32(v, 1); }

inline __m256 _v512_extract_high(const __m512& v)
{ return _mm512_extractf32x8_ps(v, 1); }

inline __m256d _v512_extract_high(const __m512d& v)
{ return _mm512_extractf64x4_pd(v, 1); }

inline __m256i _v512_extract_low(const __m512i& v)
{ return _mm512_castsi512_si256(v); }

inline __m256 _v512_extract_low(const __m512& v)
{ return _mm512_castps512_ps256(v); }

inline __m256d _v512_extract_low(const __m512d& v)
{ return _mm512_castpd512_pd256(v); }

inline __m512i _v512_insert(const __m512i& a, const __m256i& b)
{ return _mm512_inserti32x8(a, b, 0); }

inline __m512 _v512_insert(const __m512& a, const __m256& b)
{ return _mm512_insertf32x8(a, b, 0); }

inline __m512d _v512_insert(const __m512d& a, const __m256d& b)
{ return _mm512_insertf64x4(a, b, 0); }
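
// The helpers above split a 512-bit register into its 256-bit halves and back:
// _v512_extract_low/_v512_extract_high return the low/high half, _v512_combine
// concatenates two halves, and _v512_insert overwrites the low half.
// A minimal usage sketch (illustrative only, not in the original header):
//
//   __m512i v  = _mm512_set1_epi32(7);
//   __m256i lo = _v512_extract_low(v);      // lanes 0..7
//   __m256i hi = _v512_extract_high(v);     // lanes 8..15
//   __m512i rt = _v512_combine(lo, hi);     // round-trips back to v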
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
struct v_uint8x64
{
    typedef uchar lane_type;
    enum { nlanes = 64 };
    __m512i val;

    explicit v_uint8x64(__m512i v) : val(v) {}
    v_uint8x64(uchar v0, uchar v1, uchar v2, uchar v3,
               uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11,
               uchar v12, uchar v13, uchar v14, uchar v15,
               uchar v16, uchar v17, uchar v18, uchar v19,
               uchar v20, uchar v21, uchar v22, uchar v23,
               uchar v24, uchar v25, uchar v26, uchar v27,
               uchar v28, uchar v29, uchar v30, uchar v31,
               uchar v32, uchar v33, uchar v34, uchar v35,
               uchar v36, uchar v37, uchar v38, uchar v39,
               uchar v40, uchar v41, uchar v42, uchar v43,
               uchar v44, uchar v45, uchar v46, uchar v47,
               uchar v48, uchar v49, uchar v50, uchar v51,
               uchar v52, uchar v53, uchar v54, uchar v55,
               uchar v56, uchar v57, uchar v58, uchar v59,
               uchar v60, uchar v61, uchar v62, uchar v63)
    {
        val = _v512_set_epu8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
                             v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
                             v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
                             v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
    }
    v_uint8x64() {}

    static inline v_uint8x64 zero() { return v_uint8x64(_mm512_setzero_si512()); }

    uchar get0() const { return (uchar)_v_cvtsi512_si32(val); }
};
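
// Usage sketch for the 64-lane unsigned 8-bit vector defined above (illustrative
// only; it assumes the rest of the universal-intrinsics API declared later in
// this header):
//
//   v_uint8x64 z = v_uint8x64::zero();   // all 64 lanes are 0
//   uchar first  = z.get0();             // reads lane 0 -> 0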
struct v_int8x64
{
    typedef schar lane_type;
    enum { nlanes = 64 };
    __m512i val;

    explicit v_int8x64(__m512i v) : val(v) {}
    v_int8x64(schar v0, schar v1, schar v2, schar v3,
              schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11,
              schar v12, schar v13, schar v14, schar v15,
              schar v16, schar v17, schar v18, schar v19,
              schar v20, schar v21, schar v22, schar v23,
              schar v24, schar v25, schar v26, schar v27,
              schar v28, schar v29, schar v30, schar v31,
              schar v32, schar v33, schar v34, schar v35,
              schar v36, schar v37, schar v38, schar v39,
              schar v40, schar v41, schar v42, schar v43,
              schar v44, schar v45, schar v46, schar v47,
              schar v48, schar v49, schar v50, schar v51,
              schar v52, schar v53, schar v54, schar v55,
              schar v56, schar v57, schar v58, schar v59,
              schar v60, schar v61, schar v62, schar v63)
    {
        val = _v512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
                             v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
                             v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
                             v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
    }
    v_int8x64() {}

    static inline v_int8x64 zero() { return v_int8x64(_mm512_setzero_si512()); }

    schar get0() const { return (schar)_v_cvtsi512_si32(val); }
};
struct v_uint16x32
{
    typedef ushort lane_type;
    enum { nlanes = 32 };
    __m512i val;

    explicit v_uint16x32(__m512i v) : val(v) {}
    v_uint16x32(ushort v0, ushort v1, ushort v2, ushort v3,
                ushort v4, ushort v5, ushort v6, ushort v7,
                ushort v8, ushort v9, ushort v10, ushort v11,
                ushort v12, ushort v13, ushort v14, ushort v15,
                ushort v16, ushort v17, ushort v18, ushort v19,
                ushort v20, ushort v21, ushort v22, ushort v23,
                ushort v24, ushort v25, ushort v26, ushort v27,
                ushort v28, ushort v29, ushort v30, ushort v31)
    {
        val = _v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
                              v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
    }
    v_uint16x32() {}

    static inline v_uint16x32 zero() { return v_uint16x32(_mm512_setzero_si512()); }

    ushort get0() const { return (ushort)_v_cvtsi512_si32(val); }
};
struct v_int16x32
{
    typedef short lane_type;
    enum { nlanes = 32 };
    __m512i val;

    explicit v_int16x32(__m512i v) : val(v) {}
    v_int16x32(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7,
               short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15,
               short v16, short v17, short v18, short v19, short v20, short v21, short v22, short v23,
               short v24, short v25, short v26, short v27, short v28, short v29, short v30, short v31)
    {
        val = _v512_set_epu16((ushort)v31, (ushort)v30, (ushort)v29, (ushort)v28, (ushort)v27, (ushort)v26, (ushort)v25, (ushort)v24,
                              (ushort)v23, (ushort)v22, (ushort)v21, (ushort)v20, (ushort)v19, (ushort)v18, (ushort)v17, (ushort)v16,
                              (ushort)v15, (ushort)v14, (ushort)v13, (ushort)v12, (ushort)v11, (ushort)v10, (ushort)v9 , (ushort)v8,
                              (ushort)v7 , (ushort)v6 , (ushort)v5 , (ushort)v4 , (ushort)v3 , (ushort)v2 , (ushort)v1 , (ushort)v0);
    }
    v_int16x32() {}

    static inline v_int16x32 zero() { return v_int16x32(_mm512_setzero_si512()); }

    short get0() const { return (short)_v_cvtsi512_si32(val); }
};
struct v_uint32x16
{
    typedef unsigned lane_type;
    enum { nlanes = 16 };
    __m512i val;

    explicit v_uint32x16(__m512i v) : val(v) {}
    v_uint32x16(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
                unsigned v4, unsigned v5, unsigned v6, unsigned v7,
                unsigned v8, unsigned v9, unsigned v10, unsigned v11,
                unsigned v12, unsigned v13, unsigned v14, unsigned v15)
    {
        val = _mm512_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3, (int)v4, (int)v5, (int)v6, (int)v7,
                                (int)v8, (int)v9, (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15);
    }
    v_uint32x16() {}

    static inline v_uint32x16 zero() { return v_uint32x16(_mm512_setzero_si512()); }

    unsigned get0() const { return (unsigned)_v_cvtsi512_si32(val); }
};
struct v_int32x16
{
    typedef int lane_type;
    enum { nlanes = 16 };
    __m512i val;

    explicit v_int32x16(__m512i v) : val(v) {}
    v_int32x16(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
               int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15)
    {
        val = _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
    }
    v_int32x16() {}

    static inline v_int32x16 zero() { return v_int32x16(_mm512_setzero_si512()); }

    int get0() const { return _v_cvtsi512_si32(val); }
};
struct v_float32x16
{
    typedef float lane_type;
    enum { nlanes = 16 };
    __m512 val;

    explicit v_float32x16(__m512 v) : val(v) {}
    v_float32x16(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7,
                 float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15)
    {
        val = _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
    }
    v_float32x16() {}

    static inline v_float32x16 zero() { return v_float32x16(_mm512_setzero_ps()); }

    float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); }
};
struct v_uint64x8
{
    typedef uint64 lane_type;
    enum { nlanes = 8 };
    __m512i val;

    explicit v_uint64x8(__m512i v) : val(v) {}
    v_uint64x8(uint64 v0, uint64 v1, uint64 v2, uint64 v3, uint64 v4, uint64 v5, uint64 v6, uint64 v7)
    { val = _mm512_setr_epi64((int64)v0, (int64)v1, (int64)v2, (int64)v3, (int64)v4, (int64)v5, (int64)v6, (int64)v7); }
    v_uint64x8() {}

    static inline v_uint64x8 zero() { return v_uint64x8(_mm512_setzero_si512()); }

    uint64 get0() const
    {
#if defined __x86_64__ || defined _M_X64
        return (uint64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
#else
        int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
        int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
#endif
    }
};
struct v_int64x8
{
    typedef int64 lane_type;
    enum { nlanes = 8 };
    __m512i val;

    explicit v_int64x8(__m512i v) : val(v) {}
    v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7)
    { val = _mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7); }
    v_int64x8() {}

    static inline v_int64x8 zero() { return v_int64x8(_mm512_setzero_si512()); }

    int64 get0() const
    {
#if defined __x86_64__ || defined _M_X64
        return (int64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
#else
        int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
        int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
#endif
    }
};
struct v_float64x8
{
    typedef double lane_type;
    enum { nlanes = 8 };
    __m512d val;

    explicit v_float64x8(__m512d v) : val(v) {}
    v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7)
    { val = _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); }
    v_float64x8() {}

    static inline v_float64x8 zero() { return v_float64x8(_mm512_setzero_pd()); }

    double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); }
};
#define OPENCV_HAL_IMPL_AVX512_LOADSTORE(_Tpvec, _Tp) \
    inline _Tpvec v512_load(const _Tp* ptr) \
    { return _Tpvec(_mm512_loadu_si512((const __m512i*)ptr)); } \
    inline _Tpvec v512_load_aligned(const _Tp* ptr) \
    { return _Tpvec(_mm512_load_si512((const __m512i*)ptr)); } \
    inline _Tpvec v512_load_low(const _Tp* ptr) \
    { \
        __m256i v256 = _mm256_loadu_si256((const __m256i*)ptr); \
        return _Tpvec(_mm512_castsi256_si512(v256)); \
    } \
    inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    { \
        __m256i vlo = _mm256_loadu_si256((const __m256i*)ptr0); \
        __m256i vhi = _mm256_loadu_si256((const __m256i*)ptr1); \
        return _Tpvec(_v512_combine(vlo, vhi)); \
    } \
    inline void v_store(_Tp* ptr, const _Tpvec& a) \
    { _mm512_storeu_si512((__m512i*)ptr, a.val); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { _mm512_store_si512((__m512i*)ptr, a.val); } \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
    { _mm512_stream_si512((__m512i*)ptr, a.val); } \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    { \
        if( mode == hal::STORE_UNALIGNED ) \
            _mm512_storeu_si512((__m512i*)ptr, a.val); \
        else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
            _mm512_stream_si512((__m512i*)ptr, a.val); \
        else \
            _mm512_store_si512((__m512i*)ptr, a.val); \
    } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_low(a.val)); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_high(a.val)); }

OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint8x64,  uchar)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int8x64,   schar)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint16x32, ushort)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int16x32,  short)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint32x16, unsigned)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int32x16,  int)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint64x8,  uint64)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int64x8,   int64)
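
// The macro above instantiates the whole load/store family for each integer
// vector type: v512_load/v512_load_aligned, v512_load_low/v512_load_halves,
// v_store/v_store_aligned/v_store_aligned_nocache, plus the low/high half stores.
// Aligned variants require 64-byte alignment. A hedged usage sketch (the buffer
// names are hypothetical, not part of the header):
//
//   uchar CV_DECL_ALIGNED(64) buf[64]; uchar out[64];
//   v_uint8x64 v = v512_load_aligned(buf);   // 64 uchar lanes
//   v_store(out, v);                         // unaligned store is always safe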
#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
    inline _Tpvec v512_load(const _Tp* ptr) \
    { return _Tpvec(_mm512_loadu_##suffix(ptr)); } \
    inline _Tpvec v512_load_aligned(const _Tp* ptr) \
    { return _Tpvec(_mm512_load_##suffix(ptr)); } \
    inline _Tpvec v512_load_low(const _Tp* ptr) \
    { \
        return _Tpvec(_mm512_cast##suffix##256_##suffix##512 \
                     (_mm256_loadu_##suffix(ptr))); \
    } \
    inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    { \
        halfreg vlo = _mm256_loadu_##suffix(ptr0); \
        halfreg vhi = _mm256_loadu_##suffix(ptr1); \
        return _Tpvec(_v512_combine(vlo, vhi)); \
    } \
    inline void v_store(_Tp* ptr, const _Tpvec& a) \
    { _mm512_storeu_##suffix(ptr, a.val); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { _mm512_store_##suffix(ptr, a.val); } \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
    { _mm512_stream_##suffix(ptr, a.val); } \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    { \
        if( mode == hal::STORE_UNALIGNED ) \
            _mm512_storeu_##suffix(ptr, a.val); \
        else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
            _mm512_stream_##suffix(ptr, a.val); \
        else \
            _mm512_store_##suffix(ptr, a.val); \
    } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { _mm256_storeu_##suffix(ptr, _v512_extract_low(a.val)); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    { _mm256_storeu_##suffix(ptr, _v512_extract_high(a.val)); }

OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float32x16, float,  ps, __m256)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8,  double, pd, __m256d)
#define OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, _Tpvecf, suffix, cast) \
    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
    { return _Tpvec(cast(a.val)); }

#define OPENCV_HAL_IMPL_AVX512_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
    inline _Tpvec v512_setzero_##suffix() \
    { return _Tpvec(_mm512_setzero_si512()); } \
    inline _Tpvec v512_setall_##suffix(_Tp v) \
    { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64,    suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8,    suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float32x16, suffix, _mm512_castps_si512) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float64x8,  suffix, _mm512_castpd_si512)

OPENCV_HAL_IMPL_AVX512_INIT(v_uint8x64,  uchar,    u8,  epi8,  char)
OPENCV_HAL_IMPL_AVX512_INIT(v_int8x64,   schar,    s8,  epi8,  char)
OPENCV_HAL_IMPL_AVX512_INIT(v_uint16x32, ushort,   u16, epi16, short)
OPENCV_HAL_IMPL_AVX512_INIT(v_int16x32,  short,    s16, epi16, short)
OPENCV_HAL_IMPL_AVX512_INIT(v_uint32x16, unsigned, u32, epi32, int)
OPENCV_HAL_IMPL_AVX512_INIT(v_int32x16,  int,      s32, epi32, int)
OPENCV_HAL_IMPL_AVX512_INIT(v_uint64x8,  uint64,   u64, epi64, int64)
OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8,   int64,    s64, epi64, int64)
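
// v512_setzero_*/v512_setall_* create constants, and the CAST instantiations
// provide bit-exact v_reinterpret_as_* conversions between all integer types
// (and from the float types via _mm512_castps_si512/_mm512_castpd_si512).
// Illustrative sketch (not in the original header):
//
//   v_int32x16  ones  = v512_setall_s32(1);
//   v_uint32x16 uones = v_reinterpret_as_u32(ones);   // same bits, unsigned view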
#define OPENCV_HAL_IMPL_AVX512_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
    inline _Tpvec v512_setzero_##suffix() \
    { return _Tpvec(_mm512_setzero_##zsuffix()); } \
    inline _Tpvec v512_setall_##suffix(_Tp v) \
    { return _Tpvec(_mm512_set1_##zsuffix(v)); } \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64,  suffix, cast) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64,   suffix, cast) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32,  suffix, cast) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, cast) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16,  suffix, cast) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8,  suffix, cast) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8,   suffix, cast)

OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float32x16, float,  f32, ps, _mm512_castsi512_ps)
OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float64x8,  double, f64, pd, _mm512_castsi512_pd)
inline v_float32x16 v_reinterpret_as_f32(const v_float32x16& a)
{ return a; }
inline v_float32x16 v_reinterpret_as_f32(const v_float64x8& a)
{ return v_float32x16(_mm512_castpd_ps(a.val)); }

inline v_float64x8 v_reinterpret_as_f64(const v_float64x8& a)
{ return a; }
inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a)
{ return v_float64x8(_mm512_castps_pd(a.val)); }
inline v_float32x16 v512_load_expand(const float16_t* ptr)
{
    return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr)));
}

inline void v_pack_store(float16_t* ptr, const v_float32x16& a)
{
    __m256i ah = _mm512_cvtps_ph(a.val, 0);
    _mm256_storeu_si256((__m256i*)ptr, ah);
}
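
// FP16 helpers: v512_load_expand widens 16 half-precision values to a
// v_float32x16, and v_pack_store rounds back down (rounding mode 0 =
// round-to-nearest-even). Illustrative round-trip sketch (the array names are
// hypothetical, not part of the header):
//
//   float16_t h[16]; float16_t h2[16];
//   v_float32x16 f = v512_load_expand(h);
//   v_pack_store(h2, f);                   // h2 matches h up to half precision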
inline void v_zip(const v_int8x64& a, const v_int8x64& b, v_int8x64& ab0, v_int8x64& ab1)
{
#if CV_AVX_512VBMI
    __m512i mask0 = _v512_set_epu8( 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24,
                                    87, 23, 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16,
                                    79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, 73,  9, 72,  8,
                                    71,  7, 70,  6, 69,  5, 68,  4, 67,  3, 66,  2, 65,  1, 64,  0);
    ab0 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu8(127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
                                   119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
                                   111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
                                   103, 39, 102, 38, 101, 37, 100, 36,  99, 35,  98, 34,  97, 33,  96, 32);
    ab1 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask1, b.val));
#else
    __m512i low  = _mm512_unpacklo_epi8(a.val, b.val);
    __m512i high = _mm512_unpackhi_epi8(a.val, b.val);
    ab0 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(11, 10, 3, 2,  9,  8, 1, 0), high));
    ab1 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(15, 14, 7, 6, 13, 12, 5, 4), high));
#endif
}
inline void v_zip(const v_int16x32& a, const v_int16x32& b, v_int16x32& ab0, v_int16x32& ab1)
{
    __m512i mask0 = _v512_set_epu16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41,  9, 40,  8,
                                    39,  7, 38,  6, 37,  5, 36,  4, 35,  3, 34,  2, 33,  1, 32,  0);
    ab0 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24,
                                    55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16);
    ab1 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask1, b.val));
}
inline void v_zip(const v_int32x16& a, const v_int32x16& b, v_int32x16& ab0, v_int32x16& ab1)
{
    __m512i mask0 = _v512_set_epu32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    ab0 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    ab1 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask1, b.val));
}
inline void v_zip(const v_int64x8& a, const v_int64x8& b, v_int64x8& ab0, v_int64x8& ab1)
{
    __m512i mask0 = _v512_set_epu64(11, 3, 10, 2, 9, 1, 8, 0);
    ab0 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu64(15, 7, 14, 6, 13, 5, 12, 4);
    ab1 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask1, b.val));
}

inline void v_zip(const v_uint8x64& a, const v_uint8x64& b, v_uint8x64& ab0, v_uint8x64& ab1)
{
    v_int8x64 i0, i1;
    v_zip(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b), i0, i1);
    ab0 = v_reinterpret_as_u8(i0);
    ab1 = v_reinterpret_as_u8(i1);
}
inline void v_zip(const v_uint16x32& a, const v_uint16x32& b, v_uint16x32& ab0, v_uint16x32& ab1)
{
    v_int16x32 i0, i1;
    v_zip(v_reinterpret_as_s16(a), v_reinterpret_as_s16(b), i0, i1);
    ab0 = v_reinterpret_as_u16(i0);
    ab1 = v_reinterpret_as_u16(i1);
}
inline void v_zip(const v_uint32x16& a, const v_uint32x16& b, v_uint32x16& ab0, v_uint32x16& ab1)
{
    v_int32x16 i0, i1;
    v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
    ab0 = v_reinterpret_as_u32(i0);
    ab1 = v_reinterpret_as_u32(i1);
}
inline void v_zip(const v_uint64x8& a, const v_uint64x8& b, v_uint64x8& ab0, v_uint64x8& ab1)
{
    v_int64x8 i0, i1;
    v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
    ab0 = v_reinterpret_as_u64(i0);
    ab1 = v_reinterpret_as_u64(i1);
}
inline void v_zip(const v_float32x16& a, const v_float32x16& b, v_float32x16& ab0, v_float32x16& ab1)
{
    v_int32x16 i0, i1;
    v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
    ab0 = v_reinterpret_as_f32(i0);
    ab1 = v_reinterpret_as_f32(i1);
}
inline void v_zip(const v_float64x8& a, const v_float64x8& b, v_float64x8& ab0, v_float64x8& ab1)
{
    v_int64x8 i0, i1;
    v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
    ab0 = v_reinterpret_as_f64(i0);
    ab1 = v_reinterpret_as_f64(i1);
}
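
// v_zip interleaves two vectors lane by lane: ab0 receives the interleaved low
// halves (a0,b0,a1,b1,...), ab1 the interleaved high halves. The unsigned and
// float overloads simply forward to the signed ones through bit reinterpretation.
// Illustrative sketch (not in the original header):
//
//   v_int32x16 a = v512_setall_s32(1), b = v512_setall_s32(2), lo, hi;
//   v_zip(a, b, lo, hi);   // lo = (1,2,1,2,...), hi = (1,2,1,2,...)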
#define OPENCV_HAL_IMPL_AVX512_COMBINE(_Tpvec, suffix) \
    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val))); } \
    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_v512_insert(b.val, _v512_extract_high(a.val))); } \
    inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
                            _Tpvec& c, _Tpvec& d) \
    { \
        c.val = _v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val)); \
        d.val = _v512_insert(b.val, _v512_extract_high(a.val)); \
    }

OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint8x64,   epi8)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_int8x64,    epi8)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint16x32,  epi16)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_int16x32,   epi16)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint32x16,  epi32)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_int32x16,   epi32)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint64x8,   epi64)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_int64x8,    epi64)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_float64x8,  pd)
#define OPENCV_HAL_IMPL_AVX512_BIN_FUNC(func, _Tpvec, intrin) \
    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint8x64,  _mm512_add_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int8x64,   _mm512_add_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint16x32, _mm512_add_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int16x32,  _mm512_add_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint8x64,  _mm512_sub_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int8x64,   _mm512_sub_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint16x32, _mm512_sub_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int16x32,  _mm512_sub_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_uint16x32, _mm512_mullo_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_int16x32,  _mm512_mullo_epi16)
inline v_uint8x64 v_mul_wrap(const v_uint8x64& a, const v_uint8x64& b)
{
    __m512i ad = _mm512_srai_epi16(a.val, 8);
    __m512i bd = _mm512_srai_epi16(b.val, 8);
    __m512i p0 = _mm512_mullo_epi16(a.val, b.val);
    __m512i p1 = _mm512_slli_epi16(_mm512_mullo_epi16(ad, bd), 8);
    return v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, p0, p1));
}
inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b)
{
    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}
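
// The 8-bit v_mul_wrap above has no direct _mm512_mullo_epi8, so it computes the
// even and odd byte lanes as 16-bit products and blends them with the 0xAAAA...
// odd-lane mask; results wrap around modulo 256. Illustrative sketch:
//
//   v_uint8x64 x = v512_setall_u8(200), y = v512_setall_u8(2);
//   v_uint8x64 p = v_mul_wrap(x, y);   // every lane is 400 % 256 = 144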
#define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(intrin(a.val, b.val)); } \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    { a.val = intrin(a.val, b.val); return a; }

OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16,  _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16,  _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8,  _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8,  _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8,   _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8,   _mm512_sub_epi64)

OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16,  _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8,  _mm512_mullo_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8,   _mm512_mullo_epi64)

OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64,  _mm512_adds_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64,  _mm512_subs_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64,   _mm512_adds_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64,   _mm512_subs_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32,  _mm512_adds_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32,  _mm512_subs_epi16)

OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8,  _mm512_add_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8,  _mm512_sub_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8,  _mm512_mul_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8,  _mm512_div_pd)
inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b)
{
    v_uint16x32 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b)
{
    v_int16x32 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
{
    __m512i pl = _mm512_mullo_epi16(a.val, b.val);
    __m512i ph = _mm512_mulhi_epu16(a.val, b.val);
    __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
    __m512i p1 = _mm512_unpackhi_epi16(pl, ph);

    const __m512i m = _mm512_set1_epi32(65535);
    return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m)));
}
inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
{
    __m512i pl = _mm512_mullo_epi16(a.val, b.val);
    __m512i ph = _mm512_mulhi_epi16(a.val, b.val);
    __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
    __m512i p1 = _mm512_unpackhi_epi16(pl, ph);
    return v_int16x32(_mm512_packs_epi32(p0, p1));
}

inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b)
{ a = a * b; return a; }
inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b)
{ a = a * b; return a; }
inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b)
{ a = a * b; return a; }
inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b)
{ a = a * b; return a; }

inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); }
inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); }
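
// Unlike v_mul_wrap, the 8-bit and 16-bit operator* above saturate: the 16-bit
// (or 32-bit) intermediate products are clamped to the destination range before
// packing back down. Illustrative sketch (not in the original header):
//
//   v_uint16x32 x = v512_setall_u16(60000), y = v512_setall_u16(2);
//   v_uint16x32 p = x * y;   // lanes saturate to 65535 instead of wrapping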
inline void v_mul_expand(const v_uint8x64& a, const v_uint8x64& b,
                         v_uint16x32& c, v_uint16x32& d)
{
    v_uint16x32 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int8x64& a, const v_int8x64& b,
                         v_int16x32& c, v_int16x32& d)
{
    v_int16x32 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int16x32& a, const v_int16x32& b,
                         v_int32x16& c, v_int32x16& d)
{
    v_int16x32 v0, v1;
    v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);

    c = v_reinterpret_as_s32(v0);
    d = v_reinterpret_as_s32(v1);
}

inline void v_mul_expand(const v_uint16x32& a, const v_uint16x32& b,
                         v_uint32x16& c, v_uint32x16& d)
{
    v_uint16x32 v0, v1;
    v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);

    c = v_reinterpret_as_u32(v0);
    d = v_reinterpret_as_u32(v1);
}

inline void v_mul_expand(const v_uint32x16& a, const v_uint32x16& b,
                         v_uint64x8& c, v_uint64x8& d)
{
    v_zip(v_uint64x8(_mm512_mul_epu32(a.val, b.val)),
          v_uint64x8(_mm512_mul_epu32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
}

inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b,
                         v_int64x8& c, v_int64x8& d)
{
    v_zip(v_int64x8(_mm512_mul_epi32(a.val, b.val)),
          v_int64x8(_mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
}
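
// v_mul_expand returns the full-width products: c holds the products of the
// lower-indexed lanes, d those of the higher-indexed lanes, each in a vector
// with lanes twice as wide. Illustrative sketch (not in the original header):
//
//   v_uint32x16 a = v512_setall_u32(0xFFFFFFFF), b = v512_setall_u32(2);
//   v_uint64x8 lo, hi;
//   v_mul_expand(a, b, lo, hi);   // every 64-bit lane holds 0x1FFFFFFFE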
#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
    inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
    { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
    inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
    { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
    inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
    { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
    inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
    { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpuvec v_shl(const _Tpuvec& a) \
    { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpsvec v_shl(const _Tpsvec& a) \
    { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpuvec v_shr(const _Tpuvec& a) \
    { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpsvec v_shr(const _Tpsvec& a) \
    { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); }

OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint16x32, v_int16x32, epi16)
OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint32x16, v_int32x16, epi32)
OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8,  v_int64x8,  epi64)
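
// Element-wise shifts: the operator<< / operator>> forms take a runtime count,
// while v_shl<imm>/v_shr<imm> take the count as a template parameter. Right
// shift is logical (srli) for unsigned types and arithmetic (srai) for signed.
// Illustrative sketch:
//
//   v_int32x16 x = v512_setall_s32(-16);
//   v_int32x16 y = v_shr<2>(x);      // arithmetic shift -> every lane is -4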
#define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \
    OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \
    OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \
    OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); }

OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64,   si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int8x64,    si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint16x32,  si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int16x32,   si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint32x16,  si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int32x16,   si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint64x8,   si512, _mm512_set1_epi64(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int64x8,    si512, _mm512_set1_epi64(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float32x16, ps,    _mm512_castsi512_ps(_mm512_set1_epi32(-1)))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float64x8,  pd,    _mm512_castsi512_pd(_mm512_set1_epi32(-1)))
#define OPENCV_HAL_IMPL_AVX512_SELECT(_Tpvec, suffix, zsuf) \
    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_mm512_mask_blend_##suffix(_mm512_cmp_##suffix##_mask(mask.val, _mm512_setzero_##zsuf(), _MM_CMPINT_EQ), a.val, b.val)); }

OPENCV_HAL_IMPL_AVX512_SELECT(v_uint8x64,   epi8,  si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_int8x64,    epi8,  si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_uint16x32,  epi16, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_int16x32,   epi16, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_uint32x16,  epi32, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_int32x16,   epi32, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_uint64x8,   epi64, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_int64x8,    epi64, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_float32x16, ps,    ps)
OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8,  pd,    pd)
#define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); }

#define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(<,  _MM_CMPINT_LT,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(>,  _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)

OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64,  epu8,  epi8,  (char)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64,   epi8,  epi8,  (char)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint16x32, epu16, epi16, (short)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int16x32,  epi16, epi16, (short)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint32x16, epu32, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int32x16,  epi32, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8,  epu64, epi64, (int64)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8,   epi64, epi64, (int64)-1)
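
// Integer comparisons are implemented with AVX-512 mask registers: the k-mask
// from _mm512_cmp_*_mask is expanded back into a vector whose lanes are all-ones
// (tval = -1 of the lane type) where the predicate holds and zero elsewhere, so
// the result can feed v_select directly. Illustrative sketch:
//
//   v_int32x16 a = v512_setall_s32(1), b = v512_setall_s32(2);
//   v_int32x16 m = (a < b);               // every lane is 0xFFFFFFFF
//   v_int32x16 r = v_select(m, a, b);     // picks a where the mask is set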
#define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); }

#define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(<,  _CMP_LT_OQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(>,  _CMP_GT_OQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ,  _Tpvec, sufcmp, sufset, tval)

OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8,  pd, epi64, (int64)-1)

inline v_float32x16 v_not_nan(const v_float32x16& a)
{ return v_float32x16(_mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.val, a.val, _CMP_ORD_Q), (int)-1))); }
inline v_float64x8 v_not_nan(const v_float64x8& a)
{ return v_float64x8(_mm512_castsi512_pd(_mm512_maskz_set1_epi64(_mm512_cmp_pd_mask(a.val, a.val, _CMP_ORD_Q), (int64)-1))); }
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint8x64,   _mm512_min_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint8x64,   _mm512_max_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int8x64,    _mm512_min_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int8x64,    _mm512_max_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint16x32,  _mm512_min_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint16x32,  _mm512_max_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int16x32,   _mm512_min_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int16x32,   _mm512_max_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint32x16,  _mm512_min_epu32)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint32x16,  _mm512_max_epu32)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int32x16,   _mm512_min_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int32x16,   _mm512_max_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint64x8,   _mm512_min_epu64)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint64x8,   _mm512_max_epu64)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int64x8,    _mm512_min_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int64x8,    _mm512_max_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float32x16, _mm512_min_ps)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float32x16, _mm512_max_ps)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float64x8,  _mm512_min_pd)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float64x8,  _mm512_max_pd)
template<bool prec, int imm4, bool part, int imm32>
struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
template<int imm4, int imm32>
struct _v_rotate_right<true, imm4, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
{
    return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32    ),  imm4   *8),
                                     _mm512_slli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 + 1), (4-imm4)*8)));
}};
template<int imm4>
struct _v_rotate_right<true, imm4, false, 15> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
{
    return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, 15),  imm4   *8),
                                     _mm512_slli_epi32(                    b.val,             (4-imm4)*8)));
}};
template<int imm4, int imm32>
struct _v_rotate_right<true, imm4, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
{
    return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16),  imm4   *8),
                                     _mm512_slli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 15), (4-imm4)*8)));
}};
template<int imm4>
struct _v_rotate_right<true, imm4, true, 31> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
{ return v_int8x64(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, 15), imm4*8)); }};
template<int imm32>
struct _v_rotate_right<false, 0, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
{ return v_int8x64(_mm512_alignr_epi32(b.val, a.val, imm32)); }};
template<>
struct _v_rotate_right<false, 0, false, 0> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64&) { return a; }};
template<int imm32>
struct _v_rotate_right<false, 0, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
{ return v_int8x64(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16)); }};
template<>
struct _v_rotate_right<false, 0, true, 16> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) { return b; }};
template<>
struct _v_rotate_right<false, 0, true, 32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
template<int imm> inline v_int8x64 v_rotate_right(const v_int8x64& a, const v_int8x64& b)
{
    return imm >= 128 ? v_int8x64() :
#if CV_AVX_512VBMI
    v_int8x64(_mm512_permutex2var_epi8(a.val,
        _v512_set_epu8(0x3f + imm, 0x3e + imm, 0x3d + imm, 0x3c + imm, 0x3b + imm, 0x3a + imm, 0x39 + imm, 0x38 + imm,
                       0x37 + imm, 0x36 + imm, 0x35 + imm, 0x34 + imm, 0x33 + imm, 0x32 + imm, 0x31 + imm, 0x30 + imm,
                       0x2f + imm, 0x2e + imm, 0x2d + imm, 0x2c + imm, 0x2b + imm, 0x2a + imm, 0x29 + imm, 0x28 + imm,
                       0x27 + imm, 0x26 + imm, 0x25 + imm, 0x24 + imm, 0x23 + imm, 0x22 + imm, 0x21 + imm, 0x20 + imm,
                       0x1f + imm, 0x1e + imm, 0x1d + imm, 0x1c + imm, 0x1b + imm, 0x1a + imm, 0x19 + imm, 0x18 + imm,
                       0x17 + imm, 0x16 + imm, 0x15 + imm, 0x14 + imm, 0x13 + imm, 0x12 + imm, 0x11 + imm, 0x10 + imm,
                       0x0f + imm, 0x0e + imm, 0x0d + imm, 0x0c + imm, 0x0b + imm, 0x0a + imm, 0x09 + imm, 0x08 + imm,
                       0x07 + imm, 0x06 + imm, 0x05 + imm, 0x04 + imm, 0x03 + imm, 0x02 + imm, 0x01 + imm, 0x00 + imm), b.val));
#else
    _v_rotate_right<imm%4!=0, imm%4, (imm/4 > 15), imm/4>::eval(a, b);
#endif
}
template<int imm>
inline v_int8x64 v_rotate_left(const v_int8x64& a, const v_int8x64& b)
{
    if (imm == 0) return a;
    if (imm == 64) return b;
    if (imm >= 128) return v_int8x64();
#if CV_AVX_512VBMI
    return v_int8x64(_mm512_permutex2var_epi8(b.val,
        _v512_set_epi8(0x7f - imm,0x7e - imm,0x7d - imm,0x7c - imm,0x7b - imm,0x7a - imm,0x79 - imm,0x78 - imm,
                       0x77 - imm,0x76 - imm,0x75 - imm,0x74 - imm,0x73 - imm,0x72 - imm,0x71 - imm,0x70 - imm,
                       0x6f - imm,0x6e - imm,0x6d - imm,0x6c - imm,0x6b - imm,0x6a - imm,0x69 - imm,0x68 - imm,
                       0x67 - imm,0x66 - imm,0x65 - imm,0x64 - imm,0x63 - imm,0x62 - imm,0x61 - imm,0x60 - imm,
                       0x5f - imm,0x5e - imm,0x5d - imm,0x5c - imm,0x5b - imm,0x5a - imm,0x59 - imm,0x58 - imm,
                       0x57 - imm,0x56 - imm,0x55 - imm,0x54 - imm,0x53 - imm,0x52 - imm,0x51 - imm,0x50 - imm,
                       0x4f - imm,0x4e - imm,0x4d - imm,0x4c - imm,0x4b - imm,0x4a - imm,0x49 - imm,0x48 - imm,
                       0x47 - imm,0x46 - imm,0x45 - imm,0x44 - imm,0x43 - imm,0x42 - imm,0x41 - imm,0x40 - imm), a.val));
#else
    return imm < 64 ? v_rotate_right<64 - imm>(b, a) : v_rotate_right<128 - imm>(v512_setzero_s8(), b);
#endif
}
template<int imm>
inline v_int8x64 v_rotate_right(const v_int8x64& a)
{
    if (imm == 0) return a;
    if (imm >= 64) return v_int8x64();
#if CV_AVX_512VBMI
    return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF >> imm,
        _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm,
                       0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm,
                       0x2f + imm,0x2e + imm,0x2d + imm,0x2c + imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm,
                       0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm,
                       0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm,
                       0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm,
                       0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm,
                       0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), a.val));
#else
    return v_rotate_right<imm>(a, v512_setzero_s8());
#endif
}
template<int imm>
inline v_int8x64 v_rotate_left(const v_int8x64& a)
{
    if (imm == 0) return a;
    if (imm >= 64) return v_int8x64();
#if CV_AVX_512VBMI
    return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF << imm,
        _v512_set_epi8(0x3f - imm,0x3e - imm,0x3d - imm,0x3c - imm,0x3b - imm,0x3a - imm,0x39 - imm,0x38 - imm,
                       0x37 - imm,0x36 - imm,0x35 - imm,0x34 - imm,0x33 - imm,0x32 - imm,0x31 - imm,0x30 - imm,
                       0x2f - imm,0x2e - imm,0x2d - imm,0x2c - imm,0x2b - imm,0x2a - imm,0x29 - imm,0x28 - imm,
                       0x27 - imm,0x26 - imm,0x25 - imm,0x24 - imm,0x23 - imm,0x22 - imm,0x21 - imm,0x20 - imm,
                       0x1f - imm,0x1e - imm,0x1d - imm,0x1c - imm,0x1b - imm,0x1a - imm,0x19 - imm,0x18 - imm,
                       0x17 - imm,0x16 - imm,0x15 - imm,0x14 - imm,0x13 - imm,0x12 - imm,0x11 - imm,0x10 - imm,
                       0x0f - imm,0x0e - imm,0x0d - imm,0x0c - imm,0x0b - imm,0x0a - imm,0x09 - imm,0x08 - imm,
                       0x07 - imm,0x06 - imm,0x05 - imm,0x04 - imm,0x03 - imm,0x02 - imm,0x01 - imm,0x00 - imm), a.val));
#else
    return v_rotate_right<64 - imm>(v512_setzero_s8(), a);
#endif
}
#define OPENCV_HAL_IMPL_AVX512_ROTATE_PM(_Tpvec, suffix) \
template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \
template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \
template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); } \
template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); }

#define OPENCV_HAL_IMPL_AVX512_ROTATE_EC(_Tpvec, suffix) \
template<int imm> \
inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \
    enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
    if (imm == 0) return a; \
    if (imm == _Tpvec::nlanes) return b; \
    if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
    return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << (imm))&MASK, a.val)); \
} \
template<int imm> \
inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \
    enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
    if (imm == 0) return a; \
    if (imm == _Tpvec::nlanes) return b; \
    if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
    return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << (imm))&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \
} \
template<int imm> \
inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    if (imm == 0) return a; \
    if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
    return _Tpvec(_mm512_maskz_expand_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
} \
template<int imm> \
inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    if (imm == 0) return a; \
    if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
    return _Tpvec(_mm512_maskz_compress_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
}

OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint8x64,   u8)
OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint16x32,  u16)
OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_int16x32,   s16)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint32x16,  epi32)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int32x16,   epi32)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint64x8,   epi64)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8,    epi64)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8,  pd)
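
// v_rotate_left/v_rotate_right shift whole lanes across the register (filling
// with zeros, or with lanes of the second argument in the two-vector forms).
// 8/16-bit types go through the byte implementation above; 32/64-bit and float
// types use the compress/expand mask trick. Illustrative sketch:
//
//   v_int32x16 a = v512_setall_s32(1), b = v512_setall_s32(2);
//   v_int32x16 r = v_rotate_right<4>(a, b);  // lanes 4..15 of a, then 4 lanes of b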
inline v_uint8x64 v_reverse(const v_uint8x64 &a)
{
#if CV_AVX_512VBMI
    static const __m512i perm = _mm512_set_epi32(
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f,
            0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f,
            0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
    return v_uint8x64(_mm512_permutexvar_epi8(perm, a.val));
#else
    static const __m512i shuf = _mm512_set_epi32(
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
    static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
    __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
    return v_uint8x64(_mm512_permutexvar_epi64(perm, vec));
#endif
}

inline v_int8x64 v_reverse(const v_int8x64 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x32 v_reverse(const v_uint16x32 &a)
{
#if CV_AVX_512VBMI
    static const __m512i perm = _mm512_set_epi32(
            0x00000001, 0x00020003, 0x00040005, 0x00060007,
            0x00080009, 0x000a000b, 0x000c000d, 0x000e000f,
            0x00100011, 0x00120013, 0x00140015, 0x00160017,
            0x00180019, 0x001a001b, 0x001c001d, 0x001e001f);
    return v_uint16x32(_mm512_permutexvar_epi16(perm, a.val));
#else
    static const __m512i shuf = _mm512_set_epi32(
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
    static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
    __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
    return v_uint16x32(_mm512_permutexvar_epi64(perm, vec));
#endif
}

inline v_int16x32 v_reverse(const v_int16x32 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x16 v_reverse(const v_uint32x16 &a)
{
    static const __m512i perm = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
}

inline v_int32x16 v_reverse(const v_int32x16 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x16 v_reverse(const v_float32x16 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x8 v_reverse(const v_uint64x8 &a)
{
    static const __m512i perm = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
    return v_uint64x8(_mm512_permutexvar_epi64(perm, a.val));
}

inline v_int64x8 v_reverse(const v_int64x8 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x8 v_reverse(const v_float64x8 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
#define OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64(a, b) a + b
#define OPENCV_HAL_IMPL_AVX512_REDUCE_8(sctype, func, _Tpvec, ifunc, scop) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
      sctype CV_DECL_ALIGNED(64) idx[2]; \
      _mm_store_si128((__m128i*)idx, _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1))); \
      return scop(idx[0], idx[1]); }
OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, min, v_uint64x8, min_epu64, min)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, max, v_uint64x8, max_epu64, max)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, sum, v_uint64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  min, v_int64x8,  min_epi64, min)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  max, v_int64x8,  max_epi64, max)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  sum, v_int64x8,  add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
#define OPENCV_HAL_IMPL_AVX512_REDUCE_8F(func, ifunc, scop) \
    inline double v_reduce_##func(const v_float64x8& a) \
    { __m256d half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
      double CV_DECL_ALIGNED(64) idx[2]; \
      _mm_store_pd(idx, _mm_##ifunc(_mm256_castpd256_pd128(half), _mm256_extractf128_pd(half, 1))); \
      return scop(idx[0], idx[1]); }
OPENCV_HAL_IMPL_AVX512_REDUCE_8F(min, min_pd, min)
OPENCV_HAL_IMPL_AVX512_REDUCE_8F(max, max_pd, max)
OPENCV_HAL_IMPL_AVX512_REDUCE_8F(sum, add_pd, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
#define OPENCV_HAL_IMPL_AVX512_REDUCE_16(sctype, func, _Tpvec, ifunc) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
      __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
      return (sctype)_mm_cvtsi128_si32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, min, v_uint32x16, min_epu32)
OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, max, v_uint32x16, max_epu32)
OPENCV_HAL_IMPL_AVX512_REDUCE_16(int,  min, v_int32x16,  min_epi32)
OPENCV_HAL_IMPL_AVX512_REDUCE_16(int,  max, v_int32x16,  max_epi32)
#define OPENCV_HAL_IMPL_AVX512_REDUCE_16F(func, ifunc) \
    inline float v_reduce_##func(const v_float32x16& a) \
    { __m256 half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
      __m128 quarter = _mm_##ifunc(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); \
      quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 3, 2))); \
      quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 0, 1))); \
      return _mm_cvtss_f32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_16F(min, min_ps)
OPENCV_HAL_IMPL_AVX512_REDUCE_16F(max, max_ps)
inline float v_reduce_sum(const v_float32x16& a)
{
    __m256 half = _mm256_add_ps(_v512_extract_low(a.val), _v512_extract_high(a.val));
    __m128 quarter = _mm_add_ps(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1));
    quarter = _mm_hadd_ps(quarter, quarter);
    return _mm_cvtss_f32(_mm_hadd_ps(quarter, quarter));
}

inline int v_reduce_sum(const v_int32x16& a)
{
    __m256i half = _mm256_add_epi32(_v512_extract_low(a.val), _v512_extract_high(a.val));
    __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
    quarter = _mm_hadd_epi32(quarter, quarter);
    return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter));
}
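
// The reduction helpers fold a 512-bit vector down through its 256- and 128-bit
// halves before the final scalar step. Illustrative sketch (not in the original
// header):
//
//   v_float32x16 v = v512_setall_f32(1.f);
//   float s = v_reduce_sum(v);    // 16.0f
//   float m = v_reduce_max(v);    // 1.0f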
#define OPENCV_HAL_IMPL_AVX512_REDUCE_32(sctype, func, _Tpvec, ifunc) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
      __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \
      return (sctype)_mm_cvtsi128_si32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, min, v_uint16x32, min_epu16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, max, v_uint16x32, max_epu16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(short,  min, v_int16x32,  min_epi16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(short,  max, v_int16x32,  max_epi16)
#define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
      __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 1)); \
      return (sctype)_mm_cvtsi128_si32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, min, v_uint8x64, min_epu8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, max, v_uint8x64, max_epu8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, min, v_int8x64, min_epi8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, max, v_int8x64, max_epi8)
#define OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(sctype, _Tpvec, suffix) \
    inline sctype v_reduce_sum(const _Tpvec& a) \
    { __m512i a16 = _mm512_add_epi16(_mm512_cvt##suffix##_epi16(_v512_extract_low(a.val)), \
                                     _mm512_cvt##suffix##_epi16(_v512_extract_high(a.val))); \
      a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(_v512_extract_low(a16), _v512_extract_high(a16))); \
      __m256i a8 = _mm256_add_epi32(_v512_extract_low(a16), _v512_extract_high(a16)); \
      __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1)); \
      a4 = _mm_hadd_epi32(a4, a4); \
      return (sctype)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4)); }
OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(uint, v_uint8x64, epu8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(int, v_int8x64, epi8)
inline v_float32x16 v_reduce_sum4(const v_float32x16& a, const v_float32x16& b,
                                  const v_float32x16& c, const v_float32x16& d)
{
    __m256 abl = _mm256_hadd_ps(_v512_extract_low(a.val), _v512_extract_low(b.val));
    __m256 abh = _mm256_hadd_ps(_v512_extract_high(a.val), _v512_extract_high(b.val));
    __m256 cdl = _mm256_hadd_ps(_v512_extract_low(c.val), _v512_extract_low(d.val));
    __m256 cdh = _mm256_hadd_ps(_v512_extract_high(c.val), _v512_extract_high(d.val));
    return v_float32x16(_v512_combine(_mm256_hadd_ps(abl, cdl), _mm256_hadd_ps(abh, cdh)));
}
inline unsigned v_reduce_sad(const v_uint8x64& a, const v_uint8x64& b)
{
    __m512i val = _mm512_sad_epu8(a.val, b.val);
    __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
    __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b)
{
    __m512i val = _mm512_set1_epi8(-128);
    val = _mm512_sad_epu8(_mm512_add_epi8(a.val, val), _mm512_add_epi8(b.val, val));
    __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
    __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b)
{ return v_reduce_sum(v_add_wrap(a - b, b - a)); }
inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b)
{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); }
inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b)
{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); }
inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b)
{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); }
inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b)
{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); }
inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b)
{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); }
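// Illustrative usage sketch (not part of the original header). Assuming the
// v512_load helper declared elsewhere in this file, a sum of absolute
// differences over two uchar buffers could be accumulated with the reducers
// above; example_sad64 is a hypothetical helper shown only for exposition:
//
//   inline unsigned example_sad64(const uchar* p, const uchar* q, int nblocks)
//   {
//       unsigned s = 0;
//       for (int i = 0; i < nblocks * 64; i += 64)
//           s += v_reduce_sad(v512_load(p + i), v512_load(q + i));
//       return s;
//   }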
inline v_uint8x64 v_popcount(const v_int8x64& a)
{
#if CV_AVX_512BITALG
    return v_uint8x64(_mm512_popcnt_epi8(a.val));
#elif CV_AVX_512VBMI
    __m512i _popcnt_table0 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
                                            4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
    __m512i _popcnt_table1 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
                                            6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
                                            6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1);
    return v_uint8x64(_mm512_sub_epi8(_mm512_permutex2var_epi8(_popcnt_table0, a.val, _popcnt_table1), _mm512_movm_epi8(_mm512_movepi8_mask(a.val))));
#else
    __m512i _popcnt_table = _mm512_set4_epi32(0x04030302, 0x03020201, 0x03020201, 0x02010100);
    __m512i _popcnt_mask = _mm512_set1_epi8(0x0F);

    return v_uint8x64(_mm512_add_epi8(_mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(                  a.val    , _popcnt_mask)),
                                      _mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(_mm512_srli_epi16(a.val, 4), _popcnt_mask))));
#endif
}
inline v_uint16x32 v_popcount(const v_int16x32& a)
{
#if CV_AVX_512BITALG
    return v_uint16x32(_mm512_popcnt_epi16(a.val));
#elif CV_AVX_512VPOPCNTDQ
    __m512i zero = _mm512_setzero_si512();
    return v_uint16x32(_mm512_packs_epi32(_mm512_popcnt_epi32(_mm512_unpacklo_epi16(a.val, zero)),
                                          _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero))));
#else
    v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
    p += v_rotate_right<1>(p);
    return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff);
#endif
}
inline v_uint32x16 v_popcount(const v_int32x16& a)
{
#if CV_AVX_512VPOPCNTDQ
    return v_uint32x16(_mm512_popcnt_epi32(a.val));
#else
    v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
    p += v_rotate_right<1>(p);
    p += v_rotate_right<2>(p);
    return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff);
#endif
}
inline v_uint64x8 v_popcount(const v_int64x8& a)
{
#if CV_AVX_512VPOPCNTDQ
    return v_uint64x8(_mm512_popcnt_epi64(a.val));
#else
    return v_uint64x8(_mm512_sad_epu8(v_popcount(v_reinterpret_as_s8(a)).val, _mm512_setzero_si512()));
#endif
}
inline v_uint8x64 v_popcount(const v_uint8x64& a) { return v_popcount(v_reinterpret_as_s8(a)); }
inline v_uint16x32 v_popcount(const v_uint16x32& a) { return v_popcount(v_reinterpret_as_s16(a)); }
inline v_uint32x16 v_popcount(const v_uint32x16& a) { return v_popcount(v_reinterpret_as_s32(a)); }
inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinterpret_as_s64(a)); }
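// Illustrative sketch (not part of the original header): counting set bits in
// one 64-byte block with the v_popcount overloads above; example_popcount_u8
// is a hypothetical helper shown only to demonstrate the API.
//
//   inline unsigned example_popcount_u8(const uchar* p)   // reads 64 bytes
//   {
//       v_uint8x64 v = v512_load(p);
//       return v_reduce_sum(v_popcount(v));
//   }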
#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \
    inline _Tpvec v_sqrt(const _Tpvec& x) \
    { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \
    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
    { return v_fma(a, a, b * b); } \
    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
    { return v_sqrt(v_fma(a, a, b * b)); }

OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd)
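// Illustrative sketch (not part of the original header): the macro above gives
// the float32/float64 types v_fma, v_muladd, v_sqrt, v_sqr_magnitude and
// v_magnitude. A hypothetical axpy-style update over 16 floats might read:
//
//   inline void example_axpy16(float* y, const float* x, float a)
//   {
//       v_float32x16 va = v512_setall_f32(a);
//       v_store(y, v_fma(va, v512_load(x), v512_load(y)));   // y = a*x + y
//   }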
inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
{ return a * b + c; }
inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
{ return v_fma(a, b, c); }
inline v_float32x16 v_invsqrt(const v_float32x16& x)
{
#if CV_AVX_512ER
    return v_float32x16(_mm512_rsqrt28_ps(x.val));
#else
    v_float32x16 half = x * v512_setall_f32(0.5);
    v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val));
    // Newton-Raphson refinement of the 14-bit estimate
    t *= v512_setall_f32(1.5) - ((t * t) * half);
    return t;
#endif
}
inline v_float64x8 v_invsqrt(const v_float64x8& x)
{
#if CV_AVX_512ER
    return v_float64x8(_mm512_rsqrt28_pd(x.val));
#else
    return v512_setall_f64(1.) / v_sqrt(x);
#endif
}
#define OPENCV_HAL_IMPL_AVX512_ABS(_Tpvec, _Tpuvec, suffix) \
    inline _Tpuvec v_abs(const _Tpvec& x) \
    { return _Tpuvec(_mm512_abs_##suffix(x.val)); }

OPENCV_HAL_IMPL_AVX512_ABS(v_int8x64, v_uint8x64, epi8)
OPENCV_HAL_IMPL_AVX512_ABS(v_int16x32, v_uint16x32, epi16)
OPENCV_HAL_IMPL_AVX512_ABS(v_int32x16, v_uint32x16, epi32)
OPENCV_HAL_IMPL_AVX512_ABS(v_int64x8, v_uint64x8, epi64)
inline v_float32x16 v_abs(const v_float32x16& x)
{
#ifdef _mm512_abs_pd
    return v_float32x16(_mm512_abs_ps(x.val));
#else
    return v_float32x16(_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x.val),
        _v512_set_epu64(0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF,
                        0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF))));
#endif
}
inline v_float64x8 v_abs(const v_float64x8& x)
{
#ifdef _mm512_abs_pd
    #if defined __GNUC__ && (__GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ <= 3) || (__GNUC__ == 8 && __GNUC_MINOR__ <= 2))
        return v_float64x8(_mm512_abs_pd(_mm512_castpd_ps(x.val)));
    #else
        return v_float64x8(_mm512_abs_pd(x.val));
    #endif
#else
    return v_float64x8(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(x.val),
        _v512_set_epu64(0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF,
                        0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF))));
#endif
}
inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b)
{ return v_max(a, b) - v_min(a, b); }
inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b)
{
    v_int8x64 d = v_sub_wrap(a, b);
    v_int8x64 m = a < b;
    return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b)
{
    v_int32x16 d = a - b;
    v_int32x16 m = a < b;
    return v_reinterpret_as_u32((d ^ m) - m);
}
inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b)
{ return v_abs(a - b); }
inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b)
{ return v_abs(a - b); }
inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b)
{
    v_int8x64 d = a - b;
    v_int8x64 m = a < b;
    return (d ^ m) - m;
}
inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b)
{ return v_max(a, b) - v_min(a, b); }
inline v_int32x16 v_round(const v_float32x16& a)
{ return v_int32x16(_mm512_cvtps_epi32(a.val)); }

inline v_int32x16 v_round(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(a.val))); }

inline v_int32x16 v_round(const v_float64x8& a, const v_float64x8& b)
{ return v_int32x16(_v512_combine(_mm512_cvtpd_epi32(a.val), _mm512_cvtpd_epi32(b.val))); }

inline v_int32x16 v_trunc(const v_float32x16& a)
{ return v_int32x16(_mm512_cvttps_epi32(a.val)); }

inline v_int32x16 v_trunc(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvttpd_epi32(a.val))); }

#if CVT_ROUND_MODES_IMPLEMENTED
inline v_int32x16 v_floor(const v_float32x16& a)
{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); }

inline v_int32x16 v_floor(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); }

inline v_int32x16 v_ceil(const v_float32x16& a)
{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }

inline v_int32x16 v_ceil(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); }
#else
inline v_int32x16 v_floor(const v_float32x16& a)
{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 1))); }

inline v_int32x16 v_floor(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 1)))); }

inline v_int32x16 v_ceil(const v_float32x16& a)
{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 2))); }

inline v_int32x16 v_ceil(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 2)))); }
#endif
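// Illustrative sketch (not part of the original header): v_round follows the
// current rounding mode (round-to-nearest-even by default), while v_floor,
// v_ceil and v_trunc force the direction. A hypothetical conversion helper:
//
//   inline void example_round16(int* dst, const float* src)
//   {
//       v_store(dst, v_round(v512_load(src)));   // 16 floats -> 16 ints
//   }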
inline v_float32x16 v_cvt_f32(const v_int32x16& a)
{ return v_float32x16(_mm512_cvtepi32_ps(a.val)); }

inline v_float32x16 v_cvt_f32(const v_float64x8& a)
{ return v_float32x16(_mm512_cvtpd_pslo(a.val)); }

inline v_float32x16 v_cvt_f32(const v_float64x8& a, const v_float64x8& b)
{ return v_float32x16(_v512_combine(_mm512_cvtpd_ps(a.val), _mm512_cvtpd_ps(b.val))); }

inline v_float64x8 v_cvt_f64(const v_int32x16& a)
{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_low(a.val))); }

inline v_float64x8 v_cvt_f64_high(const v_int32x16& a)
{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_high(a.val))); }

inline v_float64x8 v_cvt_f64(const v_float32x16& a)
{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_low(a.val))); }

inline v_float64x8 v_cvt_f64_high(const v_float32x16& a)
{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); }

inline v_float64x8 v_cvt_f64(const v_int64x8& v)
{
#if CV_AVX_512DQ
    return v_float64x8(_mm512_cvtepi64_pd(v.val));
#else
    // constants encoded as doubles
    __m512i magic_i_lo   = _mm512_set1_epi64(0x4330000000000000); // 2^52
    __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63
    __m512i magic_i_all  = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52
    __m512d magic_d_all  = _mm512_castsi512_pd(magic_i_all);

    // blend the 32 low bits of v with the low magic constant
    __m512i v_lo = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val);
    // extract the 32 high bits of v
    __m512i v_hi = _mm512_srli_epi64(v.val, 32);
    // flip the msb of v_hi and blend in the high magic constant
    v_hi = _mm512_xor_si512(v_hi, magic_i_hi32);
    // compute in double precision and recombine
    __m512d v_hi_dbl = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all);
    __m512d result = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo));
    return v_float64x8(result);
#endif
}
inline v_int8x64 v512_lut(const schar* tab, const int* idx)
{
    __m128i p0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 1));
    __m128i p1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
    __m128i p2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 2), (const int *)tab, 1));
    __m128i p3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 3), (const int *)tab, 1));
    return v_int8x64(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(p0), p1, 1), p2, 2), p3, 3));
}
inline v_int8x64 v512_lut_pairs(const schar* tab, const int* idx)
{
    __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 1));
    __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
    return v_int8x64(_v512_combine(p0, p1));
}
inline v_int8x64 v512_lut_quads(const schar* tab, const int* idx)
{
    return v_int8x64(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 1));
}
inline v_uint8x64 v512_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut((const schar *)tab, idx)); }
inline v_uint8x64 v512_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x64 v512_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_quads((const schar *)tab, idx)); }
inline v_int16x32 v512_lut(const short* tab, const int* idx)
{
    __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 2));
    __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 2));
    return v_int16x32(_v512_combine(p0, p1));
}
inline v_int16x32 v512_lut_pairs(const short* tab, const int* idx)
{
    return v_int16x32(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 2));
}
inline v_int16x32 v512_lut_quads(const short* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 2));
#else
    return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 2));
#endif
}
inline v_uint16x32 v512_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut((const short *)tab, idx)); }
inline v_uint16x32 v512_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_pairs((const short *)tab, idx)); }
inline v_uint16x32 v512_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_quads((const short *)tab, idx)); }
inline v_int32x16 v512_lut(const int* tab, const int* idx)
{
    return v_int32x16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
}
inline v_int32x16 v512_lut_pairs(const int* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 4));
#else
    return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 4));
#endif
}
inline v_int32x16 v512_lut_quads(const int* tab, const int* idx)
{
    return v_int32x16(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
                          _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
                          _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
                          _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
                          _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
}
inline v_uint32x16 v512_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut((const int *)tab, idx)); }
inline v_uint32x16 v512_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_pairs((const int *)tab, idx)); }
inline v_uint32x16 v512_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_quads((const int *)tab, idx)); }
inline v_int64x8 v512_lut(const int64* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 8));
#else
    return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), tab , 8));
#endif
}
inline v_int64x8 v512_lut_pairs(const int64* tab, const int* idx)
{
    return v_int64x8(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
                         _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
                         _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
                         _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
                         _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
}
inline v_uint64x8 v512_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut((const int64 *)tab, idx)); }
inline v_uint64x8 v512_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut_pairs((const int64 *)tab, idx)); }
inline v_float32x16 v512_lut(const float* tab, const int* idx)
{
    return v_float32x16(_mm512_i32gather_ps(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
}
inline v_float32x16 v512_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_pairs((const int *)tab, idx)); }
inline v_float32x16 v512_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_quads((const int *)tab, idx)); }

inline v_float64x8 v512_lut(const double* tab, const int* idx)
{
    return v_float64x8(_mm512_i32gather_pd(_mm256_loadu_si256((const __m256i*)idx), tab, 8));
}
inline v_float64x8 v512_lut_pairs(const double* tab, const int* idx)
{
    return v_float64x8(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_castpd128_pd512(
                           _mm_loadu_pd(tab + idx[0])),
                           _mm_loadu_pd(tab + idx[1]), 1),
                           _mm_loadu_pd(tab + idx[2]), 2),
                           _mm_loadu_pd(tab + idx[3]), 3));
}
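// Illustrative sketch (not part of the original header): the v512_lut family
// gathers single elements, pairs or quads from a table at integer indices.
// Hypothetical example gathering 16 floats:
//
//   inline v_float32x16 example_gather16(const float* tab, const int* idx16)
//   {
//       return v512_lut(tab, idx16);   // tab[idx16[0]], ..., tab[idx16[15]]
//   }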
inline v_int32x16 v_lut(const int* tab, const v_int32x16& idxvec)
{
    return v_int32x16(_mm512_i32gather_epi32(idxvec.val, tab, 4));
}

inline v_uint32x16 v_lut(const unsigned* tab, const v_int32x16& idxvec)
{
    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
}

inline v_float32x16 v_lut(const float* tab, const v_int32x16& idxvec)
{
    return v_float32x16(_mm512_i32gather_ps(idxvec.val, tab, 4));
}

inline v_float64x8 v_lut(const double* tab, const v_int32x16& idxvec)
{
    return v_float64x8(_mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8));
}

inline void v_lut_deinterleave(const float* tab, const v_int32x16& idxvec, v_float32x16& x, v_float32x16& y)
{
    x.val = _mm512_i32gather_ps(idxvec.val, tab, 4);
    y.val = _mm512_i32gather_ps(idxvec.val, &tab[1], 4);
}

inline void v_lut_deinterleave(const double* tab, const v_int32x16& idxvec, v_float64x8& x, v_float64x8& y)
{
    x.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8);
    y.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), &tab[1], 8);
}
inline v_int8x64 v_interleave_pairs(const v_int8x64& vec)
{
    return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0d0e0c, 0x0b090a08, 0x07050604, 0x03010200)));
}
inline v_uint8x64 v_interleave_pairs(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x64 v_interleave_quads(const v_int8x64& vec)
{
    return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0b0e0a, 0x0d090c08, 0x07030602, 0x05010400)));
}
inline v_uint8x64 v_interleave_quads(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }

inline v_int16x32 v_interleave_pairs(const v_int16x32& vec)
{
    return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100)));
}
inline v_uint16x32 v_interleave_pairs(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x32 v_interleave_quads(const v_int16x32& vec)
{
    return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0706, 0x0d0c0504, 0x0b0a0302, 0x09080100)));
}
inline v_uint16x32 v_interleave_quads(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }

inline v_int32x16 v_interleave_pairs(const v_int32x16& vec)
{
    return v_int32x16(_mm512_shuffle_epi32(vec.val, _MM_PERM_ACBD));
}
inline v_uint32x16 v_interleave_pairs(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x16 v_interleave_pairs(const v_float32x16& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_int8x64 v_pack_triplets(const v_int8x64& vec)
{
    return v_int8x64(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
                                                              0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000),
                     _mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0xffffff0f, 0x0e0d0c0a, 0x09080605, 0x04020100))));
}
inline v_uint8x64 v_pack_triplets(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x32 v_pack_triplets(const v_int16x32& vec)
{
    return v_int16x32(_mm512_permutexvar_epi16(_v512_set_epu64(0x001f001f001f001f, 0x001f001f001f001f, 0x001e001d001c001a, 0x0019001800160015,
                                                               0x0014001200110010, 0x000e000d000c000a, 0x0009000800060005, 0x0004000200010000), vec.val));
}
inline v_uint16x32 v_pack_triplets(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }

inline v_int32x16 v_pack_triplets(const v_int32x16& vec)
{
    return v_int32x16(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
                                                               0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
}
inline v_uint32x16 v_pack_triplets(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
{
    return v_float32x16(_mm512_permutexvar_ps(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
                                                              0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
}
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
{ return v_dotprod(a, b) + c; }

inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
{
    __m512i even = _mm512_mul_epi32(a.val, b.val);
    __m512i odd = _mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32));
    return v_int64x8(_mm512_add_epi64(even, odd));
}
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
{ return v_dotprod(a, b) + c; }
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
{
    __m512i even_a = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, _mm512_setzero_si512());
    __m512i odd_a  = _mm512_srli_epi16(a.val, 8);

    __m512i even_b = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, _mm512_setzero_si512());
    __m512i odd_b  = _mm512_srli_epi16(b.val, 8);

    __m512i prod0  = _mm512_madd_epi16(even_a, even_b);
    __m512i prod1  = _mm512_madd_epi16(odd_a, odd_b);
    return v_uint32x16(_mm512_add_epi32(prod0, prod1));
}
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
{
    __m512i even_a = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8);
    __m512i odd_a  = _mm512_srai_epi16(a.val, 8);

    __m512i even_b = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8);
    __m512i odd_b  = _mm512_srai_epi16(b.val, 8);

    __m512i prod0  = _mm512_madd_epi16(even_a, even_b);
    __m512i prod1  = _mm512_madd_epi16(odd_a, odd_b);
    return v_int32x16(_mm512_add_epi32(prod0, prod1));
}
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
{
    __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
    __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
    __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
    __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);

    __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
    __m512i p13 = _mm512_srli_epi64(mul0, 32);
    __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
    __m512i p57 = _mm512_srli_epi64(mul1, 32);

    __m512i p15_ = _mm512_add_epi64(p02, p13);
    __m512i p9d_ = _mm512_add_epi64(p46, p57);

    return v_uint64x8(_mm512_add_epi64(
        _mm512_unpacklo_epi64(p15_, p9d_),
        _mm512_unpackhi_epi64(p15_, p9d_)
    ));
}
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
{
    __m512i prod = _mm512_madd_epi16(a.val, b.val);
    __m512i even = _mm512_srai_epi64(_mm512_bslli_epi128(prod, 4), 32);
    __m512i odd  = _mm512_srai_epi64(prod, 32);
    return v_int64x8(_mm512_add_epi64(even, odd));
}
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b)
{ return v_dotprod(a, b); }
inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
{ return v_dotprod(a, b, c); }

inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b)
{ return v_dotprod(a, b); }
inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
{ return v_dotprod(a, b, c); }

inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
{ return v_dotprod_expand(a, b, c); }
inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
{ return v_dotprod_expand(a, b, c); }

inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b)
{
    __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
    __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
    __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
    __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);

    __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
    __m512i p13 = _mm512_srli_epi64(mul0, 32);
    __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
    __m512i p57 = _mm512_srli_epi64(mul1, 32);

    __m512i p15_ = _mm512_add_epi64(p02, p13);
    __m512i p9d_ = _mm512_add_epi64(p46, p57);
    return v_uint64x8(_mm512_add_epi64(p15_, p9d_));
}
inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
{ return v_dotprod_expand_fast(a, b) + c; }

inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
{ return v_dotprod_expand(a, b, c); }

inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
{ return v_dotprod_expand(a, b, c); }
#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
    v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))

inline v_float32x16 v_matmul(const v_float32x16& v,
                             const v_float32x16& m0, const v_float32x16& m1,
                             const v_float32x16& m2, const v_float32x16& m3)
{
    v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
    v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
    v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
    v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3);
    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
}

inline v_float32x16 v_matmuladd(const v_float32x16& v,
                                const v_float32x16& m0, const v_float32x16& m1,
                                const v_float32x16& m2, const v_float32x16& a)
{
    v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
    v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
    v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
}
#define OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                               const _Tpvec& a2, const _Tpvec& a3, \
                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
    { \
        __m512i t0 = cast_from(_mm512_unpacklo_##suffix(a0.val, a1.val)); \
        __m512i t1 = cast_from(_mm512_unpacklo_##suffix(a2.val, a3.val)); \
        __m512i t2 = cast_from(_mm512_unpackhi_##suffix(a0.val, a1.val)); \
        __m512i t3 = cast_from(_mm512_unpackhi_##suffix(a2.val, a3.val)); \
        b0.val = cast_to(_mm512_unpacklo_epi64(t0, t1)); \
        b1.val = cast_to(_mm512_unpackhi_epi64(t0, t1)); \
        b2.val = cast_to(_mm512_unpacklo_epi64(t2, t3)); \
        b3.val = cast_to(_mm512_unpackhi_epi64(t2, t3)); \
    }

OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_uint32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_int32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_float32x16, ps, _mm512_castps_si512, _mm512_castsi512_ps)
#define OPENCV_HAL_IMPL_AVX512_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
    { \
        b0.val = intrin(_v512_extract_low(a.val)); \
        b1.val = intrin(_v512_extract_high(a.val)); \
    } \
    inline _Tpwvec v_expand_low(const _Tpvec& a) \
    { return _Tpwvec(intrin(_v512_extract_low(a.val))); } \
    inline _Tpwvec v_expand_high(const _Tpvec& a) \
    { return _Tpwvec(intrin(_v512_extract_high(a.val))); } \
    inline _Tpwvec v512_load_expand(const _Tp* ptr) \
    { \
        __m256i a = _mm256_loadu_si256((const __m256i*)ptr); \
        return _Tpwvec(intrin(a)); \
    }

OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint8x64, v_uint16x32, uchar, _mm512_cvtepu8_epi16)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_int8x64, v_int16x32, schar, _mm512_cvtepi8_epi16)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint16x32, v_uint32x16, ushort, _mm512_cvtepu16_epi32)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_int16x32, v_int32x16, short, _mm512_cvtepi16_epi32)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint32x16, v_uint64x8, unsigned, _mm512_cvtepu32_epi64)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_int32x16, v_int64x8, int, _mm512_cvtepi32_epi64)

#define OPENCV_HAL_IMPL_AVX512_EXPAND_Q(_Tpvec, _Tp, intrin) \
    inline _Tpvec v512_load_expand_q(const _Tp* ptr) \
    { \
        __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
        return _Tpvec(intrin(a)); \
    }

OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_uint32x16, uchar, _mm512_cvtepu8_epi32)
OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16, schar, _mm512_cvtepi8_epi32)
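// Illustrative sketch (not part of the original header): v_expand widens one
// register into two, while v512_load_expand loads the narrow elements and
// widens them in one step. Hypothetical example:
//
//   inline void example_expand_u8(const uchar* p, v_uint16x32& lo, v_uint16x32& hi)
//   {
//       v_expand(v512_load(p), lo, hi);   // 64 uchars -> 2 x 32 ushorts
//   }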
inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b)
{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }

inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b)
{
    const __m512i t = _mm512_set1_epi16(255);
    return v_uint8x64(_v512_combine(_mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, t)), _mm512_cvtepi16_epi8(_mm512_min_epu16(b.val, t))));
}

inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b)
{
    return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val)));
}

inline void v_pack_store(schar* ptr, const v_int16x32& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(uchar* ptr, const v_uint16x32& a)
{
    const __m512i m = _mm512_set1_epi16(255);
    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, m)));
}

inline void v_pack_u_store(uchar* ptr, const v_int16x32& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
template<int n> inline
v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
{
    v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
    return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
                    v_reinterpret_as_s16((b + delta) >> n));
}

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
{
    v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
    v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
}

template<int n> inline
v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    return v_pack_u((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    v_pack_u_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    v_pack_store(ptr, (a + delta) >> n);
}
inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b)
{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); }

inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
{
    const __m512i m = _mm512_set1_epi32(65535);
    return v_uint16x32(_v512_combine(_mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)), _mm512_cvtepi32_epi16(_mm512_min_epu32(b.val, m))));
}

inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b)
{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }

inline void v_pack_store(short* ptr, const v_int32x16& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(ushort* ptr, const v_uint32x16& a)
{
    const __m512i m = _mm512_set1_epi32(65535);
    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)));
}

inline void v_pack_u_store(ushort* ptr, const v_int32x16& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
template<int n> inline
v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
{
    v_uint32x16 delta = v512_setall_u32(1 << (n-1));
    return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
                    v_reinterpret_as_s32((b + delta) >> n));
}

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
{
    v_uint32x16 delta = v512_setall_u32(1 << (n-1));
    v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
}

template<int n> inline
v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    return v_pack_u((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    v_pack_u_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x16& a)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}
inline v_uint32x16 v_pack(const v_uint64x8& a, const v_uint64x8& b)
{ return v_uint32x16(_v512_combine(_mm512_cvtepi64_epi32(a.val), _mm512_cvtepi64_epi32(b.val))); }

inline v_int32x16 v_pack(const v_int64x8& a, const v_int64x8& b)
{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }

inline void v_pack_store(unsigned* ptr, const v_uint64x8& a)
{ _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi64_epi32(a.val)); }

inline void v_pack_store(int* ptr, const v_int64x8& b)
{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
template<int n> inline
v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
{
    v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
{
    v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
{
    v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x8& a)
{
    v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}
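// Illustrative sketch (not part of the original header): v_rshr_pack<n> adds
// the rounding constant 1 << (n-1), shifts right by n and saturates to the
// narrower type -- the usual fixed-point downscale. Hypothetical use:
//
//   inline v_uint8x64 example_downscale(const v_uint16x32& a, const v_uint16x32& b)
//   {
//       return v_rshr_pack<8>(a, b);   // (x + 128) >> 8, saturated to uchar
//   }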
inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b)
{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }

inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
                           const v_uint32x16& c, const v_uint32x16& d)
{
    __m512i ab = _mm512_packs_epi32(a.val, b.val);
    __m512i cd = _mm512_packs_epi32(c.val, d.val);

    return v_uint8x64(_mm512_permutexvar_epi32(_v512_set_epu32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), _mm512_packs_epi16(ab, cd)));
}

inline v_uint8x64 v_pack_b(const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
                           const v_uint64x8& d, const v_uint64x8& e, const v_uint64x8& f,
                           const v_uint64x8& g, const v_uint64x8& h)
{
    __m512i ab = _mm512_packs_epi32(a.val, b.val);
    __m512i cd = _mm512_packs_epi32(c.val, d.val);
    __m512i ef = _mm512_packs_epi32(e.val, f.val);
    __m512i gh = _mm512_packs_epi32(g.val, h.val);

    __m512i abcd = _mm512_packs_epi32(ab, cd);
    __m512i efgh = _mm512_packs_epi32(ef, gh);

    return v_uint8x64(_mm512_permutexvar_epi16(_v512_set_epu16(31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4,
                                               27, 19, 11, 3, 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0), _mm512_packs_epi16(abcd, efgh)));
}
#define OPENCV_HAL_IMPL_AVX512_EXTRACT(_Tpvec) \
    template<int s> \
    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
    { return v_rotate_right<s>(a, b); }

OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint8x64)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int8x64)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint16x32)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int16x32)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint32x16)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int32x16)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint64x8)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8)
#define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \
    template<int i> inline _Tp v_extract_n(_Tpvec v) { return v_rotate_right<i>(v).get0(); }

OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double)
template<int i>
inline v_uint32x16 v_broadcast_element(const v_uint32x16& a)
{
    static const __m512i perm = _mm512_set1_epi32((char)i);
    return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
}

template<int i>
inline v_int32x16 v_broadcast_element(const v_int32x16& a)
{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }

template<int i>
inline v_float32x16 v_broadcast_element(const v_float32x16& a)
{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
#if CV_AVX_512VBMI
    __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
                                    94,  92,  90,  88,  86,  84,  82,  80,  78,  76,  74,  72,  70,  68, 66, 64,
                                    62,  60,  58,  56,  54,  52,  50,  48,  46,  44,  42,  40,  38,  36, 34, 32,
                                    30,  28,  26,  24,  22,  20,  18,  16,  14,  12,  10,   8,   6,   4,  2,  0);
    __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
                                    95,  93,  91,  89,  87,  85,  83,  81,  79,  77,  75,  73,  71,  69, 67, 65,
                                    63,  61,  59,  57,  55,  53,  51,  49,  47,  45,  43,  41,  39,  37, 35, 33,
                                    31,  29,  27,  25,  23,  21,  19,  17,  15,  13,  11,   9,   7,   5,  3,  1);
    a = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask0, ab1));
    b = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask1, ab1));
#else
    __m512i mask0 = _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200);
    __m512i a0b0 = _mm512_shuffle_epi8(ab0, mask0);
    __m512i a1b1 = _mm512_shuffle_epi8(ab1, mask0);
    __m512i mask1 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask2 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
    a = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask1, a1b1));
    b = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask2, a1b1));
#endif
}
inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10,  8,  6,  4,  2,  0);
    __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11,  9,  7,  5,  3,  1);
    a = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask0, ab1));
    b = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask1, ab1));
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
    a = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask0, ab1));
    b = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask1, ab1));
}
inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
    __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
    a = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask0, ab1));
    b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1));
}
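// Illustrative sketch (not part of the original header): the two-channel
// v_load_deinterleave overloads above split interleaved pairs (for instance a
// UV plane) into separate registers. Hypothetical example for 64 uchar pairs:
//
//   inline void example_split_uv(const uchar* uv, v_uint8x64& u, v_uint8x64& v)
//   {
//       v_load_deinterleave(uv, u, v);   // uv = u0 v0 u1 v1 ...
//   }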
inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));

#if CV_AVX_512VBMI2
    __m512i mask0 = _v512_set_epu8(126, 123, 120, 117, 114, 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81,
                                    78,  75,  72,  69,  66,  63,  60,  57,  54, 51, 48, 45, 42, 39, 36, 33,
                                    30,  27,  24,  21,  18,  15,  12,   9,   6,  3,  0, 62, 59, 56, 53, 50,
                                    47,  44,  41,  38,  35,  32,  29,  26,  23, 20, 17, 14, 11,  8,  5,  2);
    __m512i r0b01 = _mm512_permutex2var_epi8(bgr0, mask0, bgr1);
    __m512i b1g12 = _mm512_permutex2var_epi8(bgr1, mask0, bgr2);
    __m512i r12b2 = _mm512_permutex2var_epi8(bgr1,
                    _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101,  98,  95,  92,  89,  86,  83,  80,
                                    77,  74,  71,  68,  65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100,  97,
                                    94,  91,  88,  85,  82,  79,  76,  73,  70,  67,  64,  61,  58,  55,  52,  49,
                                    46,  43,  40,  37,  34,  31,  28,  25,  22,  19,  16,  13,  10,   7,   4,   1), bgr2);
    a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01));
    b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0));
    c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2));
#elif CV_AVX_512VBMI
    __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0);
    __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1);
    __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2);
    a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
                                                                    77,  74,  71,  68,  65,  63,  61,  60,  58, 57, 55, 54, 52, 51, 49, 48,
                                                                    46,  45,  43,  42,  40,  39,  37,  36,  34, 33, 31, 30, 28, 27, 25, 24,
                                                                    23,  21,  20,  18,  17,  15,  14,  12,  11,  9,  8,  6,  5,  3,  2,  0), bgr2));
    b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63,  61,  60,  58,  57,  55,  54,  52,  51,  49,  48,  46,  45,  43,  42,  40,
                                                                    39,  37,  36,  34,  33,  31,  30,  28,  27,  25,  24,  23,  21,  20,  18,  17,
                                                                    15,  14,  12,  11,   9,   8,   6,   5,   3,   2,   0, 126, 123, 120, 117, 114,
                                                                   111, 108, 105, 102,  99,  96,  93,  90,  87,  84,  81,  78,  75,  72,  69,  66), bgr0));
    c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63,  60,  57,  54,  51,  48,  45,  42,  39,  36,  33,  30,  27,  24,  21,  18,
                                                                    15,  12,   9,   6,   3,   0, 125, 122, 119, 116, 113, 110, 107, 104, 101,  98,
                                                                    95,  92,  89,  86,  83,  80,  77,  74,  71,  68,  65,  62,  59,  56,  53,  50,
                                                                    47,  44,  41,  38,  35,  32,  29,  26,  23,  20,  17,  14,  11,   8,   5,   2), bgr1));
#else
    __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
                                    45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12,  9,  6,  3,  0);
    __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
    __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
    __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);

    __m512i b0g0 = _mm512_mask_blend_epi32(0xf800, b01g1, r12b2);
    __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
                                                                   14, 11,  8,  5,  2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0);
    __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11);
    a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1));
    c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1));
    b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)));
#endif
}
inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));

    __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
                                    45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12,  9,  6,  3,  0);
    __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
    __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
    __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);

    a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2));
    b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
                                                                    14, 11,  8,  5,  2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0));
    c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11));
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));

    __m512i mask0 = _v512_set_epu32(29, 26, 23, 20, 17, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
    __m512i b01r1 = _mm512_permutex2var_epi32(bgr0, mask0, bgr1);
    __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2);
    __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0);

    a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2));
    b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11));
    c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0));
}
inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));

    __m512i mask0 = _v512_set_epu64(13, 10, 15, 12, 9, 6, 3, 0);
    __m512i b01g1 = _mm512_permutex2var_epi64(bgr0, mask0, bgr1);
    __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2);
    __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0);

    a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2));
    c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6));
    b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0));
}
inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 192));

#if CV_AVX_512VBMI
    __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
                                    94,  92,  90,  88,  86,  84,  82,  80,  78,  76,  74,  72,  70,  68, 66, 64,
                                    62,  60,  58,  56,  54,  52,  50,  48,  46,  44,  42,  40,  38,  36, 34, 32,
                                    30,  28,  26,  24,  22,  20,  18,  16,  14,  12,  10,   8,   6,   4,  2,  0);
    __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
                                    95,  93,  91,  89,  87,  85,  83,  81,  79,  77,  75,  73,  71,  69, 67, 65,
                                    63,  61,  59,  57,  55,  53,  51,  49,  47,  45,  43,  41,  39,  37, 35, 33,
                                    31,  29,  27,  25,  23,  21,  19,  17,  15,  13,  11,   9,   7,   5,  3,  1);

    __m512i br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi8(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3);

    a = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23));
    c = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23));
    b = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23));
    d = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23));
#else
    __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
    __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask);
    __m512i b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask);
    __m512i b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask);
    __m512i b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask);

    __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);

    __m512i br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1);
    __m512i ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1);
    __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3);
    __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3);

    a = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23));
    c = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23));
    b = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23));
    d = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23));
#endif
}
inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 96));

    __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10,  8,  6,  4,  2,  0);
    __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11,  9,  7,  5,  3,  1);

    __m512i br01 = _mm512_permutex2var_epi16(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi16(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3);

    a = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23));
    c = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23));
    b = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23));
    d = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23));
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 48));

    __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);

    __m512i br01 = _mm512_permutex2var_epi32(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi32(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3);

    a = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23));
    c = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23));
    b = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23));
    d = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23));
}
inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 24));

    __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);

    __m512i br01 = _mm512_permutex2var_epi64(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi64(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3);

    a = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23));
    c = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23));
    b = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23));
    d = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23));
}
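
// Illustrative usage (editor's sketch, not part of the universal intrinsics API): split one
// 512-bit block of interleaved BGRA bytes into channel planes. The pointer names and the
// assumption that 256 interleaved / 64 planar bytes are accessible are for this example only.
inline void v512_example_split_bgra(const uchar* bgra, uchar* bp, uchar* gp, uchar* rp, uchar* ap)
{
    v_uint8x64 b, g, r, a;
    v_load_deinterleave(bgra, b, g, r, a);   // reads 256 bytes laid out as B,G,R,A,B,G,R,A,...
    v_store(bp, b);                          // each plane receives 64 consecutive channel bytes
    v_store(gp, g);
    v_store(rp, r);
    v_store(ap, a);
}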
inline void v_store_interleave( uchar* ptr, const v_uint8x64& x, const v_uint8x64& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint8x64 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 64), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 64), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 64), high.val);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x32& x, const v_uint16x32& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint16x32 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 32), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 32), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 32), high.val);
    }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x16& x, const v_uint32x16& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint32x16 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 16), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 16), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 16), high.val);
    }
}
inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64x8& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint64x8 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 8), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 8), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 8), high.val);
    }
}
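
// Illustrative usage (editor's sketch): interleave two planes of 32 ushort values into one
// x,y,x,y,... buffer of 64 elements, requesting the non-temporal path; this assumes the
// destination is 64-byte aligned and will not be re-read soon. Pointer names are hypothetical.
inline void v512_example_interleave_xy(const ushort* xs, const ushort* ys, ushort* dst)
{
    v_uint16x32 x = v512_load(xs);
    v_uint16x32 y = v512_load(ys);
    v_store_interleave(dst, x, y, hal::STORE_ALIGNED_NOCACHE);  // streams 128 bytes past the cache
}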
inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
#if CV_AVX_512VBMI
    // Single-pass byte interleave using VBMI two-source byte permutes (vpermt2b).
    __m512i mask0 = _v512_set_epu8(127, 84, 20, 126, 83, 19, 125, 82, 18, 124, 81, 17, 123, 80, 16, 122,
                                    79, 15, 121, 78, 14, 120, 77, 13, 119, 76, 12, 118, 75, 11, 117, 74,
                                    10, 116, 73,  9, 115, 72,  8, 114, 71,  7, 113, 70,  6, 112, 69,  5,
                                   111, 68,  4, 110, 67,  3, 109, 66,  2, 108, 65,  1, 107, 64,  0, 106);
    __m512i mask1 = _v512_set_epu8( 21, 42, 105, 20, 41, 104, 19, 40, 103, 18, 39, 102, 17, 38, 101, 16,
                                    37, 100, 15, 36, 99, 14, 35, 98, 13, 34, 97, 12, 33, 96, 11, 32,
                                    95, 10, 31, 94,  9, 30, 93,  8, 29, 92,  7, 28, 91,  6, 27, 90,
                                     5, 26, 89,  4, 25, 88,  3, 24, 87,  2, 23, 86,  1, 22, 85,  0);
    __m512i mask2 = _v512_set_epu8(106, 127, 63, 105, 126, 62, 104, 125, 61, 103, 124, 60, 102, 123, 59, 101,
                                   122, 58, 100, 121, 57, 99, 120, 56, 98, 119, 55, 97, 118, 54, 96, 117,
                                    53, 95, 116, 52, 94, 115, 51, 93, 114, 50, 92, 113, 49, 91, 112, 48,
                                    90, 111, 47, 89, 110, 46, 88, 109, 45, 87, 108, 44, 86, 107, 43, 85);
    __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val);
    __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val);
    __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val);

    __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1);
    __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2);
    __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0);
#else
    // Fallback without VBMI: blend bytes pairwise, then finish with 16-bit cross-lane permutes.
    __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001));
    __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0);
    __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val);
    __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val);

    __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
                                     5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
    __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
                                    47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
    __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
                                    26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
    __m512i b0g0b2 = _mm512_permutex2var_epi16(b0g0, mask0, r0b1);
    __m512i r1b1r0 = _mm512_permutex2var_epi16(b0g0, mask1, g1r1);
    __m512i g2r2g1 = _mm512_permutex2var_epi16(r0b1, mask2, g1r1);

    __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
    __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
    __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
#endif

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 128), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 64), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 128), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 128), bgr2);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
                                     5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
    __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
                                    47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
    __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
                                    26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
    __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val);
    __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val);
    __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val);

    __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
    __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
    __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 32), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 64), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgr2);
    }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21);
    __m512i mask1 = _v512_set_epu32(31, 10, 25, 30, 9, 24, 29, 8, 23, 28, 7, 22, 27, 6, 21, 26);
    __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val);
    __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val);

    __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val);
    __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2);
    __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 16), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 32), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgr2);
    }
}
inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m512i mask0 = _v512_set_epu64( 5, 12, 7, 4, 11, 6, 3, 10);
    __m512i mask1 = _v512_set_epu64(15, 7, 4, 14, 6, 3, 13, 5);
    __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val);
    __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val);

    __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val);
    __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2);
    __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 8), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 8), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 16), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 8), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgr2);
    }
}
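
// Illustrative usage (editor's sketch): pack three 32-bit channel planes into an interleaved
// B,G,R,B,G,R,... buffer of 48 unsigned values, letting the default unaligned store path run.
// Pointer names are assumptions for the example.
inline void v512_example_pack_bgr32(const unsigned* bp, const unsigned* gp, const unsigned* rp, unsigned* bgr)
{
    v_store_interleave(bgr, v512_load(bp), v512_load(gp), v512_load(rp));
}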
inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b,
                                const v_uint8x64& c, const v_uint8x64& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint8x64 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint8x64 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 128), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 192), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 64), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 128), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 192), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 128), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 192), bgra3.val);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b,
                                const v_uint16x32& c, const v_uint16x32& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint16x32 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint16x32 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 96), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 32), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 64), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 96), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 96), bgra3.val);
    }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b,
                                const v_uint32x16& c, const v_uint32x16& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint32x16 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint32x16 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 48), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 16), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 32), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 48), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 48), bgra3.val);
    }
}
inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b,
                                const v_uint64x8& c, const v_uint64x8& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint64x8 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint64x8 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 8), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 24), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 8), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 16), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 24), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 8), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 24), bgra3.val);
    }
}
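
// Illustrative note (editor's sketch): the 4-channel stores above are just two v_zip passes,
// so a deinterleave followed by an interleave of the same registers is a byte-exact round
// trip; src and dst are assumed to cover 256 accessible bytes each.
inline void v512_example_copy_bgra(const uchar* src, uchar* dst)
{
    v_uint8x64 b, g, r, a;
    v_load_deinterleave(src, b, g, r, a);
    v_store_interleave(dst, b, g, r, a);   // reproduces the original interleaved layout
}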
#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
    _Tpvec1 a1, b1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
    _Tpvec1 a1, b1, c1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
    _Tpvec1 a1, b1, c1, d1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, const _Tpvec0& d0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}

OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int8x64, schar, s8, v_uint8x64, uchar, u8)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int16x32, short, s16, v_uint16x32, ushort, u16)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int32x16, int, s32, v_uint32x16, unsigned, u32)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float32x16, float, f32, v_uint32x16, unsigned, u32)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int64x8, int64, s64, v_uint64x8, uint64, u64)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8, uint64, u64)
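
// Illustrative usage (editor's sketch): the wrapper macro above routes signed and floating
// element types through the unsigned kernels via v_reinterpret_as_*, so the signed call below
// compiles exactly like its uchar counterpart. Pointer and function names are hypothetical.
inline void v512_example_split_s16(const short* interleaved, v_int16x32& x, v_int16x32& y)
{
    v_load_deinterleave(interleaved, x, y);  // expands from OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE
}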
inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_movepi8_mask(a.val); }
inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
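
// Illustrative usage (editor's sketch): v_signmask packs one bit per lane (the sign bit), so
// counting negative lanes of a v_int32x16 reduces to a population count of a 16-bit mask.
inline int v512_example_count_negative(const v_int32x16& a)
{
    return _mm_popcnt_u32((unsigned)v_signmask(a));  // at most 16 bits can be set
}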
inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); }
inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int32x16& a) { return (bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline bool v_check_all(const v_int64x8& a) { return !(bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int64x8& a) { return (bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
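
// Illustrative usage (editor's sketch): the checks read only the per-lane sign bit, which is
// how universal-intrinsic comparison results encode "true", so they can also be applied
// directly to raw signed lanes as done here. Function names are hypothetical.
inline bool v512_example_all_negative(const v_int16x32& a) { return v_check_all(a); }
inline bool v512_example_any_negative(const v_int16x32& a) { return v_check_any(a); }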
inline int v_scan_forward(const v_int8x64& a)
{
    int64 mask = _mm512_movepi8_mask(a.val);
    int mask32 = (int)mask;
    return mask != 0 ? mask32 != 0 ? trailingZeros32(mask32) : 32 + trailingZeros32((int)(mask >> 32)) : 0;
}
inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
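
// Illustrative usage (editor's sketch): v_scan_forward reports the index of the first lane
// whose sign bit is set; guard it with v_check_any when the mask may be empty.
inline int v512_example_first_negative_lane(const v_int32x16& a)
{
    return v_check_any(a) ? v_scan_forward(a) : -1;  // -1 when no lane is negative
}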
inline void v512_cleanup() { _mm256_zeroall(); }

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END