OpenCV 4.5.3 (Japanese machine translation)
intrin_avx.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
4
5 #ifndef OPENCV_HAL_INTRIN_AVX_HPP
6 #define OPENCV_HAL_INTRIN_AVX_HPP
7
8 #define CV_SIMD256 1
9 #define CV_SIMD256_64F 1
10 #define CV_SIMD256_FP16 0 // no native operations with FP16 type. Only load/store from float32x8 are available (if CV_FP16 == 1)
11
12 namespace cv
13{
14
16
17CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
18
20
21 inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
22{ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); }
23
24 inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
25{ return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); }
26
27 inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
28{ return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); }
29
30 inline int _v_cvtsi256_si32(const __m256i& a)
31{ return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); }
32
33 inline __m256i _v256_shuffle_odd_64(const __m256i& v)
34{ return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); }
35
36 inline __m256d _v256_shuffle_odd_64(const __m256d& v)
37{ return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); }
38
39 template<int imm>
40 inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
41{ return _mm256_permute2x128_si256(a, b, imm); }
42
43 template<int imm>
44 inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
45{ return _mm256_permute2f128_ps(a, b, imm); }
46
47 template<int imm>
48 inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
49{ return _mm256_permute2f128_pd(a, b, imm); }
50
51 template<int imm, typename _Tpvec>
52 inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
53{ return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
54
55 template<int imm>
56 inline __m256i _v256_permute4x64(const __m256i& a)
57{ return _mm256_permute4x64_epi64(a, imm); }
58
59 template<int imm>
60 inline __m256d _v256_permute4x64(const __m256d& a)
61{ return _mm256_permute4x64_pd(a, imm); }
62
63 template<int imm, typename _Tpvec>
64 inline _Tpvec v256_permute4x64(const _Tpvec& a)
65{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
66
67 inline __m128i _v256_extract_high(const __m256i& v)
68{ return _mm256_extracti128_si256(v, 1); }
69
70 inline __m128 _v256_extract_high(const __m256& v)
71{ return _mm256_extractf128_ps(v, 1); }
72
73 inline __m128d _v256_extract_high(const __m256d& v)
74{ return _mm256_extractf128_pd(v, 1); }
75
76 inline __m128i _v256_extract_low(const __m256i& v)
77{ return _mm256_castsi256_si128(v); }
78
79 inline __m128 _v256_extract_low(const __m256& v)
80{ return _mm256_castps256_ps128(v); }
81
82 inline __m128d _v256_extract_low(const __m256d& v)
83{ return _mm256_castpd256_pd128(v); }
84
85 inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
86{
87 const __m256i m = _mm256_set1_epi32(65535);
88 __m256i am = _mm256_min_epu32(a, m);
89 __m256i bm = _mm256_min_epu32(b, m);
90 return _mm256_packus_epi32(am, bm);
91}
92
93 template<int i>
94 inline int _v256_extract_epi8(const __m256i& a)
95{
96 #if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910 /*MSVS 2017*/ ))
97 return _mm256_extract_epi8(a, i);
98 #else
99 __m128i b = _mm256_extractf128_si256(a, ((i) >> 4));
100 return _mm_extract_epi8(b, i & 15); // SSE4.1
101 #endif
102}
103
104 template<int i>
105 inline int _v256_extract_epi16(const __m256i& a)
106{
107 #if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910 /*MSVS 2017*/ ))
108 return _mm256_extract_epi16(a, i);
109 #else
110 __m128i b = _mm256_extractf128_si256(a, ((i) >> 3));
111 return _mm_extract_epi16(b, i & 7); // SSE2
112 #endif
113}
114
115 template<int i>
116 inline int _v256_extract_epi32(const __m256i& a)
117{
118 #if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910 /*MSVS 2017*/ ))
119 return _mm256_extract_epi32(a, i);
120 #else
121 __m128i b = _mm256_extractf128_si256(a, ((i) >> 2));
122 return _mm_extract_epi32(b, i & 3); // SSE4.1
123 #endif
124}
125
126 template<int i>
127 inline int64 _v256_extract_epi64(const __m256i& a)
128{
129 #if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910 /*MSVS 2017*/ ))
130 return _mm256_extract_epi64(a, i);
131 #else
132 __m128i b = _mm256_extractf128_si256(a, ((i) >> 1));
133 return _mm_extract_epi64(b, i & 1); // SSE4.1
134 #endif
135}
136
138
139 struct v_uint8x32
140{
141 typedef uchar lane_type;
142 enum { nlanes = 32 };
143 __m256i val;
144
145 explicit v_uint8x32(__m256i v) : val(v) {}
146 v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3,
147 uchar v4, uchar v5, uchar v6, uchar v7,
148 uchar v8, uchar v9, uchar v10, uchar v11,
149 uchar v12, uchar v13, uchar v14, uchar v15,
150 uchar v16, uchar v17, uchar v18, uchar v19,
151 uchar v20, uchar v21, uchar v22, uchar v23,
152 uchar v24, uchar v25, uchar v26, uchar v27,
153 uchar v28, uchar v29, uchar v30, uchar v31)
154 {
155 val = _mm256_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
156 (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9,
157 (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
158 (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
159 (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
160 (char)v28, (char)v29, (char)v30, (char)v31);
161 }
162 /* coverity[uninit_ctor]: suppress warning */
163 v_uint8x32() {}
164
165 uchar get0() const { return (uchar)_v_cvtsi256_si32(val); }
166};
167
168 struct v_int8x32
169{
170 typedef schar lane_type;
171 enum { nlanes = 32 };
172 __m256i val;
173
174 explicit v_int8x32(__m256i v) : val(v) {}
175 v_int8x32(schar v0, schar v1, schar v2, schar v3,
176 schar v4, schar v5, schar v6, schar v7,
177 schar v8, schar v9, schar v10, schar v11,
178 schar v12, schar v13, schar v14, schar v15,
179 schar v16, schar v17, schar v18, schar v19,
180 schar v20, schar v21, schar v22, schar v23,
181 schar v24, schar v25, schar v26, schar v27,
182 schar v28, schar v29, schar v30, schar v31)
183 {
184 val = _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
185 v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
186 v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
187 }
188 /* coverity[uninit_ctor]: suppress warning */
189 v_int8x32() {}
190
191 schar get0() const { return (schar)_v_cvtsi256_si32(val); }
192};
193
194 struct v_uint16x16
195{
196 typedef ushort lane_type;
197 enum { nlanes = 16 };
198 __m256i val;
199
200 explicit v_uint16x16(__m256i v) : val(v) {}
201 v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3,
202 ushort v4, ushort v5, ushort v6, ushort v7,
203 ushort v8, ushort v9, ushort v10, ushort v11,
204 ushort v12, ushort v13, ushort v14, ushort v15)
205 {
206 val = _mm256_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
207 (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9,
208 (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
209 }
210 /* coverity[uninit_ctor]: suppress warning */
211 v_uint16x16() {}
212
213 ushort get0() const { return (ushort)_v_cvtsi256_si32(val); }
214};
215
216 struct v_int16x16
217{
218 typedef short lane_type;
219 enum { nlanes = 16 };
220 __m256i val;
221
222 explicit v_int16x16(__m256i v) : val(v) {}
223 v_int16x16(short v0, short v1, short v2, short v3,
224 short v4, short v5, short v6, short v7,
225 short v8, short v9, short v10, short v11,
226 short v12, short v13, short v14, short v15)
227 {
228 val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7,
229 v8, v9, v10, v11, v12, v13, v14, v15);
230 }
231 /* coverity[uninit_ctor]: suppress warning */
232 v_int16x16() {}
233
234 short get0() const { return (short)_v_cvtsi256_si32(val); }
235};
236
237 struct v_uint32x8
238{
239 typedef unsigned lane_type;
240 enum { nlanes = 8 };
241 __m256i val;
242
243 explicit v_uint32x8(__m256i v) : val(v) {}
244 v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
245 unsigned v4, unsigned v5, unsigned v6, unsigned v7)
246 {
247 val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2,
248 (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
249 }
250 /* coverity[uninit_ctor]: suppress warning */
251 v_uint32x8() {}
252
253 unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); }
254};
255
256 struct v_int32x8
257{
258 typedef int lane_type;
259 enum { nlanes = 8 };
260 __m256i val;
261
262 explicit v_int32x8(__m256i v) : val(v) {}
263 v_int32x8(int v0, int v1, int v2, int v3,
264 int v4, int v5, int v6, int v7)
265 {
266 val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
267 }
268 /* coverity[uninit_ctor]: suppress warning */
269 v_int32x8() {}
270
271 int get0() const { return _v_cvtsi256_si32(val); }
272};
273
274 struct v_float32x8
275{
276 typedef float lane_type;
277 enum { nlanes = 8 };
278 __m256 val;
279
280 explicit v_float32x8(__m256 v) : val(v) {}
281 v_float32x8(float v0, float v1, float v2, float v3,
282 float v4, float v5, float v6, float v7)
283 {
284 val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
285 }
286 /* coverity[uninit_ctor]: suppress warning */
287 v_float32x8() {}
288
289 float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); }
290};
291
292 struct v_uint64x4
293{
294 typedef uint64 lane_type;
295 enum { nlanes = 4 };
296 __m256i val;
297
298 explicit v_uint64x4(__m256i v) : val(v) {}
299 v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
300 { val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
301 /* coverity[uninit_ctor]: suppress warning */
302 v_uint64x4() {}
303
304 uint64 get0() const
305 {
306 #if defined __x86_64__ || defined _M_X64
307 return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
308 #else
309 int a = _mm_cvtsi128_si32(_mm256_castsi256_si128(val));
310 int b = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
311 return (unsigned)a | ((uint64)(unsigned)b << 32);
312 #endif
313 }
314};
315
316 struct v_int64x4
317{
318 typedef int64 lane_type;
319 enum { nlanes = 4 };
320 __m256i val;
321
322 explicit v_int64x4(__m256i v) : val(v) {}
323 v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
324 { val = _mm256_setr_epi64x(v0, v1, v2, v3); }
325 /* coverity[uninit_ctor]: suppress warning */
326 v_int64x4() {}
327
328 int64 get0() const
329 {
330 #if defined __x86_64__ || defined _M_X64
331 return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
332 #else
333 int a = _mm_cvtsi128_si32(_mm256_castsi256_si128(val));
334 int b = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
335 return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
336 #endif
337 }
338};
339
340 struct v_float64x4
341{
342 typedef double lane_type;
343 enum { nlanes = 4 };
344 __m256d val;
345
346 explicit v_float64x4(__m256d v) : val(v) {}
347 v_float64x4(double v0, double v1, double v2, double v3)
348 { val = _mm256_setr_pd(v0, v1, v2, v3); }
349 /* coverity[uninit_ctor]: suppress warning */
350 v_float64x4() {}
351
352 double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); }
353};
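// Usage sketch (illustrative, not part of the original header): each wrapper
// struct above is a thin value type around one AVX register. Constructor
// arguments follow memory order (v0 is the lowest lane), and get0() reads
// that lowest lane back.
inline float example_first_lane()
{
    v_float32x8 v(1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f);
    return v.get0();   // 1.f: lane 0 is the first constructor argument
}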
354
356
357 #define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp) \
358 inline _Tpvec v256_load(const _Tp* ptr) \
359 { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); } \
360 inline _Tpvec v256_load_aligned(const _Tp* ptr) \
361 { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); } \
362 inline _Tpvec v256_load_low(const _Tp* ptr) \
363 { \
364 __m128i v128 = _mm_loadu_si128((const __m128i*)ptr); \
365 return _Tpvec(_mm256_castsi128_si256(v128)); \
366 } \
367 inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
368 { \
369 __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0); \
370 __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1); \
371 return _Tpvec(_v256_combine(vlo, vhi)); \
372 } \
373 inline void v_store(_Tp* ptr, const _Tpvec& a) \
374 { _mm256_storeu_si256((__m256i*)ptr, a.val); } \
375 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
376 { _mm256_store_si256((__m256i*)ptr, a.val); } \
377 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
378 { _mm256_stream_si256((__m256i*)ptr, a.val); } \
379 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
380 { \
381 if( mode == hal::STORE_UNALIGNED ) \
382 _mm256_storeu_si256((__m256i*)ptr, a.val); \
383 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
384 _mm256_stream_si256((__m256i*)ptr, a.val); \
385 else \
386 _mm256_store_si256((__m256i*)ptr, a.val); \
387 } \
388 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
389 { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \
390 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
391 { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); }
392
393OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32, uchar)
394OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32, schar)
395OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort)
396OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16, short)
397OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8, unsigned)
398OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8, int)
399OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4, uint64)
400OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4, int64)
401
402 #define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
403 inline _Tpvec v256_load(const _Tp* ptr) \
404 { return _Tpvec(_mm256_loadu_##suffix(ptr)); } \
405 inline _Tpvec v256_load_aligned(const _Tp* ptr) \
406 { return _Tpvec(_mm256_load_##suffix(ptr)); } \
407 inline _Tpvec v256_load_low(const _Tp* ptr) \
408 { \
409 return _Tpvec(_mm256_cast##suffix##128_##suffix##256 \
410 (_mm_loadu_##suffix(ptr))); \
411 } \
412 inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
413 { \
414 halfreg vlo = _mm_loadu_##suffix(ptr0); \
415 halfreg vhi = _mm_loadu_##suffix(ptr1); \
416 return _Tpvec(_v256_combine(vlo, vhi)); \
417 } \
418 inline void v_store(_Tp* ptr, const _Tpvec& a) \
419 { _mm256_storeu_##suffix(ptr, a.val); } \
420 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
421 { _mm256_store_##suffix(ptr, a.val); } \
422 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
423 { _mm256_stream_##suffix(ptr, a.val); } \
424 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
425 { \
426 if( mode == hal::STORE_UNALIGNED ) \
427 _mm256_storeu_##suffix(ptr, a.val); \
428 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
429 _mm256_stream_##suffix(ptr, a.val); \
430 else \
431 _mm256_store_##suffix(ptr, a.val); \
432 } \
433 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
434 { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \
435 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
436 { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); }
437
438OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float, ps, __m128)
439OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
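// Usage sketch (illustrative, not part of the original header): the macros
// above expand to v256_load / v256_load_aligned / v_store / v_store_aligned
// overloads for every lane type; the unaligned variants are always safe,
// while the *_aligned ones assume 32-byte aligned pointers.
inline void example_copy8f(const float* src, float* dst)
{
    v_float32x8 v = v256_load(src);   // unaligned 256-bit load of 8 floats
    v_store(dst, v);                  // unaligned 256-bit store
}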
440
441 #define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
442 inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
443 { return _Tpvec(cast(a.val)); }
444
445 #define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
446 inline _Tpvec v256_setzero_##suffix() \
447 { return _Tpvec(_mm256_setzero_si256()); } \
448 inline _Tpvec v256_setall_##suffix(_Tp v) \
449 { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \
450 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
451 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
452 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
453 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \
454 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \
455 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \
456 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \
457 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \
458 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256) \
459 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256)
460
461OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32, uchar, u8, epi8, char)
462OPENCV_HAL_IMPL_AVX_INIT(v_int8x32, schar, s8, epi8, char)
463OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort, u16, epi16, short)
464OPENCV_HAL_IMPL_AVX_INIT(v_int16x16, short, s16, epi16, short)
465OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8, unsigned, u32, epi32, int)
466OPENCV_HAL_IMPL_AVX_INIT(v_int32x8, int, s32, epi32, int)
467OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4, uint64, u64, epi64x, int64)
468OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64)
469
470 #define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
471 inline _Tpvec v256_setzero_##suffix() \
472 { return _Tpvec(_mm256_setzero_##zsuffix()); } \
473 inline _Tpvec v256_setall_##suffix(_Tp v) \
474 { return _Tpvec(_mm256_set1_##zsuffix(v)); } \
475 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
476 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \
477 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
478 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, cast) \
479 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, cast) \
480 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, cast) \
481 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, cast) \
482 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, cast)
483
484OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float, f32, ps, _mm256_castsi256_ps)
485OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd)
486
487 inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
488{ return a; }
489 inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
490{ return v_float32x8(_mm256_castpd_ps(a.val)); }
491
492 inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
493{ return a; }
494 inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
495{ return v_float64x4(_mm256_castps_pd(a.val)); }
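// Usage sketch (illustrative, not part of the original header): v256_setall_*
// broadcasts one scalar to every lane, and v_reinterpret_as_* performs a pure
// bit-pattern cast without changing the underlying register.
inline v_int32x8 example_float_bits()
{
    v_float32x8 ones = v256_setall_f32(1.0f);
    return v_reinterpret_as_s32(ones);   // every lane holds 0x3f800000
}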
496
497 /* Recombine */
498 /*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm) \
499 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
500 { return _Tpvec(perm(a.val, b.val, 0x20)); } \
501 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
502 { return _Tpvec(perm(a.val, b.val, 0x31)); } \
503 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
504 _Tpvec& c, _Tpvec& d) \
505 { c = v_combine_low(a, b); d = v_combine_high(a, b); }
506
507 #define OPENCV_HAL_IMPL_AVX_UNPACKS(_Tpvec, suffix) \
508 OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, _mm256_permute2x128_si256) \
509 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, \
510 _Tpvec& b0, _Tpvec& b1) \
511 { \
512 __m256i v0 = _v256_shuffle_odd_64(a0.val); \
513 __m256i v1 = _v256_shuffle_odd_64(a1.val); \
514 b0.val = _mm256_unpacklo_##suffix(v0, v1); \
515 b1.val = _mm256_unpackhi_##suffix(v0, v1); \
516 }
517
518 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint8x32, epi8)
519 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int8x32, epi8)
520 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint16x16, epi16)
521 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int16x16, epi16)
522 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint32x8, epi32)
523 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int32x8, epi32)
524 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint64x4, epi64)
525 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int64x4, epi64)
526 OPENCV_HAL_IMPL_AVX_COMBINE(v_float32x8, _mm256_permute2f128_ps)
527 OPENCV_HAL_IMPL_AVX_COMBINE(v_float64x4, _mm256_permute2f128_pd)
528
529 inline void v_zip(const v_float32x8& a0, const v_float32x8& a1, v_float32x8& b0, v_float32x8& b1)
530 {
531 __m256 v0 = _mm256_unpacklo_ps(a0.val, a1.val);
532 __m256 v1 = _mm256_unpackhi_ps(a0.val, a1.val);
533 v_recombine(v_float32x8(v0), v_float32x8(v1), b0, b1);
534 }
535
536 inline void v_zip(const v_float64x4& a0, const v_float64x4& a1, v_float64x4& b0, v_float64x4& b1)
537 {
538 __m256d v0 = _v_shuffle_odd_64(a0.val);
539 __m256d v1 = _v_shuffle_odd_64(a1.val);
540 b0.val = _mm256_unpacklo_pd(v0, v1);
541 b1.val = _mm256_unpackhi_pd(v0, v1);
542 }*/
543
545
546 // unpacks
547 #define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix) \
548 inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \
549 { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); } \
550 inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \
551 { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); }
552
553OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32, epi8)
554OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32, epi8)
555OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16)
556OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16, epi16)
557OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8, epi32)
558OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8, epi32)
559OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4, epi64)
560OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4, epi64)
561OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps)
562OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd)
563
564 // blend
565 #define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix) \
566 template<int m> \
567 inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b) \
568 { return _Tpvec(_mm256_blend_##suffix(a.val, b.val, m)); }
569
570OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16)
571OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16, epi16)
572OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8, epi32)
573OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8, epi32)
574OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps)
575OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd)
576
577 template<int m>
578 inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b)
579{
580 enum {M0 = m};
581 enum {M1 = (M0 | (M0 << 2)) & 0x33};
582 enum {M2 = (M1 | (M1 << 1)) & 0x55};
583 enum {MM = M2 | (M2 << 1)};
584 return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM));
585}
586 template<int m>
587 inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b)
588{ return v_int64x4(v256_blend<m>(v_uint64x4(a.val), v_uint64x4(b.val)).val); }
589
590 // shuffle
591 // todo: emulate 64bit
592 #define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin) \
593 template<int m> \
594 inline _Tpvec v256_shuffle(const _Tpvec& a) \
595 { return _Tpvec(_mm256_##intrin(a.val, m)); }
596
597OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8, shuffle_epi32)
598OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8, shuffle_epi32)
599OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps)
600OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd)
601
602 template<typename _Tpvec>
603 inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
604{
605 ab0 = v256_unpacklo(a, b);
606 ab1 = v256_unpackhi(a, b);
607}
608
609 template<typename _Tpvec>
610 inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
611{ return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0xf0)); }
612
613 inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
614{ return v256_blend<0xf0>(a, b); }
615
616 inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
617{ return v256_blend<0xc>(a, b); }
618
619 template<typename _Tpvec>
620 inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
621{ return v256_permute2x128<0x21>(a, b); }
622
623 template<typename _Tpvec>
624 inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
625{ return _Tpvec(_mm256_alignr_epi8(a.val, b.val, 8)); }
626 inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
627{ return v_float64x4(_mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1))); }
628 // todo: emulate float32
629
630 template<typename _Tpvec>
631 inline _Tpvec v256_swap_halves(const _Tpvec& a)
632{ return v256_permute2x128<1>(a, a); }
633
634 template<typename _Tpvec>
635 inline _Tpvec v256_reverse_64(const _Tpvec& a)
636{ return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); }
637
638 // ZIP
639 #define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec) \
640 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
641 { return v256_permute2x128<0x20>(a, b); } \
642 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
643 { return v256_permute2x128<0x31>(a, b); } \
644 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
645 _Tpvec& c, _Tpvec& d) \
646 { \
647 _Tpvec a1b0 = v256_alignr_128(a, b); \
648 c = v256_combine_diagonal(a, a1b0); \
649 d = v256_combine_diagonal(a1b0, b); \
650 } \
651 inline void v_zip(const _Tpvec& a, const _Tpvec& b, \
652 _Tpvec& ab0, _Tpvec& ab1) \
653 { \
654 _Tpvec ab0ab2, ab1ab3; \
655 v256_zip(a, b, ab0ab2, ab1ab3); \
656 v_recombine(ab0ab2, ab1ab3, ab0, ab1); \
657 }
658
659OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32)
660OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32)
661OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16)
662OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16)
663OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8)
664OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8)
665OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4)
666OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4)
667OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8)
668OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
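// Usage sketch (illustrative, not part of the original header): v_zip
// interleaves two vectors lane by lane, so for a = {a0..a7}, b = {b0..b7}
// it yields ab0 = {a0,b0,a1,b1,a2,b2,a3,b3} and ab1 = {a4,b4,...,a7,b7};
// v_combine_low/v_combine_high instead concatenate the 128-bit halves.
inline void example_interleave(const v_float32x8& a, const v_float32x8& b,
                               v_float32x8& ab0, v_float32x8& ab1)
{
    v_zip(a, b, ab0, ab1);
}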
669
670
671
672 /* Element-wise binary and unary operations */
673
674
675 #define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \
676 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
677 { return _Tpvec(intrin(a.val, b.val)); } \
678 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
679 { a.val = intrin(a.val, b.val); return a; }
680
681OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8)
682OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8)
683OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
684OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
685OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
686OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
687OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
688OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
689OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
690OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
691OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)
692OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32)
693OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32)
694OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32)
695OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64)
696OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64)
697OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64)
698OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64)
699
700OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
701OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
702OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
703OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
704OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
705OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
706OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
707OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
708
709 // saturating multiply 8-bit, 16-bit
710 inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
711{
712 v_uint16x16 c, d;
713 v_mul_expand(a, b, c, d);
714 return v_pack(c, d);
715}
716 inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
717{
718 v_int16x16 c, d;
719 v_mul_expand(a, b, c, d);
720 return v_pack(c, d);
721}
722 inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
723{
724 __m256i pl = _mm256_mullo_epi16(a.val, b.val);
725 __m256i ph = _mm256_mulhi_epu16(a.val, b.val);
726 __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
727 __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
728 return v_uint16x16(_v256_packs_epu32(p0, p1));
729}
730 inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
731{
732 __m256i pl = _mm256_mullo_epi16(a.val, b.val);
733 __m256i ph = _mm256_mulhi_epi16(a.val, b.val);
734 __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
735 __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
736 return v_int16x16(_mm256_packs_epi32(p0, p1));
737}
738 inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
739{ a = a * b; return a; }
740 inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
741{ a = a * b; return a; }
742 inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
743{ a = a * b; return a; }
744 inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
745{ a = a * b; return a; }
746
748 #define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
749 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
750 { return _Tpvec(intrin(a.val, b.val)); }
751
752OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
753OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
754OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
755OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
756OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
757OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
758OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
759OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)
760OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_uint16x16, _mm256_mullo_epi16)
761OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_int16x16, _mm256_mullo_epi16)
762
763 inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
764{
765 __m256i ad = _mm256_srai_epi16(a.val, 8);
766 __m256i bd = _mm256_srai_epi16(b.val, 8);
767 __m256i p0 = _mm256_mullo_epi16(a.val, b.val); // even
768 __m256i p1 = _mm256_slli_epi16(_mm256_mullo_epi16(ad, bd), 8); // odd
769
770 const __m256i b01 = _mm256_set1_epi32(0xFF00FF00);
771 return v_uint8x32(_mm256_blendv_epi8(p0, p1, b01));
772}
773 inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
774{
775 return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
776}
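// Usage sketch (illustrative, not part of the original header): for 8- and
// 16-bit vectors the overloaded operator+/- saturate (255 + 1 stays 255 for
// v_uint8x32), while the *_wrap functions use ordinary modulo-2^N arithmetic.
inline void example_saturate_vs_wrap(const v_uint8x32& a, const v_uint8x32& b,
                                     v_uint8x32& sat, v_uint8x32& wrap)
{
    sat  = a + b;            // _mm256_adds_epu8: clamps to [0, 255]
    wrap = v_add_wrap(a, b); // _mm256_add_epi8: wraps modulo 256
}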
777
778 // Multiply and expand
779 inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
780 v_uint16x16& c, v_uint16x16& d)
781{
782 v_uint16x16 a0, a1, b0, b1;
783 v_expand(a, a0, a1);
784 v_expand(b, b0, b1);
785 c = v_mul_wrap(a0, b0);
786 d = v_mul_wrap(a1, b1);
787}
788
789 inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
790 v_int16x16& c, v_int16x16& d)
791{
792 v_int16x16 a0, a1, b0, b1;
793 v_expand(a, a0, a1);
794 v_expand(b, b0, b1);
795 c = v_mul_wrap(a0, b0);
796 d = v_mul_wrap(a1, b1);
797}
798
799 inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
800 v_int32x8& c, v_int32x8& d)
801{
802 v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));
803
804 v_int16x16 v0, v1;
805 v_zip(v_mul_wrap(a, b), vhi, v0, v1);
806
807 c = v_reinterpret_as_s32(v0);
808 d = v_reinterpret_as_s32(v1);
809}
810
811 inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
812 v_uint32x8& c, v_uint32x8& d)
813{
814 v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));
815
816 v_uint16x16 v0, v1;
817 v_zip(v_mul_wrap(a, b), vhi, v0, v1);
818
819 c = v_reinterpret_as_u32(v0);
820 d = v_reinterpret_as_u32(v1);
821}
822
823 inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
824 v_uint64x4& c, v_uint64x4& d)
825{
826 __m256i v0 = _mm256_mul_epu32(a.val, b.val);
827 __m256i v1 = _mm256_mul_epu32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
828 v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
829}
830
831 inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); }
832 inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); }
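// Usage sketch (illustrative, not part of the original header): v_mul_expand
// multiplies without overflow by widening the lane type; the products of the
// low and high input halves are returned in the two output vectors.
inline void example_widening_mul(const v_uint16x16& a, const v_uint16x16& b,
                                 v_uint32x8& lo, v_uint32x8& hi)
{
    v_mul_expand(a, b, lo, hi);   // 16 full 32-bit products split across lo/hi
}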
833
835 #define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
836 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
837 { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
838 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
839 { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
840 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
841 { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
842 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
843 { return _Tpsvec(srai(a.val, imm)); } \
844 template<int imm> \
845 inline _Tpuvec v_shl(const _Tpuvec& a) \
846 { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
847 template<int imm> \
848 inline _Tpsvec v_shl(const _Tpsvec& a) \
849 { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
850 template<int imm> \
851 inline _Tpuvec v_shr(const _Tpuvec& a) \
852 { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
853 template<int imm> \
854 inline _Tpsvec v_shr(const _Tpsvec& a) \
855 { return _Tpsvec(srai(a.val, imm)); }
856
857OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16)
858OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8, v_int32x8, epi32, _mm256_srai_epi32)
859
860 inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm)
861{
862 __m256i d = _mm256_set1_epi64x((int64)1 << 63);
863 __m256i r = _mm256_srli_epi64(_mm256_add_epi64(a, d), imm);
864 return _mm256_sub_epi64(r, _mm256_srli_epi64(d, imm));
865}
866OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx)
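// Usage sketch (illustrative, not part of the original header): shifts exist
// both as run-time operators and as compile-time template forms; the
// _mm256_srai_epi64xx helper above emulates a 64-bit arithmetic right shift,
// which plain AVX2 does not provide natively.
inline v_int64x4 example_shift(const v_int64x4& a)
{
    return v_shr<3>(a);   // arithmetic right shift by 3 on each int64 lane
}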
867
868
869
870 #define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
871 OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \
872 OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \
873 OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \
874 inline _Tpvec operator ~ (const _Tpvec& a) \
875 { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }
876
877OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1))
878OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32, si256, _mm256_set1_epi32(-1))
879OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16, si256, _mm256_set1_epi32(-1))
880OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16, si256, _mm256_set1_epi32(-1))
881OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8, si256, _mm256_set1_epi32(-1))
882OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8, si256, _mm256_set1_epi32(-1))
883OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4, si256, _mm256_set1_epi64x(-1))
884OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4, si256, _mm256_set1_epi64x(-1))
885OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8, ps, _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
886OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4, pd, _mm256_castsi256_pd(_mm256_set1_epi32(-1)))
887
889 #define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix) \
890 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
891 { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); }
892
893OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32, epi8)
894OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32, epi8)
895OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8)
896OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16, epi8)
897OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8, epi8)
898OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8, epi8)
899OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
900OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
901
902
903 #define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
904 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
905 { return ~(a == b); } \
906 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
907 { return b > a; } \
908 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
909 { return ~(a < b); } \
910 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
911 { return b >= a; }
912
913 #define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \
914 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
915 { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
916 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
917 { \
918 __m256i smask = _mm256_set1_##suffix(sbit); \
919 return _Tpuvec(_mm256_cmpgt_##suffix( \
920 _mm256_xor_si256(a.val, smask), \
921 _mm256_xor_si256(b.val, smask))); \
922 } \
923 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
924 { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
925 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
926 { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \
927 OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \
928 OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)
929
930OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32, v_int8x32, epi8, (char)-128)
931OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
932OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000)
933
934 #define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \
935 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
936 { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \
937 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
938 { return ~(a == b); }
939
940OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
941OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
942
943 #define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \
944 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
945 { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }
946
947 #define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \
948 OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \
949 OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \
950 OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \
951 OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \
952 OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \
953 OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix)
954
955OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
956OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
957
958 inline v_float32x8 v_not_nan(const v_float32x8& a)
959{ return v_float32x8(_mm256_cmp_ps(a.val, a.val, _CMP_ORD_Q)); }
960 inline v_float64x4 v_not_nan(const v_float64x4& a)
961{ return v_float64x4(_mm256_cmp_pd(a.val, a.val, _CMP_ORD_Q)); }
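// Usage sketch (illustrative, not part of the original header): comparisons
// produce all-ones/all-zero lane masks, which feed v_select directly;
// v_select(mask, a, b) picks a where the mask is set and b elsewhere.
inline v_float32x8 example_clamp_negative(const v_float32x8& x)
{
    v_float32x8 zero = v256_setall_f32(0.f);
    return v_select(x < zero, zero, x);   // per-lane max(x, 0)
}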
962
964OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32, _mm256_min_epu8)
965OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32, _mm256_max_epu8)
966OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32, _mm256_min_epi8)
967OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32, _mm256_max_epi8)
968OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16)
969OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16)
970OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16, _mm256_min_epi16)
971OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16, _mm256_max_epi16)
972OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8, _mm256_min_epu32)
973OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8, _mm256_max_epu32)
974OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8, _mm256_min_epi32)
975OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8, _mm256_max_epi32)
976OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps)
977OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps)
978OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd)
979OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
980
981
982 template<int imm>
983 inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
984{
985 enum {IMM_R = (16 - imm) & 0xFF};
986 enum {IMM_R2 = (32 - imm) & 0xFF};
987
988 if (imm == 0) return a;
989 if (imm == 32) return b;
990 if (imm > 32) return v_uint8x32();
991
992 __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
993 if (imm == 16) return v_uint8x32(swap);
994 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, IMM_R));
995 return v_uint8x32(_mm256_alignr_epi8(swap, b.val, IMM_R2)); // imm < 32
996}
997
998 template<int imm>
999 inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
1000{
1001 enum {IMM_L = (imm - 16) & 0xFF};
1002
1003 if (imm == 0) return a;
1004 if (imm == 32) return b;
1005 if (imm > 32) return v_uint8x32();
1006
1007 __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
1008 if (imm == 16) return v_uint8x32(swap);
1009 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
1010 return v_uint8x32(_mm256_alignr_epi8(b.val, swap, IMM_L));
1011}
1012
1013 template<int imm>
1014 inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
1015{
1016 enum {IMM_L = (imm - 16) & 0xFF};
1017 enum {IMM_R = (16 - imm) & 0xFF};
1018
1019 if (imm == 0) return a;
1020 if (imm > 32) return v_uint8x32();
1021
1022 // ESAC control[3] ? [127:0] = 0
1023 __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
1024 if (imm == 16) return v_uint8x32(swapz);
1025 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swapz, IMM_R));
1026 return v_uint8x32(_mm256_slli_si256(swapz, IMM_L));
1027}
1028
1029 template<int imm>
1030 inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
1031{
1032 enum {IMM_L = (imm - 16) & 0xFF};
1033
1034 if (imm == 0) return a;
1035 if (imm > 32) return v_uint8x32();
1036
1037 // ESAC control[3] ? [127:0] = 0
1038 __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
1039 if (imm == 16) return v_uint8x32(swapz);
1040 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swapz, a.val, imm));
1041 return v_uint8x32(_mm256_srli_si256(swapz, IMM_L));
1042}
1043
1044 #define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) \
1045 template<int imm> \
1046 inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \
1047 { \
1048 enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
1049 v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a), \
1050 v_reinterpret_as_u8(b)); \
1051 return _Tpvec(cast(ret.val)); \
1052 } \
1053 template<int imm> \
1054 inline _Tpvec intrin(const _Tpvec& a) \
1055 { \
1056 enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
1057 v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a)); \
1058 return _Tpvec(cast(ret.val)); \
1059 }
1060
1061 #define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec) \
1062 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \
1063 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)
1064
1065OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32)
1066OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16)
1067OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16)
1068OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8)
1069OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8)
1070OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4)
1071OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4)
1072
1073OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float32x8, _mm256_castsi256_ps)
1074OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps)
1075OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd)
1076OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
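// Usage sketch (illustrative, not part of the original header): v_rotate_*
// move whole lanes across the 256-bit register, zero-filling in the
// single-operand form; rotating right by one drops lane 0 and shifts every
// remaining lane one position down.
inline v_int32x8 example_shift_lanes_down(const v_int32x8& a)
{
    return v_rotate_right<1>(a);   // {a1, a2, ..., a7, 0}
}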
1077
1078
1079 inline v_uint8x32 v_reverse(const v_uint8x32 &a)
1080{
1081 static const __m256i perm = _mm256_setr_epi8(
1082 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
1083 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1084 __m256i vec = _mm256_shuffle_epi8(a.val, perm);
1085 return v_uint8x32(_mm256_permute2x128_si256(vec, vec, 1));
1086}
1087
1088 inline v_int8x32 v_reverse(const v_int8x32 &a)
1089{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1090
1091 inline v_uint16x16 v_reverse(const v_uint16x16 &a)
1092{
1093 static const __m256i perm = _mm256_setr_epi8(
1094 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
1095 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
1096 __m256i vec = _mm256_shuffle_epi8(a.val, perm);
1097 return v_uint16x16(_mm256_permute2x128_si256(vec, vec, 1));
1098}
1099
1100 inline v_int16x16 v_reverse(const v_int16x16 &a)
1101{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1102
1103 inline v_uint32x8 v_reverse(const v_uint32x8 &a)
1104{
1105 static const __m256i perm = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1106 return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
1107}
1108
1109 inline v_int32x8 v_reverse(const v_int32x8 &a)
1110{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1111
1112 inline v_float32x8 v_reverse(const v_float32x8 &a)
1113{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1114
1115 inline v_uint64x4 v_reverse(const v_uint64x4 &a)
1116{
1117 return v_uint64x4(_mm256_permute4x64_epi64(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
1118}
1119
1120 inline v_int64x4 v_reverse(const v_int64x4 &a)
1121{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1122
1123 inline v_float64x4 v_reverse(const v_float64x4 &a)
1124{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1125
1127
1129 inline unsigned v_reduce_sum(const v_uint8x32& a)
1130{
1131 __m256i half = _mm256_sad_epu8(a.val, _mm256_setzero_si256());
1132 __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1133 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1134}
1135 inline int v_reduce_sum(const v_int8x32& a)
1136{
1137 __m256i half = _mm256_sad_epu8(_mm256_xor_si256(a.val, _mm256_set1_epi8((schar)-128)), _mm256_setzero_si256());
1138 __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1139 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))) - 4096;
1140}
1141 #define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin) \
1142 inline sctype v_reduce_##func(const _Tpvec& a) \
1143 { \
1144 __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
1145 val = intrin(val, _mm_srli_si128(val,8)); \
1146 val = intrin(val, _mm_srli_si128(val,4)); \
1147 val = intrin(val, _mm_srli_si128(val,2)); \
1148 val = intrin(val, _mm_srli_si128(val,1)); \
1149 return (sctype)_mm_cvtsi128_si32(val); \
1150 }
1151
1152OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, min, _mm_min_epu8)
1153OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, min, _mm_min_epi8)
1154OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, max, _mm_max_epu8)
1155OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, max, _mm_max_epi8)
1156
1157 #define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
1158 inline sctype v_reduce_##func(const _Tpvec& a) \
1159 { \
1160 __m128i v0 = _v256_extract_low(a.val); \
1161 __m128i v1 = _v256_extract_high(a.val); \
1162 v0 = intrin(v0, v1); \
1163 v0 = intrin(v0, _mm_srli_si128(v0, 8)); \
1164 v0 = intrin(v0, _mm_srli_si128(v0, 4)); \
1165 v0 = intrin(v0, _mm_srli_si128(v0, 2)); \
1166 return (sctype) _mm_cvtsi128_si32(v0); \
1167 }
1168
1169OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, min, _mm_min_epu16)
1170OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, min, _mm_min_epi16)
1171OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16)
1172OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, max, _mm_max_epi16)
1173
1174 #define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \
1175 inline sctype v_reduce_##func(const _Tpvec& a) \
1176 { \
1177 __m128i v0 = _v256_extract_low(a.val); \
1178 __m128i v1 = _v256_extract_high(a.val); \
1179 v0 = intrin(v0, v1); \
1180 v0 = intrin(v0, _mm_srli_si128(v0, 8)); \
1181 v0 = intrin(v0, _mm_srli_si128(v0, 4)); \
1182 return (sctype) _mm_cvtsi128_si32(v0); \
1183 }
1184
1185OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32)
1186OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, min, _mm_min_epi32)
1187OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32)
1188OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32)
1189
1190 #define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin) \
1191 inline float v_reduce_##func(const v_float32x8& a) \
1192 { \
1193 __m128 v0 = _v256_extract_low(a.val); \
1194 __m128 v1 = _v256_extract_high(a.val); \
1195 v0 = intrin(v0, v1); \
1196 v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \
1197 v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 1))); \
1198 return _mm_cvtss_f32(v0); \
1199 }
1200
1201OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
1202OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
1203
1204 inline int v_reduce_sum(const v_int32x8& a)
1205{
1206 __m256i s0 = _mm256_hadd_epi32(a.val, a.val);
1207 s0 = _mm256_hadd_epi32(s0, s0);
1208
1209 __m128i s1 = _v256_extract_high(s0);
1210 s1 = _mm_add_epi32(_v256_extract_low(s0), s1);
1211
1212 return _mm_cvtsi128_si32(s1);
1213}
1214
1215 inline unsigned v_reduce_sum(const v_uint32x8& a)
1216{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
1217
1218 inline int v_reduce_sum(const v_int16x16& a)
1219{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1220 inline unsigned v_reduce_sum(const v_uint16x16& a)
1221{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1222
1223 inline float v_reduce_sum(const v_float32x8& a)
1224{
1225 __m256 s0 = _mm256_hadd_ps(a.val, a.val);
1226 s0 = _mm256_hadd_ps(s0, s0);
1227
1228 __m128 s1 = _v256_extract_high(s0);
1229 s1 = _mm_add_ps(_v256_extract_low(s0), s1);
1230
1231 return _mm_cvtss_f32(s1);
1232}
1233
1234 inline uint64 v_reduce_sum(const v_uint64x4& a)
1235{
1236 uint64 CV_DECL_ALIGNED(32) idx[2];
1237 _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
1238 return idx[0] + idx[1];
1239}
1240 inline int64 v_reduce_sum(const v_int64x4& a)
1241{
1242 int64 CV_DECL_ALIGNED(32) idx[2];
1243 _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
1244 return idx[0] + idx[1];
1245}
1246 inline double v_reduce_sum(const v_float64x4& a)
1247{
1248 __m256d s0 = _mm256_hadd_pd(a.val, a.val);
1249 return _mm_cvtsd_f64(_mm_add_pd(_v256_extract_low(s0), _v256_extract_high(s0)));
1250}
1251
1252 inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
1253 const v_float32x8& c, const v_float32x8& d)
1254{
1255 __m256 ab = _mm256_hadd_ps(a.val, b.val);
1256 __m256 cd = _mm256_hadd_ps(c.val, d.val);
1257 return v_float32x8(_mm256_hadd_ps(ab, cd));
1258}
1259
1260 inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
1261{
1262 __m256i half = _mm256_sad_epu8(a.val, b.val);
1263 __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1264 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1265}
1266 inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
1267{
1268 __m256i half = _mm256_set1_epi8(0x7f);
1269 half = _mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half));
1270 __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1271 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1272}
1273 inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
1274{
1275 v_uint32x8 l, h;
1276 v_expand(v_add_wrap(a - b, b - a), l, h);
1277 return v_reduce_sum(l + h);
1278}
1279 inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
1280{
1281 v_uint32x8 l, h;
1282 v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
1283 return v_reduce_sum(l + h);
1284}
1285 inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
1286{
1287 return v_reduce_sum(v_max(a, b) - v_min(a, b));
1288}
1289 inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
1290{
1291 v_int32x8 m = a < b;
1292 return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
1293}
1294 inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
1295{
1296 return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
1297}
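// Usage sketch (illustrative, not part of the original header): the reductions
// above collapse a whole vector into one scalar, so a 32-pixel sum of absolute
// differences takes only a pair of loads plus one reduction.
inline unsigned example_sad32(const uchar* p, const uchar* q)
{
    return v_reduce_sad(v256_load(p), v256_load(q));
}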
1298
1300 inline v_uint8x32 v_popcount(const v_uint8x32& a)
1301{
1302 __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1303 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
1304 __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
1305 return v_uint8x32(_mm256_add_epi8(_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256( a.val , _popcnt_mask)),
1306 _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(a.val, 4), _popcnt_mask))));
1307}
1308 inline v_uint16x16 v_popcount(const v_uint16x16& a)
1309{
1310 v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
1311 p += v_rotate_right<1>(p);
1312 return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
1313}
1314 inline v_uint32x8 v_popcount(const v_uint32x8& a)
1315{
1316 v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
1317 p += v_rotate_right<1>(p);
1318 p += v_rotate_right<2>(p);
1319 return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
1320}
1321 inline v_uint64x4 v_popcount(const v_uint64x4& a)
1322{
1323 return v_uint64x4(_mm256_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm256_setzero_si256()));
1324}
1325 inline v_uint8x32 v_popcount(const v_int8x32& a)
1326{ return v_popcount(v_reinterpret_as_u8(a)); }
1327 inline v_uint16x16 v_popcount(const v_int16x16& a)
1328{ return v_popcount(v_reinterpret_as_u16(a)); }
1329 inline v_uint32x8 v_popcount(const v_int32x8& a)
1330{ return v_popcount(v_reinterpret_as_u32(a)); }
1331 inline v_uint64x4 v_popcount(const v_int64x4& a)
1332{ return v_popcount(v_reinterpret_as_u64(a)); }
1333
1335 inline int v_signmask(const v_int8x32& a)
1336{ return _mm256_movemask_epi8(a.val); }
1337 inline int v_signmask(const v_uint8x32& a)
1338{ return v_signmask(v_reinterpret_as_s8(a)); }
1339
1340 inline int v_signmask(const v_int16x16& a)
1341{ return v_signmask(v_pack(a, a)) & 0xFFFF; }
1342 inline int v_signmask(const v_uint16x16& a)
1343{ return v_signmask(v_reinterpret_as_s16(a)); }
1344
1345 inline int v_signmask(const v_float32x8& a)
1346{ return _mm256_movemask_ps(a.val); }
1347 inline int v_signmask(const v_float64x4& a)
1348{ return _mm256_movemask_pd(a.val); }
1349
1350 inline int v_signmask(const v_int32x8& a)
1351{ return v_signmask(v_reinterpret_as_f32(a)); }
1352 inline int v_signmask(const v_uint32x8& a)
1353{ return v_signmask(v_reinterpret_as_f32(a)); }
1354
1355 inline int v_signmask(const v_int64x4& a)
1356{ return v_signmask(v_reinterpret_as_f64(a)); }
1357 inline int v_signmask(const v_uint64x4& a)
1358{ return v_signmask(v_reinterpret_as_f64(a)); }
1359
1360 inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1361 inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1362 inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1363 inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1364 inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1365 inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1366 inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1367 inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1368 inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1369 inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1370
1372 #define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, allmask) \
1373 inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
1374 inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
1375OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, -1)
1376OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, -1)
1377OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, 255)
1378OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, 255)
1379OPENCV_HAL_IMPL_AVX_CHECK(v_uint64x4, 15)
1380OPENCV_HAL_IMPL_AVX_CHECK(v_int64x4, 15)
1381OPENCV_HAL_IMPL_AVX_CHECK(v_float32x8, 255)
1382OPENCV_HAL_IMPL_AVX_CHECK(v_float64x4, 15)
1383
1384 #define OPENCV_HAL_IMPL_AVX_CHECK_SHORT(_Tpvec) \
1385 inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \
1386 inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
1387OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_uint16x16)
1388OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
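// Usage sketch (illustrative, not part of the original header): v_signmask
// packs the lane sign bits into an int, v_check_any tests whether any lane is
// set, and v_scan_forward returns the index of the first set lane, which is
// handy for locating the first match after a comparison.
inline int example_first_above(const v_float32x8& x, float thresh)
{
    v_float32x8 gt = x > v256_setall_f32(thresh);
    return v_check_any(gt) ? v_scan_forward(gt) : -1;   // lane index of first hit
}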
1389
1390
1391
1392
1393 #define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \
1394 inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1395 { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
1396 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1397 { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
1398 inline _Tpvec v_sqrt(const _Tpvec& x) \
1399 { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
1400 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1401 { return v_fma(a, a, b * b); } \
1402 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1403 { return v_sqrt(v_fma(a, a, b*b)); }
1404
1405OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
1406OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
1407
1408 inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
1409{
1410 return a * b + c;
1411}
1412
1413 inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
1414{
1415 return v_fma(a, b, c);
1416}
1417
1418 inline v_float32x8 v_invsqrt(const v_float32x8& x)
1419{
1420 v_float32x8 half = x * v256_setall_f32(0.5);
1421 v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val));
1422 // todo: _mm256_fnmsub_ps
1423 t *= v256_setall_f32(1.5) - ((t * t) * half);
1424 return t;
1425}
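// The float32 v_invsqrt refines the ~12-bit _mm256_rsqrt_ps estimate with one
// Newton-Raphson step: for y ~ 1/sqrt(x) the update is y' = y * (1.5 - 0.5*x*y*y),
// which is exactly the "t *= 1.5 - (t*t)*half" line above. A scalar sketch
// (illustrative only):
//
//   float refine_invsqrt(float x, float y) { return y * (1.5f - 0.5f * x * y * y); }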
1426
1427 inline v_float64x4 v_invsqrt(const v_float64x4& x)
1428{
1429 return v256_setall_f64(1.) / v_sqrt(x);
1430}
1431
1432 /** Absolute values **/
1433 #define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix) \
1434 inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \
1435 { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); }
1436
1437OPENCV_HAL_IMPL_AVX_ABS(int8x32, epi8)
1438OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
1439OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32)
1440
1441 inline v_float32x8 v_abs(const v_float32x8& x)
1442{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
1443 inline v_float64x4 v_abs(const v_float64x4& x)
1444{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
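// Both floating-point v_abs overloads clear the IEEE-754 sign bit with a bitwise AND:
// 0x7fffffff for 32-bit floats, and 0x7fffffffffffffff for doubles (built above by
// shifting an all-ones register right by one bit).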
1445
1446 /** Absolute difference **/
1447 inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
1448{ return v_add_wrap(a - b, b - a); }
1449 inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
1450{ return v_add_wrap(a - b, b - a); }
1451 inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
1452{ return v_max(a, b) - v_min(a, b); }
1453
1454 inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
1455{
1456 v_int8x32 d = v_sub_wrap(a, b);
1457 v_int8x32 m = a < b;
1458 return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1459}
1460
1461 inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
1462{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1463
1464 inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
1465{
1466 v_int32x8 d = a - b;
1467 v_int32x8 m = a < b;
1468 return v_reinterpret_as_u32((d ^ m) - m);
1469}
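// The (d ^ m) - m idiom (also used with v_sub_wrap in the 8-bit variant above and in
// v_absdiffs below) conditionally negates d: m is all ones exactly where a < b, and
// (x ^ -1) - (-1) == -x while (x ^ 0) - 0 == x, so the result is |a - b| reinterpreted
// as unsigned.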
1470
1471 inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
1472{ return v_abs(a - b); }
1473
1474 inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
1475{ return v_abs(a - b); }
1476
1477 /** Saturating absolute difference **/
1478 inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
1479{
1480 v_int8x32 d = a - b;
1481 v_int8x32 m = a < b;
1482 return (d ^ m) - m;
1483}
1484 inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
1485{ return v_max(a, b) - v_min(a, b); }
1486
1487 ////////// Conversions /////////
1488
1489 /** Rounding **/
1490 inline v_int32x8 v_round(const v_float32x8& a)
1491{ return v_int32x8(_mm256_cvtps_epi32(a.val)); }
1492
1493 inline v_int32x8 v_round(const v_float64x4& a)
1494{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); }
1495
1496 inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b)
1497{
1498 __m128i ai = _mm256_cvtpd_epi32(a.val), bi = _mm256_cvtpd_epi32(b.val);
1499 return v_int32x8(_v256_combine(ai, bi));
1500}
1501
1502 inline v_int32x8 v_trunc(const v_float32x8& a)
1503{ return v_int32x8(_mm256_cvttps_epi32(a.val)); }
1504
1505 inline v_int32x8 v_trunc(const v_float64x4& a)
1506{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvttpd_epi32(a.val))); }
1507
1508 inline v_int32x8 v_floor(const v_float32x8& a)
1509{ return v_int32x8(_mm256_cvttps_epi32(_mm256_floor_ps(a.val))); }
1510
1511 inline v_int32x8 v_floor(const v_float64x4& a)
1512{ return v_trunc(v_float64x4(_mm256_floor_pd(a.val))); }
1513
1514 inline v_int32x8 v_ceil(const v_float32x8& a)
1515{ return v_int32x8(_mm256_cvttps_epi32(_mm256_ceil_ps(a.val))); }
1516
1517 inline v_int32x8 v_ceil(const v_float64x4& a)
1518{ return v_trunc(v_float64x4(_mm256_ceil_pd(a.val))); }
1519
1520 /** To float **/
1521 inline v_float32x8 v_cvt_f32(const v_int32x8& a)
1522{ return v_float32x8(_mm256_cvtepi32_ps(a.val)); }
1523
1524 inline v_float32x8 v_cvt_f32(const v_float64x4& a)
1525{ return v_float32x8(_mm256_castps128_ps256(_mm256_cvtpd_ps(a.val))); }
1526
1527 inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
1528{
1529 __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val);
1530 return v_float32x8(_v256_combine(af, bf));
1531}
1532
1533 inline v_float64x4 v_cvt_f64(const v_int32x8& a)
1534{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_low(a.val))); }
1535
1536 inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
1537{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_high(a.val))); }
1538
1539 inline v_float64x4 v_cvt_f64(const v_float32x8& a)
1540{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_low(a.val))); }
1541
1542 inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
1543{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }
1544
1545 // from (Mysticial and wim) https://stackoverflow.com/q/41144668
1546 inline v_float64x4 v_cvt_f64(const v_int64x4& v)
1547{
1548 // constants encoded as floating-point
1549 __m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); // 2^52
1550 __m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
1551 __m256i magic_i_all = _mm256_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
1552 __m256d magic_d_all = _mm256_castsi256_pd(magic_i_all);
1553
1554 // Blend the 32 least significant bits of v with magic_i_lo
1555 __m256i v_lo = _mm256_blend_epi32(magic_i_lo, v.val, 0x55);
1556 // Extract the 32 most significant bits of v
1557 __m256i v_hi = _mm256_srli_epi64(v.val, 32);
1558 // Flip the msb of v_hi and blend with 0x45300000
1559 v_hi = _mm256_xor_si256(v_hi, magic_i_hi32);
1560 // Compute in double precision
1561 __m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all);
1562 // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
1563 __m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo));
1564 return v_float64x4(result);
1565}
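// How the trick works, per 64-bit lane with v = hi * 2^32 + lo (hi signed, lo unsigned):
//   v_lo as a double  = 2^52 + lo                  (lo blended into the mantissa of 2^52)
//   v_hi as a double  = 2^84 + 2^63 + hi * 2^32    (sign bit of hi flipped by the xor)
//   (v_hi - magic_d_all) + v_lo = (hi * 2^32 - 2^52) + (2^52 + lo) = hi * 2^32 + lo
// The subtraction must happen before the final addition, hence the ordering note above.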
1566
1567 ////////// Lookup table access ////////////
1568
1569 inline v_int8x32 v256_lut(const schar* tab, const int* idx)
1570{
1571 return v_int8x32(_mm256_setr_epi8(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
1572 tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]],
1573 tab[idx[16]], tab[idx[17]], tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
1574 tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]], tab[idx[30]], tab[idx[31]]));
1575}
1576 inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)
1577{
1578 return v_int8x32(_mm256_setr_epi16(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]), *(const short*)(tab + idx[ 3]),
1579 *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]), *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]),
1580 *(const short*)(tab + idx[ 8]), *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
1581 *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]), *(const short*)(tab + idx[15])));
1582}
1583 inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)
1584{
1585 return v_int8x32(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 1));
1586}
1587 inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); }
1588 inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); }
1589 inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); }
1590
1591 inline v_int16x16 v256_lut(const short* tab, const int* idx)
1592{
1593 return v_int16x16(_mm256_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
1594 tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
1595}
1596 inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)
1597{
1598 return v_int16x16(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 2));
1599}
1600 inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)
1601{
1602 #if defined(__GNUC__)
1603 return v_int16x16(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 2)); // GCC declares the intrinsic's base pointer as const long long int*, hence the cast
1604 #else
1605 return v_int16x16(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 2));
1606 #endif
1607}
1608 inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); }
1609 inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); }
1610 inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); }
1611
1612 inline v_int32x8 v256_lut(const int* tab, const int* idx)
1613{
1614 return v_int32x8(_mm256_i32gather_epi32(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
1615}
1616 inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
1617{
1618 #if defined(__GNUC__)
1619 return v_int32x8(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
1620 #else
1621 return v_int32x8(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
1622 #endif
1623}
1624 inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
1625{
1626 return v_int32x8(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
1627}
1628 inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
1629 inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
1630 inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); }
1631
1632 inline v_int64x4 v256_lut(const int64* tab, const int* idx)
1633{
1634 #if defined(__GNUC__)
1635 return v_int64x4(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 8));
1636 #else
1637 return v_int64x4(_mm256_i32gather_epi64(tab, _mm_loadu_si128((const __m128i*)idx), 8));
1638 #endif
1639}
1640 inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
1641{
1642 return v_int64x4(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
1643}
1644 inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); }
1645 inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }
1646
1647 inline v_float32x8 v256_lut(const float* tab, const int* idx)
1648{
1649 return v_float32x8(_mm256_i32gather_ps(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
1650}
1651 inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); }
1652 inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); }
1653
1654 inline v_float64x4 v256_lut(const double* tab, const int* idx)
1655{
1656 return v_float64x4(_mm256_i32gather_pd(tab, _mm_loadu_si128((const __m128i*)idx), 8));
1657}
1658 inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_v256_combine(_mm_loadu_pd(tab + idx[0]), _mm_loadu_pd(tab + idx[1]))); }
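// Scalar view of the v256_lut family (idx holds element offsets into tab, not bytes):
//   v256_lut(tab, idx)[i]             = tab[idx[i]]
//   v256_lut_pairs(tab, idx)[2*i + j] = tab[idx[i] + j],  j = 0, 1
//   v256_lut_quads(tab, idx)[4*i + j] = tab[idx[i] + j],  j = 0, 1, 2, 3
// Only as many idx entries are read as are needed to fill one 256-bit register.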
1659
1660 inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
1661{
1662 return v_int32x8(_mm256_i32gather_epi32(tab, idxvec.val, 4));
1663}
1664
1665 inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec)
1666{
1667 return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
1668}
1669
1670 inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
1671{
1672 return v_float32x8(_mm256_i32gather_ps(tab, idxvec.val, 4));
1673}
1674
1675 inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
1676{
1677 return v_float64x4(_mm256_i32gather_pd(tab, _mm256_castsi256_si128(idxvec.val), 8));
1678}
1679
1680 inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
1681{
1682 int CV_DECL_ALIGNED(32) idx[8];
1683 v_store_aligned(idx, idxvec);
1684 __m128 z = _mm_setzero_ps();
1685 __m128 xy01, xy45, xy23, xy67;
1686 xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0]));
1687 xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1]));
1688 xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4]));
1689 xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5]));
1690 __m256 xy0145 = _v256_combine(xy01, xy45);
1691 xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2]));
1692 xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3]));
1693 xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6]));
1694 xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7]));
1695 __m256 xy2367 = _v256_combine(xy23, xy67);
1696
1697 __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367);
1698 __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367);
1699
1700 x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367));
1701 y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367));
1702}
1703
1704inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
1705{
1706 int CV_DECL_ALIGNED(32) idx[4];
1707 v_store_low(idx, idxvec);
1708 __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
1709 __m128d xy2 = _mm_loadu_pd(tab + idx[2]);
1710 __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
1711 __m128d xy3 = _mm_loadu_pd(tab + idx[3]);
1712 __m256d xy02 = _v256_combine(xy0, xy2);
1713 __m256d xy13 = _v256_combine(xy1, xy3);
1714
1715 x = v_float64x4(_mm256_unpacklo_pd(xy02, xy13));
1716 y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13));
1717}
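// v_lut_deinterleave expects tab to hold interleaved (x, y) pairs: for each lane i,
// x[i] = tab[idx[i]] and y[i] = tab[idx[i] + 1]. A scalar sketch of the access pattern
// for the double version (illustrative only):
//
//   for (int i = 0; i < 4; i++) { x[i] = tab[idx[i]]; y[i] = tab[idx[i] + 1]; }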
1718
1719inline v_int8x32 v_interleave_pairs(const v_int8x32& vec)
1720{
1721 return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200)));
1722}
1723 inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1724 inline v_int8x32 v_interleave_quads(const v_int8x32& vec)
1725{
1726 return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400)));
1727}
1728 inline v_uint8x32 v_interleave_quads(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1729
1730 inline v_int16x16 v_interleave_pairs(const v_int16x16& vec)
1731{
1732 return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100)));
1733}
1734 inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1735 inline v_int16x16 v_interleave_quads(const v_int16x16& vec)
1736{
1737 return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100)));
1738}
1739 inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1740
1741 inline v_int32x8 v_interleave_pairs(const v_int32x8& vec)
1742{
1743 return v_int32x8(_mm256_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
1744}
1745 inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1746 inline v_float32x8 v_interleave_pairs(const v_float32x8& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1747
1748 inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
1749{
1750 return v_int8x32(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100))),
1751 _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1752}
1753 inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1754
1755 inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
1756{
1757 return v_int16x16(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100))),
1758 _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1759}
1760 inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1761
1762 inline v_int32x8 v_pack_triplets(const v_int32x8& vec)
1763{
1764 return v_int32x8(_mm256_permutevar8x32_epi32(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1765}
1766 inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
1767 inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
1768{
1769 return v_float32x8(_mm256_permutevar8x32_ps(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1770}
1771
1772 ////////// Matrix operations /////////
1773
1774 //////// Dot Product ////////
1775
1776 // 16 >> 32
1777 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
1778{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
1779 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1780{ return v_dotprod(a, b) + c; }
1781
1782 // 32 >> 64
1783 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
1784{
1785 __m256i even = _mm256_mul_epi32(a.val, b.val);
1786 __m256i odd = _mm256_mul_epi32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
1787 return v_int64x4(_mm256_add_epi64(even, odd));
1788}
1789 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
1790{ return v_dotprod(a, b) + c; }
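// Scalar view of the 32 -> 64 dot product: for j = 0..3,
//   result[j] = (int64)a[2*j] * b[2*j] + (int64)a[2*j+1] * b[2*j+1]   (+ c[j] in the 3-arg form)
// _mm256_mul_epi32 multiplies the even 32-bit lanes; shifting right by 32 exposes the
// odd lanes to the same instruction.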
1791
1792 // 8 >> 32
1793 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
1794{
1795 __m256i even_m = _mm256_set1_epi32(0xFF00FF00);
1796 __m256i even_a = _mm256_blendv_epi8(a.val, _mm256_setzero_si256(), even_m);
1797 __m256i odd_a = _mm256_srli_epi16(a.val, 8);
1798
1799 __m256i even_b = _mm256_blendv_epi8(b.val, _mm256_setzero_si256(), even_m);
1800 __m256i odd_b = _mm256_srli_epi16(b.val, 8);
1801
1802 __m256i prod0 = _mm256_madd_epi16(even_a, even_b);
1803 __m256i prod1 = _mm256_madd_epi16(odd_a, odd_b);
1804 return v_uint32x8(_mm256_add_epi32(prod0, prod1));
1805}
1806 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
1807{ return v_dotprod_expand(a, b) + c; }
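// Scalar view of the unsigned 8 -> 32 expansion: for j = 0..7,
//   result[j] = a[4j]*b[4j] + a[4j+1]*b[4j+1] + a[4j+2]*b[4j+2] + a[4j+3]*b[4j+3]
// Splitting into even and odd bytes keeps every 16-bit operand <= 255, so the signed
// multiply-add in _mm256_madd_epi16 cannot overflow.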
1808
1809 inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
1810{
1811 __m256i even_a = _mm256_srai_epi16(_mm256_bslli_epi128(a.val, 1), 8);
1812 __m256i odd_a = _mm256_srai_epi16(a.val, 8);
1813
1814 __m256i even_b = _mm256_srai_epi16(_mm256_bslli_epi128(b.val, 1), 8);
1815 __m256i odd_b = _mm256_srai_epi16(b.val, 8);
1816
1817 __m256i prod0 = _mm256_madd_epi16(even_a, even_b);
1818 __m256i prod1 = _mm256_madd_epi16(odd_a, odd_b);
1819 return v_int32x8(_mm256_add_epi32(prod0, prod1));
1820}
1821 inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
1822{ return v_dotprod_expand(a, b) + c; }
1823
1824 // 16 >> 64
1825 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
1826{
1827 __m256i mullo = _mm256_mullo_epi16(a.val, b.val);
1828 __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
1829 __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
1830 __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);
1831
1832 __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
1833 __m256i p13 = _mm256_srli_epi64(mul0, 32);
1834 __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
1835 __m256i p57 = _mm256_srli_epi64(mul1, 32);
1836
1837 __m256i p15_ = _mm256_add_epi64(p02, p13);
1838 __m256i p9d_ = _mm256_add_epi64(p46, p57);
1839
1840 return v_uint64x4(_mm256_add_epi64(
1841 _mm256_unpacklo_epi64(p15_, p9d_),
1842 _mm256_unpackhi_epi64(p15_, p9d_)
1843 ));
1844}
1845 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
1846{ return v_dotprod_expand(a, b) + c; }
1847
1848 inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
1849{
1850 __m256i prod = _mm256_madd_epi16(a.val, b.val);
1851 __m256i sign = _mm256_srai_epi32(prod, 31);
1852
1853 __m256i lo = _mm256_unpacklo_epi32(prod, sign);
1854 __m256i hi = _mm256_unpackhi_epi32(prod, sign);
1855
1856 return v_int64x4(_mm256_add_epi64(
1857 _mm256_unpacklo_epi64(lo, hi),
1858 _mm256_unpackhi_epi64(lo, hi)
1859 ));
1860}
1861 inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
1862{ return v_dotprod_expand(a, b) + c; }
1863
1864 // 32 >> 64f
1865 inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
1866{ return v_cvt_f64(v_dotprod(a, b)); }
1867 inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
1868{ return v_dotprod_expand(a, b) + c; }
1869
1870 //////// Fast Dot Product ////////
1871
1872 // 16 >> 32
1873 inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
1874{ return v_dotprod(a, b); }
1875 inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1876{ return v_dotprod(a, b, c); }
1877
1878 // 32 >> 64
1879 inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
1880{ return v_dotprod(a, b); }
1881 inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
1882{ return v_dotprod(a, b, c); }
1883
1884 // 8 >> 32
1885 inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b)
1886{ return v_dotprod_expand(a, b); }
1887 inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
1888{ return v_dotprod_expand(a, b, c); }
1889
1890 inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b)
1891{ return v_dotprod_expand(a, b); }
1892 inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
1893{ return v_dotprod_expand(a, b, c); }
1894
1895 // 16 >> 64
1896 inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
1897{
1898 __m256i mullo = _mm256_mullo_epi16(a.val, b.val);
1899 __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
1900 __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
1901 __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);
1902
1903 __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
1904 __m256i p13 = _mm256_srli_epi64(mul0, 32);
1905 __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
1906 __m256i p57 = _mm256_srli_epi64(mul1, 32);
1907
1908 __m256i p15_ = _mm256_add_epi64(p02, p13);
1909 __m256i p9d_ = _mm256_add_epi64(p46, p57);
1910
1911 return v_uint64x4(_mm256_add_epi64(p15_, p9d_));
1912}
1913 inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
1914{ return v_dotprod_expand_fast(a, b) + c; }
1915
1916 inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
1917{
1918 __m256i prod = _mm256_madd_epi16(a.val, b.val);
1919 __m256i sign = _mm256_srai_epi32(prod, 31);
1920 __m256i lo = _mm256_unpacklo_epi32(prod, sign);
1921 __m256i hi = _mm256_unpackhi_epi32(prod, sign);
1922 return v_int64x4(_mm256_add_epi64(lo, hi));
1923}
1924 inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
1925{ return v_dotprod_expand_fast(a, b) + c; }
1926
1927 // 32 >> 64f
1928 inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
1929{ return v_dotprod_expand(a, b); }
1930 inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
1931{ return v_dotprod_expand(a, b, c); }
1932
1933 #define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \
1934 v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
1935
1936 inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
1937 const v_float32x8& m1, const v_float32x8& m2,
1938 const v_float32x8& m3)
1939{
1940 v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
1941 v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
1942 v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
1943 v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
1944 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
1945}
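// v_matmul processes the two 128-bit halves independently: each half holds a
// 4-element vector, and mK supplies the coefficients that multiply element K of that
// half. Per half, dst[k] = v[0]*m0[k] + v[1]*m1[k] + v[2]*m2[k] + v[3]*m3[k];
// v_matmuladd below replaces the v[3]*m3 term with a plain addition of a.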
1946
1947 inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
1948 const v_float32x8& m1, const v_float32x8& m2,
1949 const v_float32x8& a)
1950{
1951 v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
1952 v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
1953 v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
1954 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
1955}
1956
1957 #define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
1958 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1959 const _Tpvec& a2, const _Tpvec& a3, \
1960 _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1961 { \
1962 __m256i t0 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val)); \
1963 __m256i t1 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val)); \
1964 __m256i t2 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val)); \
1965 __m256i t3 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val)); \
1966 b0.val = cast_to(_mm256_unpacklo_epi64(t0, t1)); \
1967 b1.val = cast_to(_mm256_unpackhi_epi64(t0, t1)); \
1968 b2.val = cast_to(_mm256_unpacklo_epi64(t2, t3)); \
1969 b3.val = cast_to(_mm256_unpackhi_epi64(t2, t3)); \
1970 }
1971
1972OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1973OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1974OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps)
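// v_transpose4x4 transposes two 4x4 blocks at once, one per 128-bit half: within each
// half, b_k[j] = a_j[k] for j, k = 0..3 (all unpack instructions operate per lane).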
1975
1976
1977
1978 /* Expand */
1979 #define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
1980 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1981 { \
1982 b0.val = intrin(_v256_extract_low(a.val)); \
1983 b1.val = intrin(_v256_extract_high(a.val)); \
1984 } \
1985 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1986 { return _Tpwvec(intrin(_v256_extract_low(a.val))); } \
1987 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1988 { return _Tpwvec(intrin(_v256_extract_high(a.val))); } \
1989 inline _Tpwvec v256_load_expand(const _Tp* ptr) \
1990 { \
1991 __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
1992 return _Tpwvec(intrin(a)); \
1993 }
1994
1995OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32, v_uint16x16, uchar, _mm256_cvtepu8_epi16)
1996OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32, v_int16x16, schar, _mm256_cvtepi8_epi16)
1997OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8, ushort, _mm256_cvtepu16_epi32)
1998OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16, v_int32x8, short, _mm256_cvtepi16_epi32)
1999OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8, v_uint64x4, unsigned, _mm256_cvtepu32_epi64)
2000OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8, v_int64x4, int, _mm256_cvtepi32_epi64)
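// Scalar view of the expand family (n = number of lanes of the narrow type):
//   v_expand(a, b0, b1):    b0[i] = widen(a[i]),  b1[i] = widen(a[i + n/2])
//   v_expand_low(a)[i]      = widen(a[i])
//   v_expand_high(a)[i]     = widen(a[i + n/2])
//   v256_load_expand(ptr)   widens the first n/2 elements loaded from ptr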
2001
2002 #define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin) \
2003 inline _Tpvec v256_load_expand_q(const _Tp* ptr) \
2004 { \
2005 __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
2006 return _Tpvec(intrin(a)); \
2007 }
2008
2009OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32)
2010OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8, schar, _mm256_cvtepi8_epi32)
2011
2012 /* pack */
2013 // 16
2014 inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
2015{ return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); }
2016
2017 inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
2018{
2019 __m256i t = _mm256_set1_epi16(255);
2020 __m256i a1 = _mm256_min_epu16(a.val, t);
2021 __m256i b1 = _mm256_min_epu16(b.val, t);
2022 return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a1, b1)));
2023}
2024
2025 inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
2026{
2027 return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val)));
2028}
2029
2030 inline void v_pack_store(schar* ptr, const v_int16x16& a)
2031{ v_store_low(ptr, v_pack(a, a)); }
2032
2033 inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
2034{
2035 const __m256i m = _mm256_set1_epi16(255);
2036 __m256i am = _mm256_min_epu16(a.val, m);
2037 am = _v256_shuffle_odd_64(_mm256_packus_epi16(am, am));
2038 v_store_low(ptr, v_uint8x32(am));
2039}
2040
2041 inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
2042{ v_store_low(ptr, v_pack_u(a, a)); }
2043
2044 template<int n> inline
2045v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
2046{
2047 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
2048 v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
2049 return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
2050 v_reinterpret_as_s16((b + delta) >> n));
2051}
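// The v_rshr_pack family applies a rounding right shift before the saturating pack.
// A scalar sketch of the intended per-lane result for the uint16 -> uint8 case above
// (illustrative only; the vector code adds the bias in 16-bit lanes, so the biased
// value is assumed to still fit in 16 bits):
//
//   unsigned char rshr_pack_u16(unsigned v, int n)      // 0 < n <= 16
//   {
//       unsigned r = (v + (1u << (n - 1))) >> n;        // round half up
//       return (unsigned char)(r > 255 ? 255 : r);      // saturate to uchar
//   }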
2052
2053 template<int n> inline
2054 void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
2055{
2056 v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
2057 v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
2058}
2059
2060 template<int n> inline
2061v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
2062{
2063 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2064 return v_pack_u((a + delta) >> n, (b + delta) >> n);
2065}
2066
2067 template<int n> inline
2068 void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
2069{
2070 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2071 v_pack_u_store(ptr, (a + delta) >> n);
2072}
2073
2074 template<int n> inline
2075v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
2076{
2077 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2078 return v_pack((a + delta) >> n, (b + delta) >> n);
2079}
2080
2081 template<int n> inline
2082 void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
2083{
2084 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2085 v_pack_store(ptr, (a + delta) >> n);
2086}
2087
2088 // 32
2089 inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
2090{ return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); }
2091
2092 inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
2093{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }
2094
2095 inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
2096{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }
2097
2098 inline void v_pack_store(short* ptr, const v_int32x8& a)
2099{ v_store_low(ptr, v_pack(a, a)); }
2100
2101 inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
2102{
2103 const __m256i m = _mm256_set1_epi32(65535);
2104 __m256i am = _mm256_min_epu32(a.val, m);
2105 am = _v256_shuffle_odd_64(_mm256_packus_epi32(am, am));
2106 v_store_low(ptr, v_uint16x16(am));
2107}
2108
2109 inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
2110{ v_store_low(ptr, v_pack_u(a, a)); }
2111
2112
2113 template<int n> inline
2114v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
2115{
2116 // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
2117 v_uint32x8 delta = v256_setall_u32(1 << (n-1));
2118 return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
2119 v_reinterpret_as_s32((b + delta) >> n));
2120}
2121
2122 template<int n> inline
2123 void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
2124{
2125 v_uint32x8 delta = v256_setall_u32(1 << (n-1));
2126 v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
2127}
2128
2129 template<int n> inline
2130v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
2131{
2132 v_int32x8 delta = v256_setall_s32(1 << (n-1));
2133 return v_pack_u((a + delta) >> n, (b + delta) >> n);
2134}
2135
2136 template<int n> inline
2137 void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
2138{
2139 v_int32x8 delta = v256_setall_s32(1 << (n-1));
2140 v_pack_u_store(ptr, (a + delta) >> n);
2141}
2142
2143 template<int n> inline
2144v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
2145{
2146 v_int32x8 delta = v256_setall_s32(1 << (n-1));
2147 return v_pack((a + delta) >> n, (b + delta) >> n);
2148}
2149
2150 template<int n> inline
2151 void v_rshr_pack_store(short* ptr, const v_int32x8& a)
2152{
2153 v_int32x8 delta = v256_setall_s32(1 << (n-1));
2154 v_pack_store(ptr, (a + delta) >> n);
2155}
2156
2157 // 64
2158 // Non-saturating pack
2159 inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
2160{
2161 __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
2162 __m256i b0 = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0));
2163 __m256i ab = _mm256_unpacklo_epi64(a0, b0); // a0, a1, b0, b1, a2, a3, b2, b3
2164 return v_uint32x8(_v256_shuffle_odd_64(ab));
2165}
2166
2167 inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
2168{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
2169
2170 inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
2171{
2172 __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
2173 v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
2174}
2175
2176 inline void v_pack_store(int* ptr, const v_int64x4& b)
2177{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
2178
2179 template<int n> inline
2180v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
2181{
2182 v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
2183 return v_pack((a + delta) >> n, (b + delta) >> n);
2184}
2185
2186 template<int n> inline
2187 void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
2188{
2189 v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
2190 v_pack_store(ptr, (a + delta) >> n);
2191}
2192
2193 template<int n> inline
2194v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
2195{
2196 v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
2197 return v_pack((a + delta) >> n, (b + delta) >> n);
2198}
2199
2200 template<int n> inline
2201 void v_rshr_pack_store(int* ptr, const v_int64x4& a)
2202{
2203 v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
2204 v_pack_store(ptr, (a + delta) >> n);
2205}
2206
2207 // pack boolean
2208 inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
2209{
2210 __m256i ab = _mm256_packs_epi16(a.val, b.val);
2211 return v_uint8x32(_v256_shuffle_odd_64(ab));
2212}
2213
2214 inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
2215 const v_uint32x8& c, const v_uint32x8& d)
2216{
2217 __m256i ab = _mm256_packs_epi32(a.val, b.val);
2218 __m256i cd = _mm256_packs_epi32(c.val, d.val);
2219
2220 __m256i abcd = _v256_shuffle_odd_64(_mm256_packs_epi16(ab, cd));
2221 return v_uint8x32(_mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0)));
2222}
2223
2224 inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
2225 const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
2226 const v_uint64x4& g, const v_uint64x4& h)
2227{
2228 __m256i ab = _mm256_packs_epi32(a.val, b.val);
2229 __m256i cd = _mm256_packs_epi32(c.val, d.val);
2230 __m256i ef = _mm256_packs_epi32(e.val, f.val);
2231 __m256i gh = _mm256_packs_epi32(g.val, h.val);
2232
2233 __m256i abcd = _mm256_packs_epi32(ab, cd);
2234 __m256i efgh = _mm256_packs_epi32(ef, gh);
2235 __m256i pkall = _v256_shuffle_odd_64(_mm256_packs_epi16(abcd, efgh));
2236
2237 __m256i rev = _mm256_alignr_epi8(pkall, pkall, 8);
2238 return v_uint8x32(_mm256_unpacklo_epi16(pkall, rev));
2239}
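// v_pack_b packs element masks (each source lane is expected to be 0 or all ones) into
// a byte mask: byte k of the result is 0xff exactly when element k of the concatenated
// inputs (a, b[, c, d, ...]) is all ones. The extra shuffles/permutes only restore the
// natural element order that the per-lane pack instructions would otherwise interleave.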
2240
2241 /* Recombine */
2242 // it's implemented above, together with the load and store operations
2243
2244 /* Extract */
2245 #define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec) \
2246 template<int s> \
2247 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2248 { return v_rotate_right<s>(a, b); }
2249
2250OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32)
2251OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32)
2252OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16)
2253OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16)
2254OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8)
2255OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8)
2256OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4)
2257OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
2258OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
2259OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
2260
2261 template<int i>
2262 inline uchar v_extract_n(v_uint8x32 a)
2263{
2264 return (uchar)_v256_extract_epi8<i>(a.val);
2265}
2266
2267 template<int i>
2268 inline schar v_extract_n(v_int8x32 a)
2269{
2270 return (schar)v_extract_n<i>(v_reinterpret_as_u8(a));
2271}
2272
2273 template<int i>
2274 inline ushort v_extract_n(v_uint16x16 a)
2275{
2276 return (ushort)_v256_extract_epi16<i>(a.val);
2277}
2278
2279 template<int i>
2280 inline short v_extract_n(v_int16x16 a)
2281{
2282 return (short)v_extract_n<i>(v_reinterpret_as_u16(a));
2283}
2284
2285 template<int i>
2286 inline uint v_extract_n(v_uint32x8 a)
2287{
2288 return (uint)_v256_extract_epi32<i>(a.val);
2289}
2290
2291 template<int i>
2292 inline int v_extract_n(v_int32x8 a)
2293{
2294 return (int)v_extract_n<i>(v_reinterpret_as_u32(a));
2295}
2296
2297 template<int i>
2298 inline uint64 v_extract_n(v_uint64x4 a)
2299{
2300 return (uint64)_v256_extract_epi64<i>(a.val);
2301}
2302
2303 template<int i>
2304 inline int64 v_extract_n(v_int64x4 v)
2305{
2306 return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
2307}
2308
2309 template<int i>
2310 inline float v_extract_n(v_float32x8 v)
2311{
2312 union { uint iv; float fv; } d;
2313 d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
2314 return d.fv;
2315}
2316
2317 template<int i>
2318 inline double v_extract_n(v_float64x4 v)
2319{
2320 union { uint64 iv; double dv; } d;
2321 d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
2322 return d.dv;
2323}
2324
2325 template<int i>
2326 inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
2327{
2328 static const __m256i perm = _mm256_set1_epi32((char)i);
2329 return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
2330}
2331
2332 template<int i>
2333 inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
2334{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2335
2336 template<int i>
2337 inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
2338{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2339
2340
2341 ///////////////////// load deinterleave /////////////////////////////
2342
2343 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
2344{
2345 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2346 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2347
2348 const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
2349 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
2350 __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
2351 __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
2352 __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2353 __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2354 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2355 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2356 a = v_uint8x32(a0);
2357 b = v_uint8x32(b0);
2358}
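// Scalar view of the two-channel deinterleave: a[i] = ptr[2*i], b[i] = ptr[2*i + 1].
// The byte shuffle separates even and odd bytes within each 128-bit lane, and the
// permute2x128/unpack pair stitches the four half-lane results back into channel order;
// the 16-, 32- and 64-bit overloads below follow the same pattern with wider shuffles.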
2359
2360 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
2361{
2362 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2363 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2364
2365 const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2366 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
2367 __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
2368 __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
2369 __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2370 __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2371 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2372 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2373 a = v_uint16x16(a0);
2374 b = v_uint16x16(b0);
2375}
2376
2377 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
2378{
2379 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2380 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2381
2382 const int sh = 0+2*4+1*16+3*64;
2383 __m256i p0 = _mm256_shuffle_epi32(ab0, sh);
2384 __m256i p1 = _mm256_shuffle_epi32(ab1, sh);
2385 __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2386 __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2387 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2388 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2389 a = v_uint32x8(a0);
2390 b = v_uint32x8(b0);
2391}
2392
2393 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
2394{
2395 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2396 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
2397
2398 __m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16);
2399 __m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16);
2400 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2401 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2402 a = v_uint64x4(a0);
2403 b = v_uint64x4(b0);
2404}
2405
2406 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c )
2407{
2408 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2409 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2410 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
2411
2412 __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
2413 __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
2414
2415 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2416 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2417 const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2418 -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
2419
2420 __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
2421 __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
2422 __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
2423
2424 const __m256i
2425 sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
2426 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
2427 sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
2428 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
2429 sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
2430 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
2431 b0 = _mm256_shuffle_epi8(b0, sh_b);
2432 g0 = _mm256_shuffle_epi8(g0, sh_g);
2433 r0 = _mm256_shuffle_epi8(r0, sh_r);
2434
2435 a = v_uint8x32(b0);
2436 b = v_uint8x32(g0);
2437 c = v_uint8x32(r0);
2438}
2439
2440 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c )
2441{
2442 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2443 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2444 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2445
2446 __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
2447 __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
2448
2449 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2450 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2451 const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2452 -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2453 __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
2454 __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
2455 __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
2456 const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2457 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2458 const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
2459 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
2460 const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2461 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2462 b0 = _mm256_shuffle_epi8(b0, sh_b);
2463 g0 = _mm256_shuffle_epi8(g0, sh_g);
2464 r0 = _mm256_shuffle_epi8(r0, sh_r);
2465
2466 a = v_uint16x16(b0);
2467 b = v_uint16x16(g0);
2468 c = v_uint16x16(r0);
2469}
2470
2471 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
2472{
2473 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2474 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2475 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2476
2477 __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
2478 __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
2479
2480 __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92);
2481 __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
2482 __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);
2483
2484 b0 = _mm256_shuffle_epi32(b0, 0x6c);
2485 g0 = _mm256_shuffle_epi32(g0, 0xb1);
2486 r0 = _mm256_shuffle_epi32(r0, 0xc6);
2487
2488 a = v_uint32x8(b0);
2489 b = v_uint32x8(g0);
2490 c = v_uint32x8(r0);
2491}
2492
2493 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c )
2494{
2495 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2496 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
2497 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2498
2499 __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0);
2500 __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0);
2501 __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b);
2502 __m256i b0 = _mm256_unpacklo_epi64(s01, s20r);
2503 __m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
2504 __m256i r0 = _mm256_unpackhi_epi64(s20r, s12);
2505
2506 a = v_uint64x4(b0);
2507 b = v_uint64x4(g0);
2508 c = v_uint64x4(r0);
2509}
2510
2511 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d )
2512{
2513 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2514 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2515 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
2516 __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
2517 const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
2518 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
2519
2520 __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
2521 __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
2522 __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
2523 __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
2524
2525 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2526 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2527 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2528 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2529
2530 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2531 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2532 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2533 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2534
2535 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2536 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2537 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2538 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2539
2540 a = v_uint8x32(b0);
2541 b = v_uint8x32(g0);
2542 c = v_uint8x32(r0);
2543 d = v_uint8x32(a0);
2544}
2545
2546 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d )
2547{
2548 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2549 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2550 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2551 __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));
2552 const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
2553 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
2554 __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
2555 __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
2556 __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
2557 __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
2558
2559 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2560 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2561 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2562 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2563
2564 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2565 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2566 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2567 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2568
2569 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2570 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2571 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2572 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2573
2574 a = v_uint16x16(b0);
2575 b = v_uint16x16(g0);
2576 c = v_uint16x16(r0);
2577 d = v_uint16x16(a0);
2578}
2579
2580 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
2581{
2582 __m256i p0 = _mm256_loadu_si256((const __m256i*)ptr);
2583 __m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2584 __m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2585 __m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24));
2586
2587 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2588 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2589 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2590 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2591
2592 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2593 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2594 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2595 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2596
2597 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2598 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2599 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2600 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2601
2602 a = v_uint32x8(b0);
2603 b = v_uint32x8(g0);
2604 c = v_uint32x8(r0);
2605 d = v_uint32x8(a0);
2606}
2607
2608 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
2609{
2610 __m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr);
2611 __m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
2612 __m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2613 __m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12));
2614
2615 __m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16);
2616 __m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16);
2617 __m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16);
2618 __m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16);
2619
2620 __m256i b0 = _mm256_unpacklo_epi64(l02, l13);
2621 __m256i g0 = _mm256_unpackhi_epi64(l02, l13);
2622 __m256i r0 = _mm256_unpacklo_epi64(h02, h13);
2623 __m256i a0 = _mm256_unpackhi_epi64(h02, h13);
2624
2625 a = v_uint64x4(b0);
2626 b = v_uint64x4(g0);
2627 c = v_uint64x4(r0);
2628 d = v_uint64x4(a0);
2629}
2630
2631 ///////////////////////////// store interleave /////////////////////////////////////
2632
2633 inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
2634 hal::StoreMode mode=hal::STORE_UNALIGNED )
2635{
2636 __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
2637 __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);
2638
2639 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2640 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2641
2642 if( mode == hal::STORE_ALIGNED_NOCACHE )
2643 {
2644 _mm256_stream_si256((__m256i*)ptr, xy0);
2645 _mm256_stream_si256((__m256i*)(ptr + 32), xy1);
2646 }
2647 else if( mode == hal::STORE_ALIGNED )
2648 {
2649 _mm256_store_si256((__m256i*)ptr, xy0);
2650 _mm256_store_si256((__m256i*)(ptr + 32), xy1);
2651 }
2652 else
2653 {
2654 _mm256_storeu_si256((__m256i*)ptr, xy0);
2655 _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
2656 }
2657}
2658
2659 inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
2660 hal::StoreMode mode=hal::STORE_UNALIGNED )
2661{
2662 __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val);
2663 __m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val);
2664
2665 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2666 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2667
2668 if( mode == hal::STORE_ALIGNED_NOCACHE )
2669 {
2670 _mm256_stream_si256((__m256i*)ptr, xy0);
2671 _mm256_stream_si256((__m256i*)(ptr + 16), xy1);
2672 }
2673 else if( mode == hal::STORE_ALIGNED )
2674 {
2675 _mm256_store_si256((__m256i*)ptr, xy0);
2676 _mm256_store_si256((__m256i*)(ptr + 16), xy1);
2677 }
2678 else
2679 {
2680 _mm256_storeu_si256((__m256i*)ptr, xy0);
2681 _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
2682 }
2683}
2684
2685 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
2686 hal::StoreMode mode=hal::STORE_UNALIGNED )
2687{
2688 __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val);
2689 __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val);
2690
2691 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2692 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2693
2694 if( mode == hal::STORE_ALIGNED_NOCACHE )
2695 {
2696 _mm256_stream_si256((__m256i*)ptr, xy0);
2697 _mm256_stream_si256((__m256i*)(ptr + 8), xy1);
2698 }
2699 else if( mode == hal::STORE_ALIGNED )
2700 {
2701 _mm256_store_si256((__m256i*)ptr, xy0);
2702 _mm256_store_si256((__m256i*)(ptr + 8), xy1);
2703 }
2704 else
2705 {
2706 _mm256_storeu_si256((__m256i*)ptr, xy0);
2707 _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
2708 }
2709}
2710
2711 inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
2712 hal::StoreMode mode=hal::STORE_UNALIGNED )
2713{
2714 __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val);
2715 __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val);
2716
2717 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2718 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2719
2720 if( mode == hal::STORE_ALIGNED_NOCACHE )
2721 {
2722 _mm256_stream_si256((__m256i*)ptr, xy0);
2723 _mm256_stream_si256((__m256i*)(ptr + 4), xy1);
2724 }
2725 else if( mode == hal::STORE_ALIGNED )
2726 {
2727 _mm256_store_si256((__m256i*)ptr, xy0);
2728 _mm256_store_si256((__m256i*)(ptr + 4), xy1);
2729 }
2730 else
2731 {
2732 _mm256_storeu_si256((__m256i*)ptr, xy0);
2733 _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
2734 }
2735}
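// Scalar view of the two-channel interleaved store: ptr[2*i] = x[i], ptr[2*i + 1] = y[i].
// The three branches only pick the store flavour: _mm256_stream_si256 for aligned
// non-temporal stores, _mm256_store_si256 for aligned stores, _mm256_storeu_si256 otherwise.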
2736
2737 inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c,
2738 hal::StoreMode mode=hal::STORE_UNALIGNED )
2739{
2740 const __m256i sh_b = _mm256_setr_epi8(
2741 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
2742 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2743 const __m256i sh_g = _mm256_setr_epi8(
2744 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
2745 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2746 const __m256i sh_r = _mm256_setr_epi8(
2747 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
2748 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2749
2750 __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
2751 __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
2752 __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
2753
2754 const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2755 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2756 const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2757 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2758
2759 __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
2760 __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
2761 __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
2762
2763 __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2764 __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
2765 __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);
2766
2767 if( mode == hal::STORE_ALIGNED_NOCACHE )
2768 {
2769 _mm256_stream_si256((__m256i*)ptr, bgr0);
2770 _mm256_stream_si256((__m256i*)(ptr + 32), bgr1);
2771 _mm256_stream_si256((__m256i*)(ptr + 64), bgr2);
2772 }
2773 else if( mode == hal::STORE_ALIGNED )
2774 {
2775 _mm256_store_si256((__m256i*)ptr, bgr0);
2776 _mm256_store_si256((__m256i*)(ptr + 32), bgr1);
2777 _mm256_store_si256((__m256i*)(ptr + 64), bgr2);
2778 }
2779 else
2780 {
2781 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2782 _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
2783 _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
2784 }
2785}
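The byte shuffles and blend masks above assemble packed b,g,r triples directly; a user of the universal intrinsics never touches them. The hedged usage sketch below is an editorial illustration, not part of the header, and assumes an AVX2 build in which this file is reached through opencv2/core/hal/intrin.hpp and CV_SIMD256 is 1; it interleaves three 32-byte planes into packed BGR with the function defined above.

    #include <opencv2/core/hal/intrin.hpp>
    #include <cstdio>

    int main()
    {
    #if CV_SIMD256
        unsigned char b[32], g[32], r[32], bgr[96];
        for (int i = 0; i < 32; i++) { b[i] = (unsigned char)i; g[i] = (unsigned char)(i + 64); r[i] = (unsigned char)(i + 128); }

        cv::v_uint8x32 vb = cv::v256_load(b);
        cv::v_uint8x32 vg = cv::v256_load(g);
        cv::v_uint8x32 vr = cv::v256_load(r);

        cv::v_store_interleave(bgr, vb, vg, vr);       // bgr = b0,g0,r0, b1,g1,r1, ...

        printf("%d %d %d\n", bgr[0], bgr[1], bgr[2]);  // expected: 0 64 128
    #endif
        return 0;
    }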
2786
2787 inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c,
2788 hal::StoreMode mode=hal::STORE_UNALIGNED )
2789{
2790 const __m256i sh_b = _mm256_setr_epi8(
2791 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2792 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2793 const __m256i sh_g = _mm256_setr_epi8(
2794 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
2795 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2796 const __m256i sh_r = _mm256_setr_epi8(
2797 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2798 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2799
2800 __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
2801 __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
2802 __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
2803
2804 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2805 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2806 const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2807 -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2808
2809 __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
2810 __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
2811 __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
2812
2813 __m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16);
2814 //__m256i bgr1 = p1;
2815 __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);
2816
2817 if( mode == hal::STORE_ALIGNED_NOCACHE )
2818 {
2819 _mm256_stream_si256((__m256i*)ptr, bgr0);
2820 _mm256_stream_si256((__m256i*)(ptr + 16), p1);
2821 _mm256_stream_si256((__m256i*)(ptr + 32), bgr2);
2822 }
2823 else if( mode == hal::STORE_ALIGNED )
2824 {
2825 _mm256_store_si256((__m256i*)ptr, bgr0);
2826 _mm256_store_si256((__m256i*)(ptr + 16), p1);
2827 _mm256_store_si256((__m256i*)(ptr + 32), bgr2);
2828 }
2829 else
2830 {
2831 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2832 _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
2833 _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
2834 }
2835}
2836
2837 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c,
2838 hal::StoreMode mode=hal::STORE_UNALIGNED )
2839{
2840 __m256i b0 = _mm256_shuffle_epi32(a.val, 0x6c);
2841 __m256i g0 = _mm256_shuffle_epi32(b.val, 0xb1);
2842 __m256i r0 = _mm256_shuffle_epi32(c.val, 0xc6);
2843
2844 __m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24);
2845 __m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
2846 __m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24);
2847
2848 __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2849 //__m256i bgr1 = p2;
2850 __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2851
2852 if( mode == hal::STORE_ALIGNED_NOCACHE )
2853 {
2854 _mm256_stream_si256((__m256i*)ptr, bgr0);
2855 _mm256_stream_si256((__m256i*)(ptr + 8), p2);
2856 _mm256_stream_si256((__m256i*)(ptr + 16), bgr2);
2857 }
2858 else if( mode == hal::STORE_ALIGNED )
2859 {
2860 _mm256_store_si256((__m256i*)ptr, bgr0);
2861 _mm256_store_si256((__m256i*)(ptr + 8), p2);
2862 _mm256_store_si256((__m256i*)(ptr + 16), bgr2);
2863 }
2864 else
2865 {
2866 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2867 _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
2868 _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
2869 }
2870}
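The 32-bit three-channel variant above hides its per-lane reorderings in raw shuffle immediates. As a readability aid (an editorial note, not part of the header), those immediates decode to the following _MM_SHUFFLE forms, which can be checked at compile time:

    #include <immintrin.h>   // provides _MM_SHUFFLE

    static_assert(0x6c == _MM_SHUFFLE(1, 2, 3, 0), "immediate used for b0");
    static_assert(0xb1 == _MM_SHUFFLE(2, 3, 0, 1), "immediate used for g0");
    static_assert(0xc6 == _MM_SHUFFLE(3, 0, 1, 2), "immediate used for r0");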
2871
2872 inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
2873 hal::StoreMode mode=hal::STORE_UNALIGNED )
2874{
2875 __m256i s01 = _mm256_unpacklo_epi64(a.val, b.val);
2876 __m256i s12 = _mm256_unpackhi_epi64(b.val, c.val);
2877 __m256i s20 = _mm256_blend_epi32(c.val, a.val, 0xcc);
2878
2879 __m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16);
2880 __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
2881 __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);
2882
2883 if( mode == hal::STORE_ALIGNED_NOCACHE )
2884 {
2885 _mm256_stream_si256((__m256i*)ptr, bgr0);
2886 _mm256_stream_si256((__m256i*)(ptr + 4), bgr1);
2887 _mm256_stream_si256((__m256i*)(ptr + 8), bgr2);
2888 }
2889 else if( mode == hal::STORE_ALIGNED )
2890 {
2891 _mm256_store_si256((__m256i*)ptr, bgr0);
2892 _mm256_store_si256((__m256i*)(ptr + 4), bgr1);
2893 _mm256_store_si256((__m256i*)(ptr + 8), bgr2);
2894 }
2895 else
2896 {
2897 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2898 _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
2899 _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
2900 }
2901}
2902
2903 inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b,
2904 const v_uint8x32& c, const v_uint8x32& d,
2905 hal::StoreMode mode=hal::STORE_UNALIGNED )
2906{
2907 __m256i bg0 = _mm256_unpacklo_epi8(a.val, b.val);
2908 __m256i bg1 = _mm256_unpackhi_epi8(a.val, b.val);
2909 __m256i ra0 = _mm256_unpacklo_epi8(c.val, d.val);
2910 __m256i ra1 = _mm256_unpackhi_epi8(c.val, d.val);
2911
2912 __m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0);
2913 __m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
2914 __m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
2915 __m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);
2916
2917 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2918 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2919 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2920 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2921
2922 if( mode == hal::STORE_ALIGNED_NOCACHE )
2923 {
2924 _mm256_stream_si256((__m256i*)ptr, bgra0);
2925 _mm256_stream_si256((__m256i*)(ptr + 32), bgra1);
2926 _mm256_stream_si256((__m256i*)(ptr + 64), bgra2);
2927 _mm256_stream_si256((__m256i*)(ptr + 96), bgra3);
2928 }
2929 else if( mode == hal::STORE_ALIGNED )
2930 {
2931 _mm256_store_si256((__m256i*)ptr, bgra0);
2932 _mm256_store_si256((__m256i*)(ptr + 32), bgra1);
2933 _mm256_store_si256((__m256i*)(ptr + 64), bgra2);
2934 _mm256_store_si256((__m256i*)(ptr + 96), bgra3);
2935 }
2936 else
2937 {
2938 _mm256_storeu_si256((__m256i*)ptr, bgra0);
2939 _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
2940 _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
2941 _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
2942 }
2943}
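The cascade of 8-bit and 16-bit unpacks above builds b,g,r,a quads inside each 128-bit lane, and the final permute2x128 calls put the four output blocks in memory order. For reference, an editorial scalar sketch of the layout this function produces (not part of the header):

    // Scalar equivalent of the 4-channel uchar store: 32 interleaved quads.
    static void store_interleave_4ch_scalar(unsigned char* ptr,
                                            const unsigned char a[32],
                                            const unsigned char b[32],
                                            const unsigned char c[32],
                                            const unsigned char d[32])
    {
        for (int i = 0; i < 32; i++)
        {
            ptr[4*i + 0] = a[i];
            ptr[4*i + 1] = b[i];
            ptr[4*i + 2] = c[i];
            ptr[4*i + 3] = d[i];
        }
    }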
2944
2945 inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b,
2946 const v_uint16x16& c, const v_uint16x16& d,
2947 hal::StoreMode mode=hal::STORE_UNALIGNED )
2948{
2949 __m256i bg0 = _mm256_unpacklo_epi16(a.val, b.val);
2950 __m256i bg1 = _mm256_unpackhi_epi16(a.val, b.val);
2951 __m256i ra0 = _mm256_unpacklo_epi16(c.val, d.val);
2952 __m256i ra1 = _mm256_unpackhi_epi16(c.val, d.val);
2953
2954 __m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0);
2955 __m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
2956 __m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1);
2957 __m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1);
2958
2959 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2960 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2961 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2962 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2963
2964 if( mode == hal::STORE_ALIGNED_NOCACHE )
2965 {
2966 _mm256_stream_si256((__m256i*)ptr, bgra0);
2967 _mm256_stream_si256((__m256i*)(ptr + 16), bgra1);
2968 _mm256_stream_si256((__m256i*)(ptr + 32), bgra2);
2969 _mm256_stream_si256((__m256i*)(ptr + 48), bgra3);
2970 }
2971 else if( mode == hal::STORE_ALIGNED )
2972 {
2973 _mm256_store_si256((__m256i*)ptr, bgra0);
2974 _mm256_store_si256((__m256i*)(ptr + 16), bgra1);
2975 _mm256_store_si256((__m256i*)(ptr + 32), bgra2);
2976 _mm256_store_si256((__m256i*)(ptr + 48), bgra3);
2977 }
2978 else
2979 {
2980 _mm256_storeu_si256((__m256i*)ptr, bgra0);
2981 _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
2982 _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
2983 _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
2984 }
2985}
2986
2987 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b,
2988 const v_uint32x8& c, const v_uint32x8& d,
2989 hal::StoreMode mode=hal::STORE_UNALIGNED )
2990{
2991 __m256i bg0 = _mm256_unpacklo_epi32(a.val, b.val);
2992 __m256i bg1 = _mm256_unpackhi_epi32(a.val, b.val);
2993 __m256i ra0 = _mm256_unpacklo_epi32(c.val, d.val);
2994 __m256i ra1 = _mm256_unpackhi_epi32(c.val, d.val);
2995
2996 __m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0);
2997 __m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0);
2998 __m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1);
2999 __m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1);
3000
3001 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
3002 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
3003 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
3004 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
3005
3006 if( mode == hal::STORE_ALIGNED_NOCACHE )
3007 {
3008 _mm256_stream_si256((__m256i*)ptr, bgra0);
3009 _mm256_stream_si256((__m256i*)(ptr + 8), bgra1);
3010 _mm256_stream_si256((__m256i*)(ptr + 16), bgra2);
3011 _mm256_stream_si256((__m256i*)(ptr + 24), bgra3);
3012 }
3013 else if( mode == hal::STORE_ALIGNED )
3014 {
3015 _mm256_store_si256((__m256i*)ptr, bgra0);
3016 _mm256_store_si256((__m256i*)(ptr + 8), bgra1);
3017 _mm256_store_si256((__m256i*)(ptr + 16), bgra2);
3018 _mm256_store_si256((__m256i*)(ptr + 24), bgra3);
3019 }
3020 else
3021 {
3022 _mm256_storeu_si256((__m256i*)ptr, bgra0);
3023 _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
3024 _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
3025 _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
3026 }
3027}
3028
3029 inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b,
3030 const v_uint64x4& c, const v_uint64x4& d,
3031 hal::StoreMode mode=hal::STORE_UNALIGNED )
3032{
3033 __m256i bg0 = _mm256_unpacklo_epi64(a.val, b.val);
3034 __m256i bg1 = _mm256_unpackhi_epi64(a.val, b.val);
3035 __m256i ra0 = _mm256_unpacklo_epi64(c.val, d.val);
3036 __m256i ra1 = _mm256_unpackhi_epi64(c.val, d.val);
3037
3038 __m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16);
3039 __m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16);
3040 __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16);
3041 __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16);
3042
3043 if( mode == hal::STORE_ALIGNED_NOCACHE )
3044 {
3045 _mm256_stream_si256((__m256i*)ptr, bgra0);
3046 _mm256_stream_si256((__m256i*)(ptr + 4), bgra1);
3047 _mm256_stream_si256((__m256i*)(ptr + 8), bgra2);
3048 _mm256_stream_si256((__m256i*)(ptr + 12), bgra3);
3049 }
3050 else if( mode == hal::STORE_ALIGNED )
3051 {
3052 _mm256_store_si256((__m256i*)ptr, bgra0);
3053 _mm256_store_si256((__m256i*)(ptr + 4), bgra1);
3054 _mm256_store_si256((__m256i*)(ptr + 8), bgra2);
3055 _mm256_store_si256((__m256i*)(ptr + 12), bgra3);
3056 }
3057 else
3058 {
3059 _mm256_storeu_si256((__m256i*)ptr, bgra0);
3060 _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
3061 _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
3062 _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
3063 }
3064}
3065
3066 #define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
3067 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
3068 { \
3069 _Tpvec1 a1, b1; \
3070 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
3071 a0 = v_reinterpret_as_##suffix0(a1); \
3072 b0 = v_reinterpret_as_##suffix0(b1); \
3073 } \
3074 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
3075 { \
3076 _Tpvec1 a1, b1, c1; \
3077 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
3078 a0 = v_reinterpret_as_##suffix0(a1); \
3079 b0 = v_reinterpret_as_##suffix0(b1); \
3080 c0 = v_reinterpret_as_##suffix0(c1); \
3081 } \
3082 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
3083 { \
3084 _Tpvec1 a1, b1, c1, d1; \
3085 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
3086 a0 = v_reinterpret_as_##suffix0(a1); \
3087 b0 = v_reinterpret_as_##suffix0(b1); \
3088 c0 = v_reinterpret_as_##suffix0(c1); \
3089 d0 = v_reinterpret_as_##suffix0(d1); \
3090 } \
3091 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
3092 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3093 { \
3094 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3095 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3096 v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
3097 } \
3098 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
3099 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3100 { \
3101 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3102 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3103 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3104 v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
3105 } \
3106 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
3107 const _Tpvec0& c0, const _Tpvec0& d0, \
3108 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3109 { \
3110 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3111 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3112 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3113 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
3114 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
3115 }
3116
3117OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
3118OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16)
3119OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32)
3120OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32)
3121OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
3122OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
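The macro above, together with the instantiations that follow it, supplies the signed-integer and floating-point overloads by reinterpreting to the matching unsigned vectors and reusing the implementations defined earlier in this file. A hedged usage sketch (editorial, assuming an AVX2 build with CV_SIMD256 == 1):

    #include <opencv2/core/hal/intrin.hpp>

    #if CV_SIMD256
    // Interleave two float planes of 8 elements each into dst = x0,y0,x1,y1,...,x7,y7
    // (dst must hold 16 floats). Internally this goes through v_reinterpret_as_u32
    // and the v_uint32x8 overload defined above.
    void interleave_xy(float* dst, const float* x, const float* y)
    {
        cv::v_float32x8 vx = cv::v256_load(x);
        cv::v_float32x8 vy = cv::v256_load(y);
        cv::v_store_interleave(dst, vx, vy);
    }
    #endif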
3123
3124 //
3125 // FP16
3126 //
3127
3128 inline v_float32x8 v256_load_expand(const float16_t* ptr)
3129{
3130 #if CV_FP16
3131 return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
3132 #else
3133 float CV_DECL_ALIGNED(32) buf[8];
3134 for (int i = 0; i < 8; i++)
3135 buf[i] = (float)ptr[i];
3136 return v256_load_aligned(buf);
3137 #endif
3138}
3139
3140 inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
3141{
3142 #if CV_FP16
3143 __m128i ah = _mm256_cvtps_ph(a.val, 0);
3144 _mm_storeu_si128((__m128i*)ptr, ah);
3145 #else
3146 float CV_DECL_ALIGNED(32) buf[8];
3147 v_store_aligned(buf, a);
3148 for (int i = 0; i < 8; i++)
3149 ptr[i] = float16_t(buf[i]);
3150 #endif
3151}
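The two FP16 helpers above widen eight half-precision values to float32 on load and narrow them back on store, using the F16C conversion intrinsics when CV_FP16 is 1 and the scalar loops otherwise. A hedged round-trip sketch (editorial; assumes an AVX2 build where this header is reached through opencv2/core/hal/intrin.hpp):

    #include <opencv2/core/hal/intrin.hpp>

    #if CV_SIMD256
    // Round-trip 8 half-precision values through float32; any float32 arithmetic
    // on 'v' would go between the two calls.
    void copy_fp16_via_f32(cv::float16_t* dst, const cv::float16_t* src)
    {
        cv::v_float32x8 v = cv::v256_load_expand(src);  // FP16 -> FP32 (or the scalar fallback)
        cv::v_pack_store(dst, v);                       // FP32 -> FP16
    }
    #endif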
3152
3153 //
3154 // end of FP16
3155 //
3156
3157 inline void v256_cleanup() { _mm256_zeroall(); }
3158
3159CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3160
3162
3163} // cv::
3164
3165 #endif // OPENCV_HAL_INTRIN_AVX_HPP