#ifndef OPENCV_HAL_INTRIN_AVX_HPP
#define OPENCV_HAL_INTRIN_AVX_HPP

#define CV_SIMD256_64F 1
#define CV_SIMD256_FP16 0

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
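///////// Utils ////////////
// Helpers for composing a 256-bit register from two 128-bit halves, extracting the
// halves again, and permuting 128-bit lanes; the wrapper types further below are
// built on top of these.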
inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
{ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); }

inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
{ return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); }

inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
{ return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); }

inline int _v_cvtsi256_si32(const __m256i& a)
{ return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); }

inline __m256i _v256_shuffle_odd_64(const __m256i& v)
{ return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); }

inline __m256d _v256_shuffle_odd_64(const __m256d& v)
{ return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); }
template<int imm>
inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
{ return _mm256_permute2x128_si256(a, b, imm); }

template<int imm>
inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
{ return _mm256_permute2f128_ps(a, b, imm); }

template<int imm>
inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
{ return _mm256_permute2f128_pd(a, b, imm); }

template<int imm, typename _Tpvec>
inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }

template<int imm>
inline __m256i _v256_permute4x64(const __m256i& a)
{ return _mm256_permute4x64_epi64(a, imm); }

template<int imm>
inline __m256d _v256_permute4x64(const __m256d& a)
{ return _mm256_permute4x64_pd(a, imm); }

template<int imm, typename _Tpvec>
inline _Tpvec v256_permute4x64(const _Tpvec& a)
{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
inline __m128i _v256_extract_high(const __m256i& v)
{ return _mm256_extracti128_si256(v, 1); }

inline __m128 _v256_extract_high(const __m256& v)
{ return _mm256_extractf128_ps(v, 1); }

inline __m128d _v256_extract_high(const __m256d& v)
{ return _mm256_extractf128_pd(v, 1); }

inline __m128i _v256_extract_low(const __m256i& v)
{ return _mm256_castsi256_si128(v); }

inline __m128 _v256_extract_low(const __m256& v)
{ return _mm256_castps256_ps128(v); }

inline __m128d _v256_extract_low(const __m256d& v)
{ return _mm256_castpd256_pd128(v); }
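// AVX2 has no unsigned-saturating 32->16 pack, so _v256_packs_epu32 clamps both
// inputs to 65535 first and then packs with _mm256_packus_epi32.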
inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
{
    const __m256i m = _mm256_set1_epi32(65535);
    __m256i am = _mm256_min_epu32(a, m);
    __m256i bm = _mm256_min_epu32(b, m);
    return _mm256_packus_epi32(am, bm);
}
template<int i>
inline int _v256_extract_epi8(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910))
    return _mm256_extract_epi8(a, i);
#else
    __m128i b = _mm256_extractf128_si256(a, ((i) >> 4));
    return _mm_extract_epi8(b, i & 15);
#endif
}

template<int i>
inline int _v256_extract_epi16(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910))
    return _mm256_extract_epi16(a, i);
#else
    __m128i b = _mm256_extractf128_si256(a, ((i) >> 3));
    return _mm_extract_epi16(b, i & 7);
#endif
}

template<int i>
inline int _v256_extract_epi32(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910))
    return _mm256_extract_epi32(a, i);
#else
    __m128i b = _mm256_extractf128_si256(a, ((i) >> 2));
    return _mm_extract_epi32(b, i & 3);
#endif
}

template<int i>
inline int64 _v256_extract_epi64(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910))
    return _mm256_extract_epi64(a, i);
#else
    __m128i b = _mm256_extractf128_si256(a, ((i) >> 1));
    return _mm_extract_epi64(b, i & 1);
#endif
}
///////// Types ////////////

struct v_uint8x32
{
    typedef uchar lane_type;
    enum { nlanes = 32 };
    __m256i val;

    explicit v_uint8x32(__m256i v) : val(v) {}
    v_uint8x32(uchar v0,  uchar v1,  uchar v2,  uchar v3,
               uchar v4,  uchar v5,  uchar v6,  uchar v7,
               uchar v8,  uchar v9,  uchar v10, uchar v11,
               uchar v12, uchar v13, uchar v14, uchar v15,
               uchar v16, uchar v17, uchar v18, uchar v19,
               uchar v20, uchar v21, uchar v22, uchar v23,
               uchar v24, uchar v25, uchar v26, uchar v27,
               uchar v28, uchar v29, uchar v30, uchar v31)
    {
        val = _mm256_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
            (char)v4,  (char)v5,  (char)v6,  (char)v7,  (char)v8,  (char)v9,
            (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
            (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
            (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
            (char)v28, (char)v29, (char)v30, (char)v31);
    }
    v_uint8x32() {}

    uchar get0() const { return (uchar)_v_cvtsi256_si32(val); }
};

struct v_int8x32
{
    typedef schar lane_type;
    enum { nlanes = 32 };
    __m256i val;

    explicit v_int8x32(__m256i v) : val(v) {}
    v_int8x32(schar v0,  schar v1,  schar v2,  schar v3,
              schar v4,  schar v5,  schar v6,  schar v7,
              schar v8,  schar v9,  schar v10, schar v11,
              schar v12, schar v13, schar v14, schar v15,
              schar v16, schar v17, schar v18, schar v19,
              schar v20, schar v21, schar v22, schar v23,
              schar v24, schar v25, schar v26, schar v27,
              schar v28, schar v29, schar v30, schar v31)
    {
        val = _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
            v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
            v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
    }
    v_int8x32() {}

    schar get0() const { return (schar)_v_cvtsi256_si32(val); }
};

struct v_uint16x16
{
    typedef ushort lane_type;
    enum { nlanes = 16 };
    __m256i val;

    explicit v_uint16x16(__m256i v) : val(v) {}
    v_uint16x16(ushort v0,  ushort v1,  ushort v2,  ushort v3,
                ushort v4,  ushort v5,  ushort v6,  ushort v7,
                ushort v8,  ushort v9,  ushort v10, ushort v11,
                ushort v12, ushort v13, ushort v14, ushort v15)
    {
        val = _mm256_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
            (short)v4,  (short)v5,  (short)v6,  (short)v7,  (short)v8,  (short)v9,
            (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
    }
    v_uint16x16() {}

    ushort get0() const { return (ushort)_v_cvtsi256_si32(val); }
};

struct v_int16x16
{
    typedef short lane_type;
    enum { nlanes = 16 };
    __m256i val;

    explicit v_int16x16(__m256i v) : val(v) {}
    v_int16x16(short v0,  short v1,  short v2,  short v3,
               short v4,  short v5,  short v6,  short v7,
               short v8,  short v9,  short v10, short v11,
               short v12, short v13, short v14, short v15)
    {
        val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7,
                                v8, v9, v10, v11, v12, v13, v14, v15);
    }
    v_int16x16() {}

    short get0() const { return (short)_v_cvtsi256_si32(val); }
};
struct v_uint32x8
{
    typedef unsigned lane_type;
    enum { nlanes = 8 };
    __m256i val;

    explicit v_uint32x8(__m256i v) : val(v) {}
    v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
               unsigned v4, unsigned v5, unsigned v6, unsigned v7)
    {
        val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2,
            (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
    }
    v_uint32x8() {}

    unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); }
};

struct v_int32x8
{
    typedef int lane_type;
    enum { nlanes = 8 };
    __m256i val;

    explicit v_int32x8(__m256i v) : val(v) {}
    v_int32x8(int v0, int v1, int v2, int v3,
              int v4, int v5, int v6, int v7)
    {
        val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
    }
    v_int32x8() {}

    int get0() const { return _v_cvtsi256_si32(val); }
};

struct v_float32x8
{
    typedef float lane_type;
    enum { nlanes = 8 };
    __m256 val;

    explicit v_float32x8(__m256 v) : val(v) {}
    v_float32x8(float v0, float v1, float v2, float v3,
                float v4, float v5, float v6, float v7)
    {
        val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
    }
    v_float32x8() {}

    float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); }
};
struct v_uint64x4
{
    typedef uint64 lane_type;
    enum { nlanes = 4 };
    __m256i val;

    explicit v_uint64x4(__m256i v) : val(v) {}
    v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
    { val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
    v_uint64x4() {}

    uint64 get0() const
    {
    #if defined __x86_64__ || defined _M_X64
        return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
    #else
        int a = _mm_cvtsi128_si32(_mm256_castsi256_si128(val));
        int b = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
    #endif
    }
};

struct v_int64x4
{
    typedef int64 lane_type;
    enum { nlanes = 4 };
    __m256i val;

    explicit v_int64x4(__m256i v) : val(v) {}
    v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
    { val = _mm256_setr_epi64x(v0, v1, v2, v3); }
    v_int64x4() {}

    int64 get0() const
    {
    #if defined __x86_64__ || defined _M_X64
        return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
    #else
        int a = _mm_cvtsi128_si32(_mm256_castsi256_si128(val));
        int b = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    #endif
    }
};

struct v_float64x4
{
    typedef double lane_type;
    enum { nlanes = 4 };
    __m256d val;

    explicit v_float64x4(__m256d v) : val(v) {}
    v_float64x4(double v0, double v1, double v2, double v3)
    { val = _mm256_setr_pd(v0, v1, v2, v3); }
    v_float64x4() {}

    double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); }
};
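// Usage sketch (illustrative only, not part of the HAL API): with the wrapper types
// above and the load/store helpers defined below, an element-wise operation reads as
//     float buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//     v_float32x8 v = v256_load(buf);   // unaligned 256-bit load
//     v_store(buf, v + v);              // per-lane add, then store back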
#define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp)                      \
    inline _Tpvec v256_load(const _Tp* ptr)                             \
    { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); }         \
    inline _Tpvec v256_load_aligned(const _Tp* ptr)                     \
    { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); }          \
    inline _Tpvec v256_load_low(const _Tp* ptr)                         \
    {                                                                   \
        __m128i v128 = _mm_loadu_si128((const __m128i*)ptr);            \
        return _Tpvec(_mm256_castsi128_si256(v128));                    \
    }                                                                   \
    inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1)    \
    {                                                                   \
        __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0);            \
        __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1);            \
        return _Tpvec(_v256_combine(vlo, vhi));                         \
    }                                                                   \
    inline void v_store(_Tp* ptr, const _Tpvec& a)                      \
    { _mm256_storeu_si256((__m256i*)ptr, a.val); }                      \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)              \
    { _mm256_store_si256((__m256i*)ptr, a.val); }                       \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)      \
    { _mm256_stream_si256((__m256i*)ptr, a.val); }                      \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    {                                                                   \
        if( mode == hal::STORE_UNALIGNED )                              \
            _mm256_storeu_si256((__m256i*)ptr, a.val);                  \
        else if( mode == hal::STORE_ALIGNED_NOCACHE )                   \
            _mm256_stream_si256((__m256i*)ptr, a.val);                  \
        else                                                            \
            _mm256_store_si256((__m256i*)ptr, a.val);                   \
    }                                                                   \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                  \
    { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); }      \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                 \
    { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); }

OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32,  uchar)
OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32,   schar)
OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort)
OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16,  short)
OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8,  unsigned)
OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8,   int)
OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4,  uint64)
OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4,   int64)
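// The float/double variant below follows the same pattern, parameterized by the
// ps/pd intrinsic suffix and by the 128-bit half-register type (halfreg).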
#define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
    inline _Tpvec v256_load(const _Tp* ptr)                             \
    { return _Tpvec(_mm256_loadu_##suffix(ptr)); }                      \
    inline _Tpvec v256_load_aligned(const _Tp* ptr)                     \
    { return _Tpvec(_mm256_load_##suffix(ptr)); }                       \
    inline _Tpvec v256_load_low(const _Tp* ptr)                         \
    {                                                                   \
        return _Tpvec(_mm256_cast##suffix##128_##suffix##256            \
                      (_mm_loadu_##suffix(ptr)));                       \
    }                                                                   \
    inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1)    \
    {                                                                   \
        halfreg vlo = _mm_loadu_##suffix(ptr0);                         \
        halfreg vhi = _mm_loadu_##suffix(ptr1);                         \
        return _Tpvec(_v256_combine(vlo, vhi));                         \
    }                                                                   \
    inline void v_store(_Tp* ptr, const _Tpvec& a)                      \
    { _mm256_storeu_##suffix(ptr, a.val); }                             \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)              \
    { _mm256_store_##suffix(ptr, a.val); }                              \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)      \
    { _mm256_stream_##suffix(ptr, a.val); }                             \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    {                                                                   \
        if( mode == hal::STORE_UNALIGNED )                              \
            _mm256_storeu_##suffix(ptr, a.val);                         \
        else if( mode == hal::STORE_ALIGNED_NOCACHE )                   \
            _mm256_stream_##suffix(ptr, a.val);                         \
        else                                                            \
            _mm256_store_##suffix(ptr, a.val);                          \
    }                                                                   \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                  \
    { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); }             \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                 \
    { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); }

OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float,  ps, __m128)
OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
#define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a)   \
    { return _Tpvec(cast(a.val)); }

#define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s)         \
    inline _Tpvec v256_setzero_##suffix()                                       \
    { return _Tpvec(_mm256_setzero_si256()); }                                  \
    inline _Tpvec v256_setall_##suffix(_Tp v)                                   \
    { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); }                       \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32,  suffix, OPENCV_HAL_NOP)       \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32,   suffix, OPENCV_HAL_NOP)       \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP)       \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16,  suffix, OPENCV_HAL_NOP)       \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8,  suffix, OPENCV_HAL_NOP)       \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8,   suffix, OPENCV_HAL_NOP)       \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4,  suffix, OPENCV_HAL_NOP)       \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4,   suffix, OPENCV_HAL_NOP)       \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256)  \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256)

OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32,  uchar,    u8,  epi8,   char)
OPENCV_HAL_IMPL_AVX_INIT(v_int8x32,   schar,    s8,  epi8,   char)
OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort,   u16, epi16,  short)
OPENCV_HAL_IMPL_AVX_INIT(v_int16x16,  short,    s16, epi16,  short)
OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8,  unsigned, u32, epi32,  int)
OPENCV_HAL_IMPL_AVX_INIT(v_int32x8,   int,      s32, epi32,  int)
OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4,  uint64,   u64, epi64x, int64)
OPENCV_HAL_IMPL_AVX_INIT(v_int64x4,   int64,    s64, epi64x, int64)

#define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
    inline _Tpvec v256_setzero_##suffix()                                \
    { return _Tpvec(_mm256_setzero_##zsuffix()); }                       \
    inline _Tpvec v256_setall_##suffix(_Tp v)                            \
    { return _Tpvec(_mm256_set1_##zsuffix(v)); }                         \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32,  suffix, cast)          \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32,   suffix, cast)          \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast)          \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16,  suffix, cast)          \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8,  suffix, cast)          \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8,   suffix, cast)          \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4,  suffix, cast)          \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4,   suffix, cast)

OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float,  f32, ps, _mm256_castsi256_ps)
OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd)
inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
{ return a; }
inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
{ return v_float32x8(_mm256_castpd_ps(a.val)); }

inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
{ return a; }
inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
{ return v_float64x4(_mm256_castps_pd(a.val)); }
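/** Unpack, blend, shuffle **/
// The blend/shuffle control values are template parameters because the underlying
// AVX intrinsics require compile-time immediates.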
#define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix)                 \
    inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); }     \
    inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); }

OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32,  epi8)
OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32,   epi8)
OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16)
OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16,  epi16)
OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8,  epi32)
OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8,   epi32)
OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4,  epi64)
OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4,   epi64)
OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd)
#define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix)               \
    template<int m>                                             \
    inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(_mm256_blend_##suffix(a.val, b.val, m)); }

OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16)
OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16,  epi16)
OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8,  epi32)
OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8,   epi32)
OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd)
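// AVX2 has no 64-bit integer blend, so the 4-bit lane mask m is widened below:
// each selection bit is duplicated into two adjacent bits and the blend is performed
// with _mm256_blend_epi32 on 32-bit halves.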
template<int m>
inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b)
{
    enum {M0 = m};
    enum {M1 = (M0 | (M0 << 2)) & 0x33};
    enum {M2 = (M1 | (M1 << 1)) & 0x55};
    enum {MM = M2 | (M2 << 1)};
    return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM));
}
template<int m>
inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b)
{ return v_int64x4(v256_blend<m>(v_uint64x4(a.val), v_uint64x4(b.val)).val); }
#define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin)  \
    template<int m>                                  \
    inline _Tpvec v256_shuffle(const _Tpvec& a)      \
    { return _Tpvec(_mm256_##intrin(a.val, m)); }

OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8,  shuffle_epi32)
OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8,   shuffle_epi32)
OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps)
OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd)
template<typename _Tpvec>
inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
{
    ab0 = v256_unpacklo(a, b);
    ab1 = v256_unpackhi(a, b);
}

template<typename _Tpvec>
inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0xf0)); }

inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
{ return v256_blend<0xf0>(a, b); }

inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
{ return v256_blend<0xc>(a, b); }

template<typename _Tpvec>
inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
{ return v256_permute2x128<0x21>(a, b); }

template<typename _Tpvec>
inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(_mm256_alignr_epi8(a.val, b.val, 8)); }
inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
{ return v_float64x4(_mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1))); }

template<typename _Tpvec>
inline _Tpvec v256_swap_halves(const _Tpvec& a)
{ return v256_permute2x128<1>(a, a); }

template<typename _Tpvec>
inline _Tpvec v256_reverse_64(const _Tpvec& a)
{ return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); }
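// v_zip must interleave across the whole 256-bit vector, but _mm256_unpacklo/hi only
// interleave within each 128-bit lane; v256_zip therefore produces per-lane results
// and v_recombine restores the cross-lane order.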
#define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec)                             \
    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)   \
    { return v256_permute2x128<0x20>(a, b); }                       \
    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)  \
    { return v256_permute2x128<0x31>(a, b); }                       \
    inline void v_recombine(const _Tpvec& a, const _Tpvec& b,       \
                            _Tpvec& c, _Tpvec& d)                   \
    {                                                               \
        _Tpvec a1b0 = v256_alignr_128(a, b);                        \
        c = v256_combine_diagonal(a, a1b0);                         \
        d = v256_combine_diagonal(a1b0, b);                         \
    }                                                               \
    inline void v_zip(const _Tpvec& a, const _Tpvec& b,             \
                      _Tpvec& ab0, _Tpvec& ab1)                     \
    {                                                               \
        _Tpvec ab0ab2, ab1ab3;                                      \
        v256_zip(a, b, ab0ab2, ab1ab3);                             \
        v_recombine(ab0ab2, ab1ab3, ab0, ab1);                      \
    }

OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32)
OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32)
OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16)
OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16)
OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8)
OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8)
OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4)
OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4)
OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8)
OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
#define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin)            \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(intrin(a.val, b.val)); }                          \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)    \
    { a.val = intrin(a.val, b.val); return a; }

OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32,  _mm256_adds_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32,  _mm256_subs_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32,   _mm256_adds_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32,   _mm256_subs_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16,  _mm256_adds_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16,  _mm256_subs_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8,  _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8,  _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8,  _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8,   _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8,   _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8,   _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4,  _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4,  _mm256_sub_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4,   _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4,   _mm256_sub_epi64)

OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
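// Saturating multiplication: AVX2 has no 8-bit multiply, so the 8-bit operators widen
// to 16 bits, multiply and pack back; the 16-bit operators combine mullo/mulhi into
// full 32-bit products and repack with saturation.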
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
{
    v_uint16x16 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
{
    v_int16x16 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i pl = _mm256_mullo_epi16(a.val, b.val);
    __m256i ph = _mm256_mulhi_epu16(a.val, b.val);
    __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
    __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
    return v_uint16x16(_v256_packs_epu32(p0, p1));
}
inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
{
    __m256i pl = _mm256_mullo_epi16(a.val, b.val);
    __m256i ph = _mm256_mulhi_epi16(a.val, b.val);
    __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
    __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
    return v_int16x16(_mm256_packs_epi32(p0, p1));
}
inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
{ a = a * b; return a; }
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
{ a = a * b; return a; }
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
{ a = a * b; return a; }
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
{ a = a * b; return a; }
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin)  \
    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)    \
    { return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32,  _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32,   _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16,  _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32,  _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32,   _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16,  _mm256_sub_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_uint16x16, _mm256_mullo_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_int16x16,  _mm256_mullo_epi16)

inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
{
    __m256i ad = _mm256_srai_epi16(a.val, 8);
    __m256i bd = _mm256_srai_epi16(b.val, 8);
    __m256i p0 = _mm256_mullo_epi16(a.val, b.val);
    __m256i p1 = _mm256_slli_epi16(_mm256_mullo_epi16(ad, bd), 8);

    const __m256i b01 = _mm256_set1_epi32(0xFF00FF00);
    return v_uint8x32(_mm256_blendv_epi8(p0, p1, b01));
}
inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
{
    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}
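/** Expanding multiplication **/
// Each v_mul_expand below returns the full-width products of a and b in two vectors:
// c holds the products of the low halves of the inputs, d those of the high halves.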
inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
                         v_uint16x16& c, v_uint16x16& d)
{
    v_uint16x16 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
                         v_int16x16& c, v_int16x16& d)
{
    v_int16x16 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
                         v_int32x8& c, v_int32x8& d)
{
    v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));

    v_int16x16 v0, v1;
    v_zip(v_mul_wrap(a, b), vhi, v0, v1);

    c = v_reinterpret_as_s32(v0);
    d = v_reinterpret_as_s32(v1);
}

inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
                         v_uint32x8& c, v_uint32x8& d)
{
    v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));

    v_uint16x16 v0, v1;
    v_zip(v_mul_wrap(a, b), vhi, v0, v1);

    c = v_reinterpret_as_u32(v0);
    d = v_reinterpret_as_u32(v1);
}

inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
                         v_uint64x4& c, v_uint64x4& d)
{
    __m256i v0 = _mm256_mul_epu32(a.val, b.val);
    __m256i v1 = _mm256_mul_epu32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
    v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
}
inline v_int16x16  v_mul_hi(const v_int16x16& a,  const v_int16x16& b)  { return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); }
inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); }
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai)  \
    inline _Tpuvec operator << (const _Tpuvec& a, int imm)            \
    { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); }             \
    inline _Tpsvec operator << (const _Tpsvec& a, int imm)            \
    { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); }             \
    inline _Tpuvec operator >> (const _Tpuvec& a, int imm)            \
    { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); }             \
    inline _Tpsvec operator >> (const _Tpsvec& a, int imm)            \
    { return _Tpsvec(srai(a.val, imm)); }                             \
    template<int imm>                                                 \
    inline _Tpuvec v_shl(const _Tpuvec& a)                            \
    { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); }             \
    template<int imm>                                                 \
    inline _Tpsvec v_shl(const _Tpsvec& a)                            \
    { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); }             \
    template<int imm>                                                 \
    inline _Tpuvec v_shr(const _Tpuvec& a)                            \
    { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); }             \
    template<int imm>                                                 \
    inline _Tpsvec v_shr(const _Tpsvec& a)                            \
    { return _Tpsvec(srai(a.val, imm)); }

OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16)
OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8,  v_int32x8,  epi32, _mm256_srai_epi32)

// AVX2 has no arithmetic 64-bit right shift; emulate it by biasing with the sign bit.
inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm)
{
    __m256i d = _mm256_set1_epi64x((int64)1 << 63);
    __m256i r = _mm256_srli_epi64(_mm256_add_epi64(a, d), imm);
    return _mm256_sub_epi64(r, _mm256_srli_epi64(d, imm));
}
OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx)
#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const)   \
    OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix)    \
    OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix)     \
    OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix)    \
    inline _Tpvec operator ~ (const _Tpvec& a)                    \
    { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }

OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32,  si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32,   si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16, si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16,  si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8,  si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8,   si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4,  si256, _mm256_set1_epi64x(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4,   si256, _mm256_set1_epi64x(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8, ps,    _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4, pd,    _mm256_castsi256_pd(_mm256_set1_epi32(-1)))

#define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix)                                \
    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); }

OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32,  epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32,   epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16,  epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8,  epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8,   epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
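/** Comparison **/
// AVX2 only provides signed integer compare-greater, so the unsigned operator > below
// flips the sign bit of both operands (xor with sbit) and then applies the signed
// comparison; the remaining integer operators are derived from == and >.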
#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec)                     \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)  \
    { return ~(a == b); }                                         \
    inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b)   \
    { return b > a; }                                             \
    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b)  \
    { return ~(a < b); }                                          \
    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b)  \
    { return b >= a; }

#define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit)  \
    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b)     \
    { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); }            \
    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b)      \
    {                                                                   \
        __m256i smask = _mm256_set1_##suffix(sbit);                     \
        return _Tpuvec(_mm256_cmpgt_##suffix(                           \
                       _mm256_xor_si256(a.val, smask),                  \
                       _mm256_xor_si256(b.val, smask)));                \
    }                                                                   \
    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b)     \
    { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); }            \
    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b)      \
    { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); }            \
    OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec)                              \
    OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)

OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32,  v_int8x32,  epi8,  (char)-128)
OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8,  v_int32x8,  epi32, (int)0x80000000)

#define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec)                  \
    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); }          \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)  \
    { return ~(a == b); }

OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
#define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix)     \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }

#define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix)            \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ,  _Tpvec, suffix)  \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix)  \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(<,  _CMP_LT_OQ,  _Tpvec, suffix)  \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(>,  _CMP_GT_OQ,  _Tpvec, suffix)  \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ,  _Tpvec, suffix)  \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ,  _Tpvec, suffix)

OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)

inline v_float32x8 v_not_nan(const v_float32x8& a)
{ return v_float32x8(_mm256_cmp_ps(a.val, a.val, _CMP_ORD_Q)); }
inline v_float64x4 v_not_nan(const v_float64x4& a)
{ return v_float64x4(_mm256_cmp_pd(a.val, a.val, _CMP_ORD_Q)); }

OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32,  _mm256_min_epu8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32,  _mm256_max_epu8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32,   _mm256_min_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32,   _mm256_max_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16,  _mm256_min_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16,  _mm256_max_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8,  _mm256_min_epu32)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8,  _mm256_max_epu32)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8,   _mm256_min_epi32)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8,   _mm256_max_epi32)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
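/** Rotate **/
// The template parameter imm is the element count to rotate by. The two-vector forms
// first swap the 128-bit halves of a and b with _mm256_permute2x128_si256 and then
// stitch the result together with _mm256_alignr_epi8, which shifts per 128-bit lane.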
template<int imm>
inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
{
    enum {IMM_R = (16 - imm) & 0xFF};
    enum {IMM_R2 = (32 - imm) & 0xFF};

    if (imm == 0)  return a;
    if (imm == 32) return b;
    if (imm > 32)  return v_uint8x32();

    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
    if (imm == 16) return v_uint8x32(swap);
    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(a.val, swap, IMM_R));
    return v_uint8x32(_mm256_alignr_epi8(swap, b.val, IMM_R2));
}

template<int imm>
inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
{
    enum {IMM_L = (imm - 16) & 0xFF};

    if (imm == 0)  return a;
    if (imm == 32) return b;
    if (imm > 32)  return v_uint8x32();

    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
    if (imm == 16) return v_uint8x32(swap);
    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
    return v_uint8x32(_mm256_alignr_epi8(b.val, swap, IMM_L));
}

template<int imm>
inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
{
    enum {IMM_L = (imm - 16) & 0xFF};
    enum {IMM_R = (16 - imm) & 0xFF};

    if (imm == 0) return a;
    if (imm > 32) return v_uint8x32();

    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
    if (imm == 16) return v_uint8x32(swapz);
    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(a.val, swapz, IMM_R));
    return v_uint8x32(_mm256_slli_si256(swapz, IMM_L));
}

template<int imm>
inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
{
    enum {IMM_L = (imm - 16) & 0xFF};

    if (imm == 0) return a;
    if (imm > 32) return v_uint8x32();

    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
    if (imm == 16) return v_uint8x32(swapz);
    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(swapz, a.val, imm));
    return v_uint8x32(_mm256_srli_si256(swapz, IMM_L));
}
#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast)         \
    template<int imm>                                                 \
    inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b)            \
    {                                                                 \
        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)};      \
        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a),        \
                                       v_reinterpret_as_u8(b));       \
        return _Tpvec(cast(ret.val));                                 \
    }                                                                 \
    template<int imm>                                                 \
    inline _Tpvec intrin(const _Tpvec& a)                             \
    {                                                                 \
        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)};      \
        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a));       \
        return _Tpvec(cast(ret.val));                                 \
    }

#define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec)                                  \
    OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  _Tpvec, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)

OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32)
OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16)
OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16)
OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8)
OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8)
OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4)
OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4)

OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  v_float32x8, _mm256_castsi256_ps)
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps)
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  v_float64x4, _mm256_castsi256_pd)
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
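/** Reverse **/
// 8- and 16-bit elements are reversed inside each 128-bit lane with
// _mm256_shuffle_epi8 and the two lanes are then swapped; 32- and 64-bit elements use
// cross-lane permutes directly.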
inline v_uint8x32 v_reverse(const v_uint8x32 &a)
{
    static const __m256i perm = _mm256_setr_epi8(
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m256i vec = _mm256_shuffle_epi8(a.val, perm);
    return v_uint8x32(_mm256_permute2x128_si256(vec, vec, 1));
}

inline v_int8x32 v_reverse(const v_int8x32 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x16 v_reverse(const v_uint16x16 &a)
{
    static const __m256i perm = _mm256_setr_epi8(
            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
    __m256i vec = _mm256_shuffle_epi8(a.val, perm);
    return v_uint16x16(_mm256_permute2x128_si256(vec, vec, 1));
}

inline v_int16x16 v_reverse(const v_int16x16 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x8 v_reverse(const v_uint32x8 &a)
{
    static const __m256i perm = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
}

inline v_int32x8 v_reverse(const v_int32x8 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x8 v_reverse(const v_float32x8 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x4 v_reverse(const v_uint64x4 &a)
{
    return v_uint64x4(_mm256_permute4x64_epi64(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
}

inline v_int64x4 v_reverse(const v_int64x4 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x4 v_reverse(const v_float64x4 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
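/** Reduce **/
// Byte sums use _mm256_sad_epu8 against zero, which accumulates groups of 8 bytes
// into 64-bit partial sums; the signed 8-bit version biases the input by 128 and
// subtracts 32 * 128 = 4096 from the final total.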
inline unsigned v_reduce_sum(const v_uint8x32& a)
{
    __m256i half = _mm256_sad_epu8(a.val, _mm256_setzero_si256());
    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline int v_reduce_sum(const v_int8x32& a)
{
    __m256i half = _mm256_sad_epu8(_mm256_xor_si256(a.val, _mm256_set1_epi8((schar)-128)), _mm256_setzero_si256());
    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))) - 4096;
}
#define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin)                 \
    inline sctype v_reduce_##func(const _Tpvec& a)                                  \
    {                                                                               \
        __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val));  \
        val = intrin(val, _mm_srli_si128(val, 8));                                  \
        val = intrin(val, _mm_srli_si128(val, 4));                                  \
        val = intrin(val, _mm_srli_si128(val, 2));                                  \
        val = intrin(val, _mm_srli_si128(val, 1));                                  \
        return (sctype)_mm_cvtsi128_si32(val);                                      \
    }

OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, min, _mm_min_epu8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32,  schar, min, _mm_min_epi8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, max, _mm_max_epu8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32,  schar, max, _mm_max_epi8)

#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a)                  \
    {                                                               \
        __m128i v0 = _v256_extract_low(a.val);                      \
        __m128i v1 = _v256_extract_high(a.val);                     \
        v0 = intrin(v0, v1);                                        \
        v0 = intrin(v0, _mm_srli_si128(v0, 8));                     \
        v0 = intrin(v0, _mm_srli_si128(v0, 4));                     \
        v0 = intrin(v0, _mm_srli_si128(v0, 2));                     \
        return (sctype)_mm_cvtsi128_si32(v0);                       \
    }

OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, min, _mm_min_epu16)
OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16,  short,  min, _mm_min_epi16)
OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16)
OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16,  short,  max, _mm_max_epi16)

#define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a)                 \
    {                                                              \
        __m128i v0 = _v256_extract_low(a.val);                     \
        __m128i v1 = _v256_extract_high(a.val);                    \
        v0 = intrin(v0, v1);                                       \
        v0 = intrin(v0, _mm_srli_si128(v0, 8));                    \
        v0 = intrin(v0, _mm_srli_si128(v0, 4));                    \
        return (sctype)_mm_cvtsi128_si32(v0);                      \
    }

OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32)
OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8,  int,      min, _mm_min_epi32)
OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32)
OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8,  int,      max, _mm_max_epi32)

#define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin)                  \
    inline float v_reduce_##func(const v_float32x8& a)                \
    {                                                                 \
        __m128 v0 = _v256_extract_low(a.val);                         \
        __m128 v1 = _v256_extract_high(a.val);                        \
        v0 = intrin(v0, v1);                                          \
        v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \
        v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 1))); \
        return _mm_cvtss_f32(v0);                                     \
    }

OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
inline int v_reduce_sum(const v_int32x8& a)
{
    __m256i s0 = _mm256_hadd_epi32(a.val, a.val);
    s0 = _mm256_hadd_epi32(s0, s0);

    __m128i s1 = _v256_extract_high(s0);
    s1 = _mm_add_epi32(_v256_extract_low(s0), s1);

    return _mm_cvtsi128_si32(s1);
}

inline float v_reduce_sum(const v_float32x8& a)
{
    __m256 s0 = _mm256_hadd_ps(a.val, a.val);
    s0 = _mm256_hadd_ps(s0, s0);

    __m128 s1 = _v256_extract_high(s0);
    s1 = _mm_add_ps(_v256_extract_low(s0), s1);

    return _mm_cvtss_f32(s1);
}

inline uint64 v_reduce_sum(const v_uint64x4& a)
{
    uint64 CV_DECL_ALIGNED(32) idx[2];
    _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
    return idx[0] + idx[1];
}
inline int64 v_reduce_sum(const v_int64x4& a)
{
    int64 CV_DECL_ALIGNED(32) idx[2];
    _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
    return idx[0] + idx[1];
}
inline double v_reduce_sum(const v_float64x4& a)
{
    __m256d s0 = _mm256_hadd_pd(a.val, a.val);
    return _mm_cvtsd_f64(_mm_add_pd(_v256_extract_low(s0), _v256_extract_high(s0)));
}
inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
                                 const v_float32x8& c, const v_float32x8& d)
{
    __m256 ab = _mm256_hadd_ps(a.val, b.val);
    __m256 cd = _mm256_hadd_ps(c.val, d.val);
    return v_float32x8(_mm256_hadd_ps(ab, cd));
}
inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
{
    __m256i half = _mm256_sad_epu8(a.val, b.val);
    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
{
    __m256i half = _mm256_set1_epi8(0x7f);
    half = _mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half));
    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{
    v_uint32x8 l, h;
    v_expand(v_add_wrap(a - b, b - a), l, h);
    return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
{
    v_uint32x8 l, h;
    v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
    return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
{
    return v_reduce_sum(v_max(a, b) - v_min(a, b));
}
inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
{
    v_int32x8 m = a < b;
    return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
}
inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
{
    return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
}
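/** Popcount **/
// Per-byte popcount uses a 16-entry nibble lookup table applied with
// _mm256_shuffle_epi8 to the low and high nibbles of each byte; wider lanes are
// obtained by summing adjacent byte counts.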
inline v_uint8x32 v_popcount(const v_uint8x32& a)
{
    __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
                                             0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
    return v_uint8x32(_mm256_add_epi8(_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(                  a.val    , _popcnt_mask)),
                                      _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(a.val, 4), _popcnt_mask))));
}
inline v_uint16x16 v_popcount(const v_uint16x16& a)
{
    v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
    p += v_rotate_right<1>(p);
    return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
}
inline v_uint32x8 v_popcount(const v_uint32x8& a)
{
    v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
    p += v_rotate_right<1>(p);
    p += v_rotate_right<2>(p);
    return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
}
inline v_uint64x4 v_popcount(const v_uint64x4& a)
{
    return v_uint64x4(_mm256_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm256_setzero_si256()));
}
inline v_uint8x32 v_popcount(const v_int8x32& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x16 v_popcount(const v_int16x16& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x8 v_popcount(const v_int32x8& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x4 v_popcount(const v_int64x4& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }
inline int v_signmask(const v_int8x32& a)
{ return _mm256_movemask_epi8(a.val); }
inline int v_signmask(const v_uint8x32& a)
{ return v_signmask(v_reinterpret_as_s8(a)); }

inline int v_signmask(const v_uint16x16& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }

inline int v_signmask(const v_float32x8& a)
{ return _mm256_movemask_ps(a.val); }
inline int v_signmask(const v_float64x4& a)
{ return _mm256_movemask_pd(a.val); }

inline int v_signmask(const v_int32x8& a)
{ return v_signmask(v_reinterpret_as_f32(a)); }
inline int v_signmask(const v_uint32x8& a)
{ return v_signmask(v_reinterpret_as_f32(a)); }

inline int v_signmask(const v_int64x4& a)
{ return v_signmask(v_reinterpret_as_f64(a)); }
inline int v_signmask(const v_uint64x4& a)
{ return v_signmask(v_reinterpret_as_f64(a)); }

inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, allmask)                                  \
    inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; }   \
    inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32,  -1)
OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32,   -1)
OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8,  255)
OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8,   255)
OPENCV_HAL_IMPL_AVX_CHECK(v_uint64x4,  15)
OPENCV_HAL_IMPL_AVX_CHECK(v_int64x4,   15)
OPENCV_HAL_IMPL_AVX_CHECK(v_float32x8, 255)
OPENCV_HAL_IMPL_AVX_CHECK(v_float64x4, 15)

#define OPENCV_HAL_IMPL_AVX_CHECK_SHORT(_Tpvec)  \
    inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; }  \
    inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_uint16x16)
OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix)                            \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)    \
    { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); }            \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); }            \
    inline _Tpvec v_sqrt(const _Tpvec& x)                                     \
    { return _Tpvec(_mm256_sqrt_##suffix(x.val)); }                           \
    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)           \
    { return v_fma(a, a, b * b); }                                            \
    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)               \
    { return v_sqrt(v_fma(a, a, b*b)); }

OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
{
    return a * b + c;
}

inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
{
    return v_fma(a, b, c);
}

inline v_float32x8 v_invsqrt(const v_float32x8& x)
{
    v_float32x8 half = x * v256_setall_f32(0.5);
    v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val));
    // One Newton-Raphson step to refine the rsqrt estimate
    t *= v256_setall_f32(1.5) - ((t * t) * half);
    return t;
}

inline v_float64x4 v_invsqrt(const v_float64x4& x)
{
    return v256_setall_f64(1.) / v_sqrt(x);
}
#define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix)        \
    inline v_u##_Tpvec v_abs(const v_##_Tpvec& x)      \
    { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); }

OPENCV_HAL_IMPL_AVX_ABS(int8x32,  epi8)
OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
OPENCV_HAL_IMPL_AVX_ABS(int32x8,  epi32)

inline v_float32x8 v_abs(const v_float32x8& x)
{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
inline v_float64x4 v_abs(const v_float64x4& x)
{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
{ return v_max(a, b) - v_min(a, b); }

inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
{
    v_int8x32 d = v_sub_wrap(a, b);
    v_int8x32 m = a < b;
    return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}

inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }

inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
{
    v_int32x8 d = a - b;
    v_int32x8 m = a < b;
    return v_reinterpret_as_u32((d ^ m) - m);
}

inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
{ return v_abs(a - b); }

inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
{ return v_abs(a - b); }

/** Saturating absolute difference **/
inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
{
    v_int8x32 d = a - b;
    v_int8x32 m = a < b;
    return (d ^ m) - m;
}
inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
{ return v_max(a, b) - v_min(a, b); }
inline v_int32x8 v_round(const v_float32x8& a)
{ return v_int32x8(_mm256_cvtps_epi32(a.val)); }

inline v_int32x8 v_round(const v_float64x4& a)
{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); }

inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b)
{
    __m128i ai = _mm256_cvtpd_epi32(a.val), bi = _mm256_cvtpd_epi32(b.val);
    return v_int32x8(_v256_combine(ai, bi));
}

inline v_int32x8 v_trunc(const v_float32x8& a)
{ return v_int32x8(_mm256_cvttps_epi32(a.val)); }

inline v_int32x8 v_trunc(const v_float64x4& a)
{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvttpd_epi32(a.val))); }

inline v_int32x8 v_floor(const v_float32x8& a)
{ return v_int32x8(_mm256_cvttps_epi32(_mm256_floor_ps(a.val))); }

inline v_int32x8 v_floor(const v_float64x4& a)
{ return v_trunc(v_float64x4(_mm256_floor_pd(a.val))); }

inline v_int32x8 v_ceil(const v_float32x8& a)
{ return v_int32x8(_mm256_cvttps_epi32(_mm256_ceil_ps(a.val))); }

inline v_int32x8 v_ceil(const v_float64x4& a)
{ return v_trunc(v_float64x4(_mm256_ceil_pd(a.val))); }
inline v_float32x8 v_cvt_f32(const v_int32x8& a)
{ return v_float32x8(_mm256_cvtepi32_ps(a.val)); }

inline v_float32x8 v_cvt_f32(const v_float64x4& a)
{ return v_float32x8(_mm256_castps128_ps256(_mm256_cvtpd_ps(a.val))); }

inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
{
    __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val);
    return v_float32x8(_v256_combine(af, bf));
}

inline v_float64x4 v_cvt_f64(const v_int32x8& a)
{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_low(a.val))); }

inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_high(a.val))); }

inline v_float64x4 v_cvt_f64(const v_float32x8& a)
{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_low(a.val))); }

inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }

inline v_float64x4 v_cvt_f64(const v_int64x4& v)
{
    // Constants encoded as double-precision bit patterns
    __m256i magic_i_lo   = _mm256_set1_epi64x(0x4330000000000000); // 2^52
    __m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
    __m256i magic_i_all  = _mm256_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
    __m256d magic_d_all  = _mm256_castsi256_pd(magic_i_all);

    // Blend the 32 least significant bits of v with magic_i_lo
    __m256i v_lo = _mm256_blend_epi32(magic_i_lo, v.val, 0x55);
    // Extract the 32 most significant bits of v
    __m256i v_hi = _mm256_srli_epi64(v.val, 32);
    // Flip the msb of v_hi and blend with magic_i_hi32
    v_hi = _mm256_xor_si256(v_hi, magic_i_hi32);
    // Compute in double precision
    __m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all);
    // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
    __m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo));
    return v_float64x4(result);
}
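////////////// Lookup table access ////////////////////
// v256_lut / v256_lut_pairs / v256_lut_quads gather 1, 2 or 4 consecutive elements
// per index. Most variants use the AVX2 gather intrinsics (_mm256_i32gather_*); the
// remaining ones fall back to scalar element loads or to two 128-bit loads combined
// with _v256_combine.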
1569
inline v_int8x32 v256_lut(const schar* tab, const int* idx)
{
    return v_int8x32(_mm256_setr_epi8(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
                                      tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]],
                                      tab[idx[16]], tab[idx[17]], tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
                                      tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]], tab[idx[30]], tab[idx[31]]));
}
inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)
{
    return v_int8x32(_mm256_setr_epi16(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]), *(const short*)(tab + idx[ 3]),
                                       *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]), *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]),
                                       *(const short*)(tab + idx[ 8]), *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
                                       *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]), *(const short*)(tab + idx[15])));
}
inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)
{
    return v_int8x32(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 1));
}
inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); }
inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); }

inline v_int16x16 v256_lut(const short* tab, const int* idx)
{
    return v_int16x16(_mm256_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
                                        tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
}
inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)
{
    return v_int16x16(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 2));
}
inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int16x16(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 2));
#else
    return v_int16x16(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 2));
#endif
}
inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); }
inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); }
inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); }

inline v_int32x8 v256_lut(const int* tab, const int* idx)
{
    return v_int32x8(_mm256_i32gather_epi32(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
}
inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int32x8(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
#else
    return v_int32x8(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
#endif
}
inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
{
    return v_int32x8(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
}
inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); }

inline v_int64x4 v256_lut(const int64* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int64x4(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 8));
#else
    return v_int64x4(_mm256_i32gather_epi64(tab, _mm_loadu_si128((const __m128i*)idx), 8));
#endif
}
inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
{
    return v_int64x4(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
}
inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); }
inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }

inline v_float32x8 v256_lut(const float* tab, const int* idx)
{
    return v_float32x8(_mm256_i32gather_ps(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
}
inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); }
inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); }

inline v_float64x4 v256_lut(const double* tab, const int* idx)
{
    return v_float64x4(_mm256_i32gather_pd(tab, _mm_loadu_si128((const __m128i*)idx), 8));
}
inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_v256_combine(_mm_loadu_pd(tab + idx[0]), _mm_loadu_pd(tab + idx[1]))); }
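// Usage sketch (illustrative comment, not part of the original header): the
// v256_lut* helpers gather scattered elements through an index array, so a
// hypothetical caller that includes opencv2/core/hal/intrin.hpp with AVX2
// enabled could write, e.g.:
//
//   float table[256];                                    // hypothetical data
//   int   idx[8] = { 0, 17, 34, 51, 68, 85, 102, 119 };  // indices are in elements
//   v_float32x8 v = v256_lut(table, idx);                // v[i] == table[idx[i]]
//
// v256_lut_pairs/v256_lut_quads read 2 or 4 consecutive elements per index,
// which is why they need only n/2 or n/4 indices.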
inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
{
    return v_int32x8(_mm256_i32gather_epi32(tab, idxvec.val, 4));
}

inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec)
{
    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
}

inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
{
    return v_float32x8(_mm256_i32gather_ps(tab, idxvec.val, 4));
}

inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
{
    return v_float64x4(_mm256_i32gather_pd(tab, _mm256_castsi256_si128(idxvec.val), 8));
}

inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
{
    int CV_DECL_ALIGNED(32) idx[8];
    v_store_aligned(idx, idxvec);

    __m128 z = _mm_setzero_ps();
    __m128 xy01, xy45, xy23, xy67;
    xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0]));
    xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1]));
    xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4]));
    xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5]));
    __m256 xy0145 = _v256_combine(xy01, xy45);
    xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2]));
    xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3]));
    xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6]));
    xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7]));
    __m256 xy2367 = _v256_combine(xy23, xy67);

    __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367);
    __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367);

    x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367));
    y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367));
}

inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_low(idx, idxvec);

    __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
    __m128d xy2 = _mm_loadu_pd(tab + idx[2]);
    __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
    __m128d xy3 = _mm_loadu_pd(tab + idx[3]);
    __m256d xy02 = _v256_combine(xy0, xy2);
    __m256d xy13 = _v256_combine(xy1, xy3);

    x = v_float64x4(_mm256_unpacklo_pd(xy02, xy13));
    y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13));
}
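// Usage sketch (illustrative comment, not in the original header): the indices
// written into the aligned idx[] buffer above address (x, y) pairs, so for a
// hypothetical packed array of 2D points `const float* pts` and an index
// vector `iv`, a caller could split coordinates with
//
//   v_float32x8 xs, ys;
//   v_lut_deinterleave(pts, iv, xs, ys);   // xs[i] = pts[iv[i]], ys[i] = pts[iv[i] + 1]
//
// a pattern typical of interpolation loops.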
inline v_int8x32 v_interleave_pairs(const v_int8x32& vec)
{
    return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200)));
}
inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x32 v_interleave_quads(const v_int8x32& vec)
{
    return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400)));
}
inline v_uint8x32 v_interleave_quads(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }

inline v_int16x16 v_interleave_pairs(const v_int16x16& vec)
{
    return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100)));
}
inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x16 v_interleave_quads(const v_int16x16& vec)
{
    return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100)));
}
inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }

inline v_int32x8 v_interleave_pairs(const v_int32x8& vec)
{
    return v_int32x8(_mm256_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
}
inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x8 v_interleave_pairs(const v_float32x8& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }

inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
{
    return v_int8x32(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100))),
                                                 _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
{
    return v_int16x16(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100))),
                                                  _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }

inline v_int32x8 v_pack_triplets(const v_int32x8& vec)
{
    return v_int32x8(_mm256_permutevar8x32_epi32(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
{
    return v_float32x8(_mm256_permutevar8x32_ps(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
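// Usage sketch (illustrative comment, not in the original header):
// v_interleave_pairs reorders lanes (a0,b0,a1,b1,...) -> (a0,a1,b0,b1,...)
// within each group of four, and v_pack_triplets compacts 3-element groups
// (e.g. RGB pixels padded to 4 lanes) toward the low lanes, leaving the tail
// lanes unspecified. A hypothetical RGBx -> RGB compaction:
//
//   v_int32x8 rgbx = v256_load(ptr);           // R G B x R G B x
//   v_int32x8 rgb  = v_pack_triplets(rgbx);    // R G B R G B ? ?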
////////// Dot Product //////////

// 16 >> 32
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
{ return v_dotprod(a, b) + c; }

// 32 >> 64
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
{
    __m256i even = _mm256_mul_epi32(a.val, b.val);
    __m256i odd = _mm256_mul_epi32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
    return v_int64x4(_mm256_add_epi64(even, odd));
}
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
{ return v_dotprod(a, b) + c; }

// 8 >> 32
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
{
    __m256i even_m = _mm256_set1_epi32(0xFF00FF00);
    __m256i even_a = _mm256_blendv_epi8(a.val, _mm256_setzero_si256(), even_m);
    __m256i odd_a  = _mm256_srli_epi16(a.val, 8);

    __m256i even_b = _mm256_blendv_epi8(b.val, _mm256_setzero_si256(), even_m);
    __m256i odd_b  = _mm256_srli_epi16(b.val, 8);

    __m256i prod0  = _mm256_madd_epi16(even_a, even_b);
    __m256i prod1  = _mm256_madd_epi16(odd_a, odd_b);
    return v_uint32x8(_mm256_add_epi32(prod0, prod1));
}
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
{
    __m256i even_a = _mm256_srai_epi16(_mm256_bslli_epi128(a.val, 1), 8);
    __m256i odd_a  = _mm256_srai_epi16(a.val, 8);

    __m256i even_b = _mm256_srai_epi16(_mm256_bslli_epi128(b.val, 1), 8);
    __m256i odd_b  = _mm256_srai_epi16(b.val, 8);

    __m256i prod0  = _mm256_madd_epi16(even_a, even_b);
    __m256i prod1  = _mm256_madd_epi16(odd_a, odd_b);
    return v_int32x8(_mm256_add_epi32(prod0, prod1));
}
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
{ return v_dotprod_expand(a, b) + c; }

// 16 >> 64
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i mullo = _mm256_mullo_epi16(a.val, b.val);
    __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
    __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
    __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);

    __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
    __m256i p13 = _mm256_srli_epi64(mul0, 32);
    __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
    __m256i p57 = _mm256_srli_epi64(mul1, 32);

    __m256i p15_ = _mm256_add_epi64(p02, p13);
    __m256i p9d_ = _mm256_add_epi64(p46, p57);

    return v_uint64x4(_mm256_add_epi64(
        _mm256_unpacklo_epi64(p15_, p9d_),
        _mm256_unpackhi_epi64(p15_, p9d_)
    ));
}
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
{
    __m256i prod = _mm256_madd_epi16(a.val, b.val);
    __m256i sign = _mm256_srai_epi32(prod, 31);

    __m256i lo = _mm256_unpacklo_epi32(prod, sign);
    __m256i hi = _mm256_unpackhi_epi32(prod, sign);

    return v_int64x4(_mm256_add_epi64(
        _mm256_unpacklo_epi64(lo, hi),
        _mm256_unpackhi_epi64(lo, hi)
    ));
}
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_dotprod_expand(a, b) + c; }

// 32 >> 64f
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
{ return v_dotprod_expand(a, b) + c; }

//////// Fast Dot Product ////////

// 16 >> 32
inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
{ return v_dotprod(a, b); }
inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
{ return v_dotprod(a, b, c); }

// 32 >> 64
inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
{ return v_dotprod(a, b); }
inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
{ return v_dotprod(a, b, c); }

// 8 >> 32
inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b)
{ return v_dotprod_expand(a, b); }
inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
{ return v_dotprod_expand(a, b, c); }
inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b)
{ return v_dotprod_expand(a, b); }
inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
{ return v_dotprod_expand(a, b, c); }

// 16 >> 64
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i mullo = _mm256_mullo_epi16(a.val, b.val);
    __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
    __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
    __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);

    __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
    __m256i p13 = _mm256_srli_epi64(mul0, 32);
    __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
    __m256i p57 = _mm256_srli_epi64(mul1, 32);

    __m256i p15_ = _mm256_add_epi64(p02, p13);
    __m256i p9d_ = _mm256_add_epi64(p46, p57);

    return v_uint64x4(_mm256_add_epi64(p15_, p9d_));
}
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }

inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
{
    __m256i prod = _mm256_madd_epi16(a.val, b.val);
    __m256i sign = _mm256_srai_epi32(prod, 31);
    __m256i lo = _mm256_unpacklo_epi32(prod, sign);
    __m256i hi = _mm256_unpackhi_epi32(prod, sign);
    return v_int64x4(_mm256_add_epi64(lo, hi));
}
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }

// 32 >> 64f
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
{ return v_dotprod_expand(a, b); }
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
{ return v_dotprod_expand(a, b, c); }
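// Usage sketch (illustrative comment, not in the original header): the
// *_expand overloads widen the accumulator so 8-/16-bit products cannot
// overflow, and the *_fast variants may return the per-lane partial sums in a
// different lane order, which is fine when the caller only reduces them.
// For two hypothetical v_int16x16 inputs va and vb:
//
//   v_int32x8 acc = v256_setzero_s32();
//   acc = v_dotprod(va, vb, acc);   // acc[i] += va[2i]*vb[2i] + va[2i+1]*vb[2i+1]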
#define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \
    v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))

inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
                            const v_float32x8& m1, const v_float32x8& m2,
                            const v_float32x8& m3)
{
    v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
    v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
    v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
    v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
}

inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
                               const v_float32x8& m1, const v_float32x8& m2,
                               const v_float32x8& a)
{
    v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
    v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
    v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
}

#define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to)    \
    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,              \
                               const _Tpvec& a2, const _Tpvec& a3,              \
                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)  \
    {                                                                           \
        __m256i t0 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val));       \
        __m256i t1 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val));       \
        __m256i t2 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val));       \
        __m256i t3 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val));       \
        b0.val = cast_to(_mm256_unpacklo_epi64(t0, t1));                        \
        b1.val = cast_to(_mm256_unpackhi_epi64(t0, t1));                        \
        b2.val = cast_to(_mm256_unpacklo_epi64(t2, t3));                        \
        b3.val = cast_to(_mm256_unpackhi_epi64(t2, t3));                        \
    }

OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8,  epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8,   epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps)
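// Usage sketch (illustrative comment, not in the original header): v_matmul
// treats each 128-bit half of `v` as an independent 4-element vector and
// multiplies it by the 4x4 matrix whose columns are packed into m0..m3, so two
// points can be transformed at once:
//
//   v_float32x8 xyzw_pair = v256_load(pts);                 // [x0 y0 z0 w0 | x1 y1 z1 w1]
//   v_float32x8 out = v_matmul(xyzw_pair, c0, c1, c2, c3);  // c0..c3: matrix columns,
//                                                           // duplicated in both halves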
#define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin)     \
    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1)  \
    {                                                                \
        b0.val = intrin(_v256_extract_low(a.val));                   \
        b1.val = intrin(_v256_extract_high(a.val));                  \
    }                                                                \
    inline _Tpwvec v_expand_low(const _Tpvec& a)                     \
    { return _Tpwvec(intrin(_v256_extract_low(a.val))); }            \
    inline _Tpwvec v_expand_high(const _Tpvec& a)                    \
    { return _Tpwvec(intrin(_v256_extract_high(a.val))); }           \
    inline _Tpwvec v256_load_expand(const _Tp* ptr)                  \
    {                                                                \
        __m128i a = _mm_loadu_si128((const __m128i*)ptr);            \
        return _Tpwvec(intrin(a));                                   \
    }

OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32,  v_uint16x16, uchar,    _mm256_cvtepu8_epi16)
OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32,   v_int16x16,  schar,    _mm256_cvtepi8_epi16)
OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8,  ushort,   _mm256_cvtepu16_epi32)
OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16,  v_int32x8,   short,    _mm256_cvtepi16_epi32)
OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8,  v_uint64x4,  unsigned, _mm256_cvtepu32_epi64)
OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8,   v_int64x4,   int,      _mm256_cvtepi32_epi64)

#define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin)   \
    inline _Tpvec v256_load_expand_q(const _Tp* ptr)        \
    {                                                       \
        __m128i a = _mm_loadl_epi64((const __m128i*)ptr);   \
        return _Tpvec(intrin(a));                           \
    }

OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32)
OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8,  schar, _mm256_cvtepi8_epi32)
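// Usage sketch (illustrative comment, not in the original header): the expand
// macros widen each lane to the next element size, so a typical 8-bit -> wider
// accumulation chain looks like
//
//   v_uint8x32  pix = v256_load(src);         // 32 x u8
//   v_uint16x16 lo, hi;
//   v_expand(pix, lo, hi);                    // 2 x 16 x u16
//   v_uint32x8  q = v256_load_expand_q(src);  // or widen 8 x u8 straight to u32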
/* pack */
// 16
inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
{ return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); }

inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i t = _mm256_set1_epi16(255);
    __m256i a1 = _mm256_min_epu16(a.val, t);
    __m256i b1 = _mm256_min_epu16(b.val, t);
    return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a1, b1)));
}

inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
{
    return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val)));
}

inline void v_pack_store(schar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
{
    const __m256i m = _mm256_set1_epi16(255);
    __m256i am = _mm256_min_epu16(a.val, m);
    am = _v256_shuffle_odd_64(_mm256_packus_epi16(am, am));
    v_store_low(ptr, v_uint8x32(am));
}

inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack_u(a, a)); }

template<int n> inline
v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
{
    // n > 0 is assumed, so the rounded values fit the signed 16-bit range
    v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
    return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
                    v_reinterpret_as_s16((b + delta) >> n));
}

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
{
    v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
    v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
}

template<int n> inline
v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
{
    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
    return v_pack_u((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
{
    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
    v_pack_u_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
{
    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
{
    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
    v_pack_store(ptr, (a + delta) >> n);
}
// 32
inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
{ return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); }

inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }

inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }

inline void v_pack_store(short* ptr, const v_int32x8& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
{
    const __m256i m = _mm256_set1_epi32(65535);
    __m256i am = _mm256_min_epu32(a.val, m);
    am = _v256_shuffle_odd_64(_mm256_packus_epi32(am, am));
    v_store_low(ptr, v_uint16x16(am));
}

inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
{ v_store_low(ptr, v_pack_u(a, a)); }

template<int n> inline
v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
{
    // n > 0 is assumed, so the rounded values fit the signed 32-bit range
    v_uint32x8 delta = v256_setall_u32(1 << (n-1));
    return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
                    v_reinterpret_as_s32((b + delta) >> n));
}

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
{
    v_uint32x8 delta = v256_setall_u32(1 << (n-1));
    v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
}

template<int n> inline
v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
{
    v_int32x8 delta = v256_setall_s32(1 << (n-1));
    return v_pack_u((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
{
    v_int32x8 delta = v256_setall_s32(1 << (n-1));
    v_pack_u_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
{
    v_int32x8 delta = v256_setall_s32(1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x8& a)
{
    v_int32x8 delta = v256_setall_s32(1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}
// 64
inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
{
    __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
    __m256i b0 = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0));
    __m256i ab = _mm256_unpacklo_epi64(a0, b0);
    return v_uint32x8(_v256_shuffle_odd_64(ab));
}

inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }

inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
{
    __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
    v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
}

inline void v_pack_store(int* ptr, const v_int64x4& b)
{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }

template<int n> inline
v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
{
    v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
{
    v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
{
    v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x4& a)
{
    v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}

// pack boolean
inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i ab = _mm256_packs_epi16(a.val, b.val);
    return v_uint8x32(_v256_shuffle_odd_64(ab));
}

inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
                           const v_uint32x8& c, const v_uint32x8& d)
{
    __m256i ab = _mm256_packs_epi32(a.val, b.val);
    __m256i cd = _mm256_packs_epi32(c.val, d.val);

    __m256i abcd = _v256_shuffle_odd_64(_mm256_packs_epi16(ab, cd));
    return v_uint8x32(_mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0)));
}

inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
                           const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
                           const v_uint64x4& g, const v_uint64x4& h)
{
    __m256i ab = _mm256_packs_epi32(a.val, b.val);
    __m256i cd = _mm256_packs_epi32(c.val, d.val);
    __m256i ef = _mm256_packs_epi32(e.val, f.val);
    __m256i gh = _mm256_packs_epi32(g.val, h.val);

    __m256i abcd = _mm256_packs_epi32(ab, cd);
    __m256i efgh = _mm256_packs_epi32(ef, gh);
    __m256i pkall = _v256_shuffle_odd_64(_mm256_packs_epi16(abcd, efgh));

    __m256i rev = _mm256_alignr_epi8(pkall, pkall, 8);
    return v_uint8x32(_mm256_unpacklo_epi16(pkall, rev));
}
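// Usage sketch (illustrative comment, not in the original header): the
// v_rshr_pack<n> family narrows with rounding, i.e. (x + (1 << (n-1))) >> n
// followed by a saturating pack, which is the usual way fixed-point filter
// results are written back to 8-bit data. For hypothetical 16-bit accumulators
// holding values scaled by 2^8:
//
//   v_uint8x32 out = v_rshr_pack<8>(acc_lo, acc_hi);   // round, shift by 8, saturate to u8
//   v_store(dst, out);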
/* Extract */
#define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec)                    \
    template<int s>                                            \
    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)  \
    { return v_rotate_right<s>(a, b); }

OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)

template<int i>
inline uchar v_extract_n(v_uint8x32 a)
{
    return (uchar)_v256_extract_epi8<i>(a.val);
}

template<int i>
inline schar v_extract_n(v_int8x32 a)
{
    return (schar)v_extract_n<i>(v_reinterpret_as_u8(a));
}

template<int i>
inline ushort v_extract_n(v_uint16x16 a)
{
    return (ushort)_v256_extract_epi16<i>(a.val);
}

template<int i>
inline short v_extract_n(v_int16x16 a)
{
    return (short)v_extract_n<i>(v_reinterpret_as_u16(a));
}

template<int i>
inline uint v_extract_n(v_uint32x8 a)
{
    return (uint)_v256_extract_epi32<i>(a.val);
}

template<int i>
inline int v_extract_n(v_int32x8 a)
{
    return (int)v_extract_n<i>(v_reinterpret_as_u32(a));
}

template<int i>
inline uint64 v_extract_n(v_uint64x4 a)
{
    return (uint64)_v256_extract_epi64<i>(a.val);
}

template<int i>
inline int64 v_extract_n(v_int64x4 v)
{
    return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
}

template<int i>
inline float v_extract_n(v_float32x8 v)
{
    union { uint iv; float fv; } d;
    d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
    return d.fv;
}

template<int i>
inline double v_extract_n(v_float64x4 v)
{
    union { uint64 iv; double dv; } d;
    d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
    return d.dv;
}

template<int i>
inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
{
    static const __m256i perm = _mm256_set1_epi32((char)i);
    return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
}

template<int i>
inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }

template<int i>
inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
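// Usage sketch (illustrative comment, not in the original header): the lane
// index must be a compile-time constant, e.g. for a hypothetical v_float32x8 vf:
//
//   float f0 = v_extract_n<0>(vf);               // scalar copy of lane 0
//   v_float32x8 b = v_broadcast_element<3>(vf);  // all lanes = original lane 3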
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
{
    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));

    const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
    __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
    __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
    __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
    __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
    __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
    a = v_uint8x32(a0);
    b = v_uint8x32(b0);
}

inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
{
    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));

    const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
    __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
    __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
    __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
    __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
    __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
    __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
    a = v_uint16x16(a0);
    b = v_uint16x16(b0);
}

inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
{
    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));

    const int sh = 0+2*4+1*16+3*64;
    __m256i p0 = _mm256_shuffle_epi32(ab0, sh);
    __m256i p1 = _mm256_shuffle_epi32(ab1, sh);
    __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
    __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
    __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
    __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
    a = v_uint32x8(a0);
    b = v_uint32x8(b0);
}

inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
{
    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));

    __m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16);
    __m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16);
    __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
    __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
    a = v_uint64x4(a0);
    b = v_uint64x4(b0);
}
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c )
{
    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));

    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);

    const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
                                        0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
                                        -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);

    __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
    __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
    __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);

    const __m256i
    sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
                            0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
    sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
                            1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
    sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
                            2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
    b0 = _mm256_shuffle_epi8(b0, sh_b);
    g0 = _mm256_shuffle_epi8(g0, sh_g);
    r0 = _mm256_shuffle_epi8(r0, sh_r);

    a = v_uint8x32(b0);
    b = v_uint8x32(g0);
    c = v_uint8x32(r0);
}

inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c )
{
    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));

    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);

    const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
                                        0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
    const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
                                        -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
    __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
    __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
    __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
    const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
                                          0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
    const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
                                          2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
    const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
                                          4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
    b0 = _mm256_shuffle_epi8(b0, sh_b);
    g0 = _mm256_shuffle_epi8(g0, sh_g);
    r0 = _mm256_shuffle_epi8(r0, sh_r);

    a = v_uint16x16(b0);
    b = v_uint16x16(g0);
    c = v_uint16x16(r0);
}

inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
{
    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));

    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);

    __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92);
    __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
    __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);

    b0 = _mm256_shuffle_epi32(b0, 0x6c);
    g0 = _mm256_shuffle_epi32(g0, 0xb1);
    r0 = _mm256_shuffle_epi32(r0, 0xc6);

    a = v_uint32x8(b0);
    b = v_uint32x8(g0);
    c = v_uint32x8(r0);
}

inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c )
{
    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));

    __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0);
    __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0);
    __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b);
    __m256i b0 = _mm256_unpacklo_epi64(s01, s20r);
    __m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
    __m256i r0 = _mm256_unpackhi_epi64(s20r, s12);

    a = v_uint64x4(b0);
    b = v_uint64x4(g0);
    c = v_uint64x4(r0);
}
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d )
{
    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
    __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
    const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                        0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);

    __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
    __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
    __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
    __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);

    __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
    __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
    __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
    __m256i p23h = _mm256_unpackhi_epi32(p2, p3);

    __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
    __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
    __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
    __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);

    __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
    __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
    __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
    __m256i a0 = _mm256_unpackhi_epi32(phl, phh);

    a = v_uint8x32(b0);
    b = v_uint8x32(g0);
    c = v_uint8x32(r0);
    d = v_uint8x32(a0);
}

inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d )
{
    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
    __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));
    const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
                                        0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
    __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
    __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
    __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);

    __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
    __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
    __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
    __m256i p23h = _mm256_unpackhi_epi32(p2, p3);

    __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
    __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
    __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
    __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);

    __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
    __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
    __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
    __m256i a0 = _mm256_unpackhi_epi32(phl, phh);

    a = v_uint16x16(b0);
    b = v_uint16x16(g0);
    c = v_uint16x16(r0);
    d = v_uint16x16(a0);
}

inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
{
    __m256i p0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
    __m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
    __m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24));

    __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
    __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
    __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
    __m256i p23h = _mm256_unpackhi_epi32(p2, p3);

    __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
    __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
    __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
    __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);

    __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
    __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
    __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
    __m256i a0 = _mm256_unpackhi_epi32(phl, phh);

    a = v_uint32x8(b0);
    b = v_uint32x8(g0);
    c = v_uint32x8(r0);
    d = v_uint32x8(a0);
}

inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
{
    __m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
    __m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
    __m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12));

    __m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16);
    __m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16);
    __m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16);
    __m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16);

    __m256i b0 = _mm256_unpacklo_epi64(l02, l13);
    __m256i g0 = _mm256_unpackhi_epi64(l02, l13);
    __m256i r0 = _mm256_unpacklo_epi64(h02, h13);
    __m256i a0 = _mm256_unpackhi_epi64(h02, h13);

    a = v_uint64x4(b0);
    b = v_uint64x4(g0);
    c = v_uint64x4(r0);
    d = v_uint64x4(a0);
}
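// Usage sketch (illustrative comment, not in the original header): the
// v_load_deinterleave overloads split packed multi-channel data into one
// register per channel, e.g. for a BGR row of a hypothetical 8-bit image:
//
//   v_uint8x32 vb, vg, vr;
//   v_load_deinterleave(row, vb, vg, vr);   // reads 96 interleaved bytes
//   // ...process channels independently, then use v_store_interleave() below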
inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
    __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);

    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, xy0);
        _mm256_stream_si256((__m256i*)(ptr + 32), xy1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, xy0);
        _mm256_store_si256((__m256i*)(ptr + 32), xy1);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, xy0);
        _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
    }
}

inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val);
    __m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val);

    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, xy0);
        _mm256_stream_si256((__m256i*)(ptr + 16), xy1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, xy0);
        _mm256_store_si256((__m256i*)(ptr + 16), xy1);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, xy0);
        _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
    }
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val);
    __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val);

    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, xy0);
        _mm256_stream_si256((__m256i*)(ptr + 8), xy1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, xy0);
        _mm256_store_si256((__m256i*)(ptr + 8), xy1);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, xy0);
        _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
    }
}

inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val);
    __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val);

    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, xy0);
        _mm256_stream_si256((__m256i*)(ptr + 4), xy1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, xy0);
        _mm256_store_si256((__m256i*)(ptr + 4), xy1);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, xy0);
        _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
    }
}
inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    const __m256i sh_b = _mm256_setr_epi8(
            0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
            0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
    const __m256i sh_g = _mm256_setr_epi8(
            5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
            5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
    const __m256i sh_r = _mm256_setr_epi8(
            10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
            10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);

    __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
    __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
    __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);

    const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
                                        0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
                                        0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);

    __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
    __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
    __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);

    __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
    __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
    __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, bgr0);
        _mm256_stream_si256((__m256i*)(ptr + 32), bgr1);
        _mm256_stream_si256((__m256i*)(ptr + 64), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, bgr0);
        _mm256_store_si256((__m256i*)(ptr + 32), bgr1);
        _mm256_store_si256((__m256i*)(ptr + 64), bgr2);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, bgr0);
        _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
        _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
    }
}

inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    const __m256i sh_b = _mm256_setr_epi8(
         0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
         0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
    const __m256i sh_g = _mm256_setr_epi8(
         10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
         10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
    const __m256i sh_r = _mm256_setr_epi8(
         4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
         4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);

    __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
    __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
    __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);

    const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
                                        0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
    const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
                                        -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);

    __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
    __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
    __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);

    __m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16);
    __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, bgr0);
        _mm256_stream_si256((__m256i*)(ptr + 16), p1);
        _mm256_stream_si256((__m256i*)(ptr + 32), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, bgr0);
        _mm256_store_si256((__m256i*)(ptr + 16), p1);
        _mm256_store_si256((__m256i*)(ptr + 32), bgr2);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, bgr0);
        _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
        _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
    }
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i b0 = _mm256_shuffle_epi32(a.val, 0x6c);
    __m256i g0 = _mm256_shuffle_epi32(b.val, 0xb1);
    __m256i r0 = _mm256_shuffle_epi32(c.val, 0xc6);

    __m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24);
    __m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
    __m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24);

    __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
    __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, bgr0);
        _mm256_stream_si256((__m256i*)(ptr + 8), p2);
        _mm256_stream_si256((__m256i*)(ptr + 16), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, bgr0);
        _mm256_store_si256((__m256i*)(ptr + 8), p2);
        _mm256_store_si256((__m256i*)(ptr + 16), bgr2);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, bgr0);
        _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
        _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
    }
}

inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i s01 = _mm256_unpacklo_epi64(a.val, b.val);
    __m256i s12 = _mm256_unpackhi_epi64(b.val, c.val);
    __m256i s20 = _mm256_blend_epi32(c.val, a.val, 0xcc);

    __m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16);
    __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
    __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, bgr0);
        _mm256_stream_si256((__m256i*)(ptr + 4), bgr1);
        _mm256_stream_si256((__m256i*)(ptr + 8), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, bgr0);
        _mm256_store_si256((__m256i*)(ptr + 4), bgr1);
        _mm256_store_si256((__m256i*)(ptr + 8), bgr2);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, bgr0);
        _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
        _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
    }
}
inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b,
                                const v_uint8x32& c, const v_uint8x32& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i bg0 = _mm256_unpacklo_epi8(a.val, b.val);
    __m256i bg1 = _mm256_unpackhi_epi8(a.val, b.val);
    __m256i ra0 = _mm256_unpacklo_epi8(c.val, d.val);
    __m256i ra1 = _mm256_unpackhi_epi8(c.val, d.val);

    __m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0);
    __m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
    __m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
    __m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);

    __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
    __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, bgra0);
        _mm256_stream_si256((__m256i*)(ptr + 32), bgra1);
        _mm256_stream_si256((__m256i*)(ptr + 64), bgra2);
        _mm256_stream_si256((__m256i*)(ptr + 96), bgra3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, bgra0);
        _mm256_store_si256((__m256i*)(ptr + 32), bgra1);
        _mm256_store_si256((__m256i*)(ptr + 64), bgra2);
        _mm256_store_si256((__m256i*)(ptr + 96), bgra3);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, bgra0);
        _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
        _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
        _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
    }
}

inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b,
                                const v_uint16x16& c, const v_uint16x16& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i bg0 = _mm256_unpacklo_epi16(a.val, b.val);
    __m256i bg1 = _mm256_unpackhi_epi16(a.val, b.val);
    __m256i ra0 = _mm256_unpacklo_epi16(c.val, d.val);
    __m256i ra1 = _mm256_unpackhi_epi16(c.val, d.val);

    __m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0);
    __m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
    __m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1);
    __m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1);

    __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
    __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, bgra0);
        _mm256_stream_si256((__m256i*)(ptr + 16), bgra1);
        _mm256_stream_si256((__m256i*)(ptr + 32), bgra2);
        _mm256_stream_si256((__m256i*)(ptr + 48), bgra3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, bgra0);
        _mm256_store_si256((__m256i*)(ptr + 16), bgra1);
        _mm256_store_si256((__m256i*)(ptr + 32), bgra2);
        _mm256_store_si256((__m256i*)(ptr + 48), bgra3);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, bgra0);
        _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
        _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
        _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
    }
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b,
                                const v_uint32x8& c, const v_uint32x8& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i bg0 = _mm256_unpacklo_epi32(a.val, b.val);
    __m256i bg1 = _mm256_unpackhi_epi32(a.val, b.val);
    __m256i ra0 = _mm256_unpacklo_epi32(c.val, d.val);
    __m256i ra1 = _mm256_unpackhi_epi32(c.val, d.val);

    __m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0);
    __m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0);
    __m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1);
    __m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1);

    __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
    __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, bgra0);
        _mm256_stream_si256((__m256i*)(ptr + 8), bgra1);
        _mm256_stream_si256((__m256i*)(ptr + 16), bgra2);
        _mm256_stream_si256((__m256i*)(ptr + 24), bgra3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, bgra0);
        _mm256_store_si256((__m256i*)(ptr + 8), bgra1);
        _mm256_store_si256((__m256i*)(ptr + 16), bgra2);
        _mm256_store_si256((__m256i*)(ptr + 24), bgra3);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, bgra0);
        _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
        _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
        _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
    }
}

inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b,
                                const v_uint64x4& c, const v_uint64x4& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i bg0 = _mm256_unpacklo_epi64(a.val, b.val);
    __m256i bg1 = _mm256_unpackhi_epi64(a.val, b.val);
    __m256i ra0 = _mm256_unpacklo_epi64(c.val, d.val);
    __m256i ra1 = _mm256_unpackhi_epi64(c.val, d.val);

    __m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16);
    __m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16);
    __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16);
    __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, bgra0);
        _mm256_stream_si256((__m256i*)(ptr + 4), bgra1);
        _mm256_stream_si256((__m256i*)(ptr + 8), bgra2);
        _mm256_stream_si256((__m256i*)(ptr + 12), bgra3);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, bgra0);
        _mm256_store_si256((__m256i*)(ptr + 4), bgra1);
        _mm256_store_si256((__m256i*)(ptr + 8), bgra2);
        _mm256_store_si256((__m256i*)(ptr + 12), bgra3);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, bgra0);
        _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
        _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
        _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
    }
}
#define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 )                     \
{                                                                                                \
    _Tpvec1 a1, b1;                                                                              \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1);                                               \
    a0 = v_reinterpret_as_##suffix0(a1);                                                         \
    b0 = v_reinterpret_as_##suffix0(b1);                                                         \
}                                                                                                \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 )        \
{                                                                                                \
    _Tpvec1 a1, b1, c1;                                                                          \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1);                                           \
    a0 = v_reinterpret_as_##suffix0(a1);                                                         \
    b0 = v_reinterpret_as_##suffix0(b1);                                                         \
    c0 = v_reinterpret_as_##suffix0(c1);                                                         \
}                                                                                                \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{                                                                                                \
    _Tpvec1 a1, b1, c1, d1;                                                                      \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1);                                       \
    a0 = v_reinterpret_as_##suffix0(a1);                                                         \
    b0 = v_reinterpret_as_##suffix0(b1);                                                         \
    c0 = v_reinterpret_as_##suffix0(c1);                                                         \
    d0 = v_reinterpret_as_##suffix0(d1);                                                         \
}                                                                                                \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0,                 \
                                hal::StoreMode mode=hal::STORE_UNALIGNED )                       \
{                                                                                                \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                                 \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                                 \
    v_store_interleave((_Tp1*)ptr, a1, b1, mode);                                                \
}                                                                                                \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED )                       \
{                                                                                                \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                                 \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                                 \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0);                                                 \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode);                                            \
}                                                                                                \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0,                 \
                                const _Tpvec0& c0, const _Tpvec0& d0,                            \
                                hal::StoreMode mode=hal::STORE_UNALIGNED )                       \
{                                                                                                \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                                 \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                                 \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0);                                                 \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0);                                                 \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode);                                        \
}

OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32,   schar,  s8,  v_uint8x32,  uchar,    u8)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16,  short,  s16, v_uint16x16, ushort,   u16)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8,   int,    s32, v_uint32x8,  unsigned, u32)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float,  f32, v_uint32x8,  unsigned, u32)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4,   int64,  s64, v_uint64x4,  uint64,   u64)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4,  uint64,   u64)
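// Usage sketch (illustrative comment, not in the original header): the macro
// above routes the signed/float element types through their unsigned
// counterparts, so all of these call the same byte-shuffling kernels, e.g.
//
//   v_float32x8 xs, ys;
//   v_load_deinterleave(xy_ptr, xs, ys);          // dispatched via the u32 overload
//   v_store_interleave(xy_ptr, xs, ys, hal::STORE_ALIGNED);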
//
// FP16
//

inline v_float32x8 v256_load_expand(const float16_t* ptr)
{
#if CV_FP16
    return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
#else
    float CV_DECL_ALIGNED(32) buf[8];
    for (int i = 0; i < 8; i++)
        buf[i] = (float)ptr[i];
    return v256_load_aligned(buf);
#endif
}

inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
{
#if CV_FP16
    __m128i ah = _mm256_cvtps_ph(a.val, 0);
    _mm_storeu_si128((__m128i*)ptr, ah);
#else
    float CV_DECL_ALIGNED(32) buf[8];
    v_store_aligned(buf, a);
    for (int i = 0; i < 8; i++)
        ptr[i] = float16_t(buf[i]);
#endif
}
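// Usage sketch (illustrative comment, not in the original header): half-float
// buffers are widened to f32 for arithmetic and narrowed back on store, e.g.
//
//   v_float32x8 v = v256_load_expand(src_f16);   // 8 x fp16 -> 8 x fp32
//   v = v * v256_setall_f32(2.f);                // any f32 math
//   v_pack_store(dst_f16, v);                    // 8 x fp32 -> 8 x fp16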
inline void v256_cleanup() { _mm256_zeroall(); }

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

} // cv::

#endif // OPENCV_HAL_INTRIN_AVX_HPP
v_reg< _Tp, n > v_absdiffs(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Saturating absolute difference
Definition:
intrin_cpp.hpp:997
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_expand_high(const v_reg< _Tp, n > &a)
Expand higher values to the wider pack type
Definition:
intrin_cpp.hpp:1518
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_dotprod_fast(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Fast Dot product of elements
Definition:
intrin_cpp.hpp:1119
v_reg< _Tp, n > v_mul_hi(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Multiply and extract high part
Definition:
intrin_cpp.hpp:1236
v_reg< float, n > v_cvt_f32(const v_reg< int, n > &a)
Convert to float
Definition:
intrin_cpp.hpp:2537
v_reg< float, n > v_matmuladd(const v_reg< float, n > &v, const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Matrix multiplication and add
Definition:
intrin_cpp.hpp:3226
_Tp v_extract_n(const v_reg< _Tp, n > &v)
Vector extract
Definition:
intrin_cpp.hpp:2400
v_reg< float, n > v_not_nan(const v_reg< float, n > &a)
Less-than comparison
Definition:
intrin_cpp.hpp:893
v_reg< typename V_TypeTraits< _Tp >::abs_type, n > v_popcount(const v_reg< _Tp, n > &a)
Count the 1 bits in the vector lanes and return result as corresponding unsigned type
Definition:
intrin_cpp.hpp:827
void v_store_aligned(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory (aligned)
Definition:
intrin_cpp.hpp:2254
void v_load_deinterleave(const _Tp *ptr, v_reg< _Tp, n > &a, v_reg< _Tp, n > &b)
Load and deinterleave (2 channels)
Definition:
intrin_cpp.hpp:2046
CV_EXPORTS void swap(Mat &a, Mat &b)
Swaps two matrices
"black box" representation of the file storage associated with a file on disk.
Definition:
aruco.hpp:75