#ifndef OPENCV_HAL_SSE_HPP
#define OPENCV_HAL_SSE_HPP

#include "opencv2/core/utility.hpp"

#define CV_SIMD128_64F 1
#define CV_SIMD128_FP16 0

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
struct v_uint8x16
{
    typedef uchar lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 16 };

    explicit v_uint8x16(__m128i v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                            (char)v4, (char)v5, (char)v6, (char)v7,
                            (char)v8, (char)v9, (char)v10, (char)v11,
                            (char)v12, (char)v13, (char)v14, (char)v15);
    }

    uchar get0() const
    {
        return (uchar)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};
struct v_int8x16
{
    typedef schar lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 16 };

    explicit v_int8x16(__m128i v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                            (char)v4, (char)v5, (char)v6, (char)v7,
                            (char)v8, (char)v9, (char)v10, (char)v11,
                            (char)v12, (char)v13, (char)v14, (char)v15);
    }

    schar get0() const
    {
        return (schar)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};
struct v_uint16x8
{
    typedef ushort lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 8 };

    explicit v_uint16x8(__m128i v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }

    ushort get0() const
    {
        return (ushort)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};
struct v_int16x8
{
    typedef short lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 8 };

    explicit v_int16x8(__m128i v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }

    short get0() const
    {
        return (short)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};
struct v_uint32x4
{
    typedef unsigned lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 4 };

    explicit v_uint32x4(__m128i v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
    }

    unsigned get0() const
    {
        return (unsigned)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};
struct v_int32x4
{
    typedef int lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 4 };

    explicit v_int32x4(__m128i v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        val = _mm_setr_epi32(v0, v1, v2, v3);
    }

    int get0() const
    {
        return _mm_cvtsi128_si32(val);
    }

    __m128i val;
};
struct v_float32x4
{
    typedef float lane_type;
    typedef __m128 vector_type;
    enum { nlanes = 4 };

    explicit v_float32x4(__m128 v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        val = _mm_setr_ps(v0, v1, v2, v3);
    }

    float get0() const
    {
        return _mm_cvtss_f32(val);
    }

    __m128 val;
};
struct v_uint64x2
{
    typedef uint64 lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 2 };

    explicit v_uint64x2(__m128i v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    }

    uint64 get0() const
    {
    #if !defined(__x86_64__) && !defined(_M_X64)
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
    #else
        return (uint64)_mm_cvtsi128_si64(val);
    #endif
    }

    __m128i val;
};
struct v_int64x2
{
    typedef int64 lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 2 };

    explicit v_int64x2(__m128i v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    }

    int64 get0() const
    {
    #if !defined(__x86_64__) && !defined(_M_X64)
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    #else
        return _mm_cvtsi128_si64(val);
    #endif
    }

    __m128i val;
};
struct v_float64x2
{
    typedef double lane_type;
    typedef __m128d vector_type;
    enum { nlanes = 2 };

    explicit v_float64x2(__m128d v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        val = _mm_setr_pd(v0, v1);
    }

    double get0() const
    {
        return _mm_cvtsd_f64(val);
    }

    __m128d val;
};
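// Usage sketch (illustrative addition, not part of the original header): each wrapper
// holds one 128-bit register and exposes lane 0 through get0(), e.g.
//   v_float32x4 v(1.f, 2.f, 3.f, 4.f);
//   float first = v.get0();   // 1.f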
namespace hal_sse_internal
{
    template <typename to_sse_type, typename from_sse_type>
    to_sse_type v_sse_reinterpret_as(const from_sse_type& val);

#define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
    template<> inline \
    to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
    { return sse_cast_intrin(a); }

    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP)
}
#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }

OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
template<typename _Tpvec> inline
v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(OPENCV_HAL_NOP(a.val)); }
template<typename _Tpvec> inline
v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(OPENCV_HAL_NOP(a.val)); }
#define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
{ return _Tpvec(_mm_castps_si128(a.val)); } \
inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
{ return _Tpvec(_mm_castpd_si128(a.val)); }

OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i delta = _mm_set1_epi16(255);
    return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
                                       _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
}

inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
{
    __m128i delta = _mm_set1_epi16(255);
    __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }

inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
template<int n> inline
v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
                                       _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

template<int n> inline
v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                       _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}
inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); }

inline void v_pack_store(schar* ptr, const v_int16x8& a)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }

template<int n> inline
v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                     _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
}
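// Usage sketch (illustrative addition, not part of the original header): v_pack narrows with
// saturation and v_rshr_pack<n> additionally rounds by (x + (1 << (n-1))) >> n, e.g.
//   v_int16x8  w = v_setall_s16(1000);
//   v_int8x16  s = v_pack(w, w);                                            // saturates to 127
//   v_uint8x16 u = v_rshr_pack<4>(v_setall_u16(1000), v_setall_u16(1000));  // (1000+8)>>4 = 63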
inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
{
#if CV_SSE4_1
    return _mm_blendv_epi8(b, a, mask);
#else
    return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
#endif
}

inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
{ return v_uint16x8(_v128_packs_epu32(a.val, b.val)); }

inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
{
    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    __m128i r = _mm_packs_epi32(a1, a1);
    _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}
template<int n> inline
v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
    return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
}

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, a2);
}
inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    return v_uint16x8(_mm_packus_epi32(a.val, b.val));
#else
    __m128i delta32 = _mm_set1_epi32(32768);

    // preliminarily saturate negative values to zero
    __m128i a1 = _mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, _mm_set1_epi32(0)));
    __m128i b1 = _mm_and_si128(b.val, _mm_cmpgt_epi32(b.val, _mm_set1_epi32(0)));

    __m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32));
    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
#endif
}

inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
{
#if CV_SSE4_1
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a.val, a.val));
#else
    __m128i delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(a.val, delta32);
    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, r);
#endif
}
template<int n> inline
v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    __m128i delta = _mm_set1_epi32(1 << (n - 1));
    return v_uint16x8(_mm_packus_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
                                       _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
#else
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
    __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
    return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
#endif
}

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{
#if CV_SSE4_1
    __m128i delta = _mm_set1_epi32(1 << (n - 1));
    __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a1, a1));
#else
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, a2);
#endif
}
inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); }

inline void v_pack_store(short* ptr, const v_int32x4& a)
{
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
}

template<int n> inline
v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1));
    return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
                                     _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1));
    __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
}
inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val);
    return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
}

inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a1);
}

inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val);
    return v_int32x4(_mm_unpacklo_epi32(v0, v1));
}

inline void v_pack_store(int* ptr, const v_int64x2& a)
{
    __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a1);
}
template<int n> inline
v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    uint64 delta = (uint64)1 << (n-1);
    v_uint64x2 delta2(delta, delta);
    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
    __m128i v0 = _mm_unpacklo_epi32(a1, b1);
    __m128i v1 = _mm_unpackhi_epi32(a1, b1);
    return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
}

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    uint64 delta = (uint64)1 << (n-1);
    v_uint64x2 delta2(delta, delta);
    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline __m128i v_sign_epi64(__m128i a)
{
    return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1));
}

inline __m128i v_srai_epi64(__m128i a, int imm)
{
    __m128i smask = v_sign_epi64(a);
    return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
}

template<int n> inline
v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
{
    int64 delta = (int64)1 << (n-1);
    v_int64x2 delta2(delta, delta);
    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
    __m128i v0 = _mm_unpacklo_epi32(a1, b1);
    __m128i v1 = _mm_unpackhi_epi32(a1, b1);
    return v_int32x4(_mm_unpacklo_epi32(v0, v1));
}

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x2& a)
{
    int64 delta = (int64)1 << (n-1);
    v_int64x2 delta2(delta, delta);
    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a2);
}
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i ab = _mm_packs_epi16(a.val, b.val);
    return v_uint8x16(ab);
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    __m128i ab = _mm_packs_epi32(a.val, b.val);
    __m128i cd = _mm_packs_epi32(c.val, d.val);
    return v_uint8x16(_mm_packs_epi16(ab, cd));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    __m128i ab = _mm_packs_epi32(a.val, b.val);
    __m128i cd = _mm_packs_epi32(c.val, d.val);
    __m128i ef = _mm_packs_epi32(e.val, f.val);
    __m128i gh = _mm_packs_epi32(g.val, h.val);

    __m128i abcd = _mm_packs_epi32(ab, cd);
    __m128i efgh = _mm_packs_epi32(ef, gh);
    return v_uint8x16(_mm_packs_epi16(abcd, efgh));
}
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
    __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);

    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);

    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
}
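// Usage sketch (illustrative addition, not part of the original header): v_matmul computes
// v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, i.e. a 4x4 matrix-vector product with m0..m3 as columns.
// Given columns m0..m3:
//   v_float32x4 x(1.f, 0.f, 0.f, 0.f);
//   v_float32x4 y = v_matmul(x, m0, m1, m2, m3);   // equals m0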
#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { \
        return _Tpvec(intrin(a.val, b.val)); \
    } \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    { \
        a.val = intrin(a.val, b.val); \
        return a; \
    }

OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
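// Usage sketch (illustrative addition, not part of the original header): 8- and 16-bit
// integer addition/subtraction above is saturating, e.g.
//   v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//   v_uint8x16 c = a + b;   // every lane is 255 (_mm_adds_epu8), not 44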
#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \
    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
    { \
        _Tpwvec c, d; \
        v_mul_expand(a, b, c, d); \
        return v_pack(c, d); \
    } \
    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
    { a = a * b; return a; }

inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    v_uint16x8 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    v_int16x8 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
    c.val = _mm_unpacklo_epi16(v0, v1);
    d.val = _mm_unpackhi_epi16(v0, v1);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
    c.val = _mm_unpacklo_epi16(v0, v1);
    d.val = _mm_unpackhi_epi16(v0, v1);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    c.val = _mm_unpacklo_epi64(c0, c1);
    d.val = _mm_unpackhi_epi64(c0, c1);
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); }

inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    __m128i even = _mm_mul_epi32(a.val, b.val);
    __m128i odd = _mm_mul_epi32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    return v_int64x2(_mm_add_epi64(even, odd));
#else
    __m128i even_u = _mm_mul_epu32(a.val, b.val);
    __m128i odd_u = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    // correct the unsigned products back to signed ones
    __m128i a_sign = _mm_srai_epi32(a.val, 31);
    __m128i b_sign = _mm_srai_epi32(b.val, 31);
    __m128i axb = _mm_and_si128(a.val, b_sign);
    __m128i bxa = _mm_and_si128(b.val, a_sign);
    __m128i ssum = _mm_add_epi32(bxa, axb);
    __m128i even_ssum = _mm_slli_epi64(ssum, 32);
    __m128i odd_ssum = _mm_and_si128(ssum, _mm_set_epi32(-1, 0, -1, 0));
    return v_int64x2(_mm_add_epi64(_mm_sub_epi64(even_u, even_ssum), _mm_sub_epi64(odd_u, odd_ssum)));
#endif
}
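// Usage sketch (illustrative addition, not part of the original header): v_dotprod multiplies
// and adds adjacent lane pairs, e.g.
//   v_int16x8 a = v_setall_s16(2), b = v_setall_s16(3);
//   v_int32x4 d = v_dotprod(a, b);   // each 32-bit lane is 2*3 + 2*3 = 12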
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i a0 = _mm_srli_epi16(_mm_slli_si128(a.val, 1), 8);
    __m128i a1 = _mm_srli_epi16(a.val, 8);
    __m128i b0 = _mm_srli_epi16(_mm_slli_si128(b.val, 1), 8);
    __m128i b1 = _mm_srli_epi16(b.val, 8);
    __m128i p0 = _mm_madd_epi16(a0, b0);
    __m128i p1 = _mm_madd_epi16(a1, b1);
    return v_uint32x4(_mm_add_epi32(p0, p1));
}

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    __m128i a0 = _mm_srai_epi16(_mm_slli_si128(a.val, 1), 8);
    __m128i a1 = _mm_srai_epi16(a.val, 8);
    __m128i b0 = _mm_srai_epi16(_mm_slli_si128(b.val, 1), 8);
    __m128i b1 = _mm_srai_epi16(b.val, 8);
    __m128i p0 = _mm_madd_epi16(a0, b0);
    __m128i p1 = _mm_madd_epi16(a1, b1);
    return v_int32x4(_mm_add_epi32(p0, p1));
}
        _mm_unpacklo_epi64(c0.val, d0.val),
        _mm_unpackhi_epi64(c0.val, d0.val)

        _mm_unpacklo_epi64(c.val, d.val),
        _mm_unpackhi_epi64(c.val, d.val)

        _mm_unpacklo_pd(c.val, d.val),
        _mm_unpackhi_pd(c.val, d.val)

    __m128i p0 = _mm_madd_epi16(a0, b0);
    __m128i p1 = _mm_madd_epi16(a1, b1);

    __m128i a0 = _mm_cvtepi8_epi16(a.val);
    __m128i b0 = _mm_cvtepi8_epi16(b.val);
    __m128i p0 = _mm_madd_epi16(a0, b0);
    __m128i p1 = _mm_madd_epi16(a1, b1);
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
    OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
    OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
    OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { \
        return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
    }

OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
    __m128 t = x.val;
    __m128 h = _mm_mul_ps(t, _0_5);
    t = _mm_rsqrt_ps(t);
    t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
    return v_float32x4(t);
}

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    const __m128d v_1 = _mm_set1_pd(1.);
    return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
}
#define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
inline _Tpuvec v_abs(const _Tpsvec& x) \
{ return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }

inline v_uint32x4 v_abs(const v_int32x4& x)
{
    __m128i s = _mm_srli_epi32(x.val, 31);
    __m128i f = _mm_srai_epi32(x.val, 31);
    return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
}

inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }

inline v_float64x2 v_abs(const v_float64x2& x)
{
    return v_float64x2(_mm_and_pd(x.val,
        _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
}
#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
{
#if CV_SSE4_1
    return v_int8x16(_mm_min_epi8(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
#endif
}

inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
{
#if CV_SSE4_1
    return v_int8x16(_mm_max_epi8(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
#endif
}

inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
{
#if CV_SSE4_1
    return v_uint16x8(_mm_min_epu16(a.val, b.val));
#else
    return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
#endif
}

inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
{
#if CV_SSE4_1
    return v_uint16x8(_mm_max_epu16(a.val, b.val));
#else
    return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
#endif
}

inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
{
#if CV_SSE4_1
    return v_uint32x4(_mm_min_epu32(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, b.val, a.val));
#endif
}

inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
{
#if CV_SSE4_1
    return v_uint32x4(_mm_max_epu32(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, a.val, b.val));
#endif
}

inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    return v_int32x4(_mm_min_epi32(a.val, b.val));
#else
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
#endif
}

inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    return v_int32x4(_mm_max_epi32(a.val, b.val));
#else
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
#endif
}
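// Usage sketch (illustrative addition, not part of the original header):
//   v_uint32x4 a(1, 50, 3, 70), b(40, 2, 60, 7);
//   v_uint32x4 lo = v_min(a, b);   // {1, 2, 3, 7}
//   v_uint32x4 hi = v_max(a, b);   // {40, 50, 60, 70}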
#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
} \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
} \
inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
} \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
} \
inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
}
#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
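// Usage sketch (illustrative addition, not part of the original header): comparisons return
// per-lane all-ones / all-zeros masks, e.g.
//   v_float32x4 a = v_setall_f32(1.f), b = v_setall_f32(2.f);
//   v_float32x4 mask = (a < b);   // every lane holds the bit pattern 0xffffffff
// Such masks are typically consumed by v_select(), defined further down in this header.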
#if CV_SSE4_1

#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }

#else

#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \
  return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }

#endif

OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2)
inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(_mm_cmpord_ps(a.val, a.val)); }
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(_mm_cmpord_pd(a.val, a.val)); }
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_mul_wrap, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_mul_wrap, _mm_mullo_epi16)
inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i ad = _mm_srai_epi16(a.val, 8);
    __m128i bd = _mm_srai_epi16(b.val, 8);
    __m128i p0 = _mm_mullo_epi16(a.val, b.val);
    __m128i p1 = _mm_slli_epi16(_mm_mullo_epi16(ad, bd), 8);
    const __m128i b01 = _mm_set1_epi32(0xFF00FF00);
    return v_uint8x16(_v128_blendv_epi8(p0, p1, b01));
}

inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
{
    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}
inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{ return v_max(a, b) - v_min(a, b); }

inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{
    v_int8x16 d = v_sub_wrap(a, b);
    v_int8x16 m = a < b;
    return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}

inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{
    return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
}

inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
    v_int32x4 d = a - b;
    v_int32x4 m = a < b;
    return v_reinterpret_as_u32((d ^ m) - m);
}

inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_max(a, b) - v_min(a, b); }
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
#if CV_FMA3
    return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));
#else
    return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val));
#endif
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
#if CV_FMA3
    return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));
#else
    return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val));
#endif
}
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
    return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpvec res = v_fma(a, a, b*b); \
    return _Tpvec(_mm_sqrt_##suffix(res.val)); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_fma(a, a, b*b); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
    return v_fma(a, b, c); \
}

OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shl(const _Tpsvec& a) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shr(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shr(const _Tpsvec& a) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
}
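// Usage sketch (illustrative addition, not part of the original header; the macro above is
// instantiated for the integer vector types elsewhere in the full header):
//   v_uint16x8 x = v_setall_u16(0x00f0);
//   v_uint16x8 y = x << 4;        // 0x0f00 in every lane
//   v_uint16x8 z = v_shr<4>(x);   // 0x000f in every lane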
namespace hal_sse_internal
{
    template <int imm,
        bool is_invalid = ((imm < 0) || (imm > 16)),
        bool is_first = (imm == 0),
        bool is_half = (imm == 8),
        bool is_second = (imm == 16),
        bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
    class v_sse_palignr_u8_class;

    template <int imm>
    class v_sse_palignr_u8_class<imm, true, false, false, false, false>;

    template <int imm>
    class v_sse_palignr_u8_class<imm, false, true, false, false, false>
    {
    public:
        inline __m128i operator()(const __m128i& a, const __m128i&) const
        {
            return a;
        }
    };

    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, true, false, false>
    {
    public:
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        {
            return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);
        }
    };

    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, false, true, false>
    {
    public:
        inline __m128i operator()(const __m128i&, const __m128i& b) const
        {
            return b;
        }
    };

    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, false, false, true>
    {
#if CV_SSSE3
    public:
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        {
            return _mm_alignr_epi8(b, a, imm);
        }
#else
    public:
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        {
            enum { imm2 = (sizeof(__m128i) - imm) };
            return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));
        }
#endif
    };

    template<int imm>
    inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
    {
        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
        return v_sse_palignr_u8_class<imm>()(a, b);
    }
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a)
{
    using namespace hal_sse_internal;
    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        _mm_srli_si128(
            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a)
{
    using namespace hal_sse_internal;
    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        _mm_slli_si128(
            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
{
    using namespace hal_sse_internal;
    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        v_sse_palignr_u8<imm2>(
            v_sse_reinterpret_as<__m128i>(a.val),
            v_sse_reinterpret_as<__m128i>(b.val))));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
{
    using namespace hal_sse_internal;
    enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        v_sse_palignr_u8<imm2>(
            v_sse_reinterpret_as<__m128i>(b.val),
            v_sse_reinterpret_as<__m128i>(a.val))));
}
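// Usage sketch (illustrative addition, not part of the original header): rotation is by lanes,
// not bits; the single-argument forms shift in zeros, e.g.
//   v_int32x4 a(1, 2, 3, 4);
//   v_int32x4 r = v_rotate_right<1>(a);   // {2, 3, 4, 0}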
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                                     _mm_loadl_epi64((const __m128i*)ptr1))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ _mm_stream_si128((__m128i*)ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ \
    if( mode == hal::STORE_UNALIGNED ) \
        _mm_storeu_si128((__m128i*)ptr, a.val); \
    else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
        _mm_stream_si128((__m128i*)ptr, a.val); \
    else \
        _mm_store_si128((__m128i*)ptr, a.val); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }

OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_##suffix(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_castsi128_##suffix( \
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                           _mm_loadl_epi64((const __m128i*)ptr1)))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ _mm_stream_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ \
    if( mode == hal::STORE_UNALIGNED ) \
        _mm_storeu_##suffix(ptr, a.val); \
    else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
        _mm_stream_##suffix(ptr, a.val); \
    else \
        _mm_store_##suffix(ptr, a.val); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    __m128i a1 = _mm_cast##suffix##_si128(a.val); \
    _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
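// Usage sketch (illustrative addition, not part of the original header):
//   float buf[4] = { 1.f, 2.f, 3.f, 4.f };
//   v_float32x4 v = v_load(buf);   // unaligned load
//   v_store(buf, v + v);           // buf becomes { 2, 4, 6, 8 }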
inline unsigned v_reduce_sum(const v_uint8x16& a)
{
    __m128i half = _mm_sad_epu8(a.val, _mm_setzero_si128());
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
}

inline int v_reduce_sum(const v_int8x16& a)
{
    __m128i half = _mm_set1_epi8((schar)-128);
    half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_setzero_si128());
    return _mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))) - 2048;
}
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
inline schar v_reduce_##func(const v_int8x16& a) \
{ \
    __m128i val = a.val; \
    __m128i smask = _mm_set1_epi8((schar)-128); \
    val = _mm_xor_si128(val, smask); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
    return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
} \
inline uchar v_reduce_##func(const v_uint8x16& a) \
{ \
    __m128i val = a.val; \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
    return (uchar)_mm_cvtsi128_si32(val); \
}

OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min)
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
    return (scalartype)_mm_cvtsi128_si32(val); \
} \
inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    __m128i smask = _mm_set1_epi16(sbit); \
    val = _mm_xor_si128(val, smask); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
    return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
}

OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    regtype val = a.val; \
    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
    return (scalartype)_mm_cvt##extract(val); \
}

#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    scalartype CV_DECL_ALIGNED(16) buf[4]; \
    v_store_aligned(buf, a); \
    scalartype s0 = scalar_func(buf[0], buf[1]); \
    scalartype s1 = scalar_func(buf[2], buf[3]); \
    return scalar_func(s0, s1); \
}

OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
inline uint64 v_reduce_sum(const v_uint64x2& a)
{
    uint64 CV_DECL_ALIGNED(32) idx[2];
    v_store_aligned(idx, a);
    return idx[0] + idx[1];
}
inline int64 v_reduce_sum(const v_int64x2& a)
{
    int64 CV_DECL_ALIGNED(32) idx[2];
    v_store_aligned(idx, a);
    return idx[0] + idx[1];
}
inline double v_reduce_sum(const v_float64x2& a)
{
    double CV_DECL_ALIGNED(32) idx[2];
    v_store_aligned(idx, a);
    return idx[0] + idx[1];
}
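// Usage sketch (illustrative addition, not part of the original header):
//   v_uint8x16 ones = v_setall_u8(1);
//   unsigned s = v_reduce_sum(ones);   // 16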
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
#if CV_SSE3
    __m128 ab = _mm_hadd_ps(a.val, b.val);
    __m128 cd = _mm_hadd_ps(c.val, d.val);
    return v_float32x4(_mm_hadd_ps(ab, cd));
#else
    __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
    __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
    return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
#endif
}
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i half = _mm_sad_epu8(a.val, b.val);
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
    __m128i half = _mm_set1_epi8(0x7f);
    half = _mm_sad_epu8(_mm_add_epi8(a.val, half), _mm_add_epi8(b.val, half));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
}
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{
    __m128i m1 = _mm_set1_epi32(0x55555555);
    __m128i m2 = _mm_set1_epi32(0x33333333);
    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
    __m128i p = a.val;
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
    return v_uint8x16(p);
}

inline v_uint16x8 v_popcount(const v_uint16x8& a)
{
    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
    p += v_rotate_right<1>(p);
    return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
}

inline v_uint32x4 v_popcount(const v_uint32x4& a)
{
    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
    p += v_rotate_right<1>(p);
    p += v_rotate_right<2>(p);
    return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
}

inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, cast_op, allmask) \
inline int v_signmask(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)); } \
inline bool v_check_all(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) == allmask; } \
inline bool v_check_any(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) != 0; }

OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, ps, _mm_castsi128_ps, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, ps, _mm_castsi128_ps, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint64x2, pd, _mm_castsi128_pd, 3)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int64x2, pd, _mm_castsi128_pd, 3)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, 3)

#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(_Tpvec) \
inline int v_signmask(const _Tpvec& a) { return _mm_movemask_epi8(_mm_packs_epi16(a.val, a.val)) & 255; } \
inline bool v_check_all(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) == 0xaaaa; } \
inline bool v_check_any(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) != 0; }

OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_uint16x8)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_int16x8)
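// Usage sketch (illustrative addition, not part of the original header): v_signmask packs the
// per-lane sign/mask bits into an int, e.g.
//   v_int32x4 a(-1, 2, -3, 4);
//   int  m   = v_signmask(a);    // 0b0101 = 5: bits set where the lane is negative
//   bool any = v_check_any(a);   // true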
#if CV_SSE4_1

#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); \
}

OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd)

#else

#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
}

OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)

#endif
#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    b0.val = intrin(a.val); \
    b1.val = __CV_CAT(intrin, _high)(a.val); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ return _Tpwvec(intrin(a.val)); } \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
    return _Tpwvec(intrin(a)); \
}

#define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin) \
inline _Tpvec v_load_expand_q(const _Tp* ptr) \
{ \
    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); \
    return _Tpvec(intrin(a)); \
}

OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_uint32x4, uchar, _v128_cvtepu8_epi32)
OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_int32x4, schar, _v128_cvtepi8_epi32)
#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
{ \
    b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
    b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
} \
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
} \
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
} \
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
    d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
}

OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
#if CV_SSSE3
    static const __m128i perm = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    return v_uint8x16(_mm_shuffle_epi8(a.val, perm));
#else
    uchar CV_DECL_ALIGNED(32) d[16];
    v_store_aligned(d, a);
    return v_uint8x16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8], d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]);
#endif
}

inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
#if CV_SSSE3
    static const __m128i perm = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
    return v_uint16x8(_mm_shuffle_epi8(a.val, perm));
#else
    __m128i r = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3));
    r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
    r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
    return v_uint16x8(r);
#endif
}

inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
    return v_uint32x4(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
}

inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
    return v_uint64x2(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(1, 0, 3, 2)));
}

inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{
    return v_rotate_right<s>(a, b);
}
inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(_mm_cvtps_epi32(a.val)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{
    __m128i a1 = _mm_cvtps_epi32(a.val);
    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
    return v_int32x4(_mm_add_epi32(a1, mask));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    __m128i a1 = _mm_cvtps_epi32(a.val);
    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
    return v_int32x4(_mm_sub_epi32(a1, mask));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(_mm_cvttps_epi32(a.val)); }

inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(_mm_cvtpd_epi32(a.val)); }

inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
    __m128i ai = _mm_cvtpd_epi32(a.val), bi = _mm_cvtpd_epi32(b.val);
    return v_int32x4(_mm_unpacklo_epi64(ai, bi));
}

inline v_int32x4 v_floor(const v_float64x2& a)
{
    __m128i a1 = _mm_cvtpd_epi32(a.val);
    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8);
    return v_int32x4(_mm_add_epi32(a1, mask));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    __m128i a1 = _mm_cvtpd_epi32(a.val);
    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8);
    return v_int32x4(_mm_sub_epi32(a1, mask));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{ return v_int32x4(_mm_cvttpd_epi32(a.val)); }
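// Usage sketch (illustrative addition, not part of the original header):
//   v_float32x4 x(1.5f, -1.5f, 2.3f, -2.3f);
//   v_int32x4 r = v_round(x);   // round-to-nearest-even via _mm_cvtps_epi32: {2, -2, 2, -2}
//   v_int32x4 f = v_floor(x);   // {1, -2, 2, -3}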
#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                           const _Tpvec& a2, const _Tpvec& a3, \
                           _Tpvec& b0, _Tpvec& b1, \
                           _Tpvec& b2, _Tpvec& b3) \
{ \
    __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
    __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
    __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
    __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
\
    b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
    b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
    b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
    b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
}

OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));

    __m128i t10 = _mm_unpacklo_epi8(t00, t01);
    __m128i t11 = _mm_unpackhi_epi8(t00, t01);

    __m128i t20 = _mm_unpacklo_epi8(t10, t11);
    __m128i t21 = _mm_unpackhi_epi8(t10, t11);

    __m128i t30 = _mm_unpacklo_epi8(t20, t21);
    __m128i t31 = _mm_unpackhi_epi8(t20, t21);

    a.val = _mm_unpacklo_epi8(t30, t31);
    b.val = _mm_unpackhi_epi8(t30, t31);
}
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
#if CV_SSE4_1
    const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
    const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
    __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
    __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
    __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
    const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
    const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
    const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
    a0 = _mm_shuffle_epi8(a0, sh_b);
    b0 = _mm_shuffle_epi8(b0, sh_g);
    c0 = _mm_shuffle_epi8(c0, sh_r);
    a.val = a0;
    b.val = b0;
    c.val = c0;
#elif CV_SSSE3
    const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
    const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
    const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);

    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));

    __m128i s0 = _mm_shuffle_epi8(t0, m0);
    __m128i s1 = _mm_shuffle_epi8(t1, m1);
    __m128i s2 = _mm_shuffle_epi8(t2, m2);

    t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5);
    a.val = _mm_alignr_epi8(s2, t0, 5);

    t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
    b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);

    t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
    c.val = _mm_alignr_epi8(t2, s0, 11);
#else
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));

    __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));

    __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
    __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
    __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));

    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
#endif
}
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
                                v_uint8x16& c, v_uint8x16& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48));

    __m128i v0 = _mm_unpacklo_epi8(u0, u2);
    __m128i v1 = _mm_unpackhi_epi8(u0, u2);
    __m128i v2 = _mm_unpacklo_epi8(u1, u3);
    __m128i v3 = _mm_unpackhi_epi8(u1, u3);

    u0 = _mm_unpacklo_epi8(v0, v2);
    u1 = _mm_unpacklo_epi8(v1, v3);
    u2 = _mm_unpackhi_epi8(v0, v2);
    u3 = _mm_unpackhi_epi8(v1, v3);

    v0 = _mm_unpacklo_epi8(u0, u1);
    v1 = _mm_unpacklo_epi8(u2, u3);
    v2 = _mm_unpackhi_epi8(u0, u1);
    v3 = _mm_unpackhi_epi8(u2, u3);

    a.val = _mm_unpacklo_epi8(v0, v1);
    b.val = _mm_unpackhi_epi8(v0, v1);
    c.val = _mm_unpacklo_epi8(v2, v3);
    d.val = _mm_unpackhi_epi8(v2, v3);
}
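// --- Usage sketch (editor addition) ---
// Splitting 16 RGBA pixels into four planes with the 4-channel overload above;
// the helper name is hypothetical.
static inline void example_split_rgba_u8(const uchar* rgba, v_uint8x16& r,
                                         v_uint8x16& g, v_uint8x16& b, v_uint8x16& a)
{
    v_load_deinterleave(rgba, r, g, b, a); // reads 64 bytes: r g b a r g b a ...
}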
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
{
    __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
    __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));

    __m128i v2 = _mm_unpacklo_epi16(v0, v1);
    __m128i v3 = _mm_unpackhi_epi16(v0, v1);
    __m128i v4 = _mm_unpacklo_epi16(v2, v3);
    __m128i v5 = _mm_unpackhi_epi16(v2, v3);

    a.val = _mm_unpacklo_epi16(v4, v5);
    b.val = _mm_unpackhi_epi16(v4, v5);
}
2210
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
2211
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));
2212
__m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16));
2213
__m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24);
2214
__m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
2215
__m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
2217
const
__m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2218
const
__m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
2219
const
__m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2220
a0 = _mm_shuffle_epi8(a0, sh_a);
2221
b0 = _mm_shuffle_epi8(b0, sh_b);
2222
c0 = _mm_shuffle_epi8(c0, sh_c);
2228
__m128i t00 = _mm_loadu_si128((
const
__m128i*)ptr);
2229
__m128i t01 = _mm_loadu_si128((
const
__m128i*)(ptr + 8));
2230
__m128i t02 = _mm_loadu_si128((
const
__m128i*)(ptr + 16));
2232
__m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
2233
__m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
2234
__m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));
2236
__m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
2237
__m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
2238
__m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));
2240
a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
2241
b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
2242
c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
2248
__m128i u0 = _mm_loadu_si128((
const
__m128i*)ptr);
2249
__m128i u1 = _mm_loadu_si128((
const
__m128i*)(ptr + 8));
2250
__m128i u2 = _mm_loadu_si128((
const
__m128i*)(ptr + 16));
2251
__m128i u3 = _mm_loadu_si128((
const
__m128i*)(ptr + 24));
2253
__m128i v0 = _mm_unpacklo_epi16(u0, u2);
2254
__m128i v1 = _mm_unpackhi_epi16(u0, u2);
2255
__m128i v2 = _mm_unpacklo_epi16(u1, u3);
2256
__m128i v3 = _mm_unpackhi_epi16(u1, u3);
2258
u0 = _mm_unpacklo_epi16(v0, v2);
2259
u1 = _mm_unpacklo_epi16(v1, v3);
2260
u2 = _mm_unpackhi_epi16(v0, v2);
2261
u3 = _mm_unpackhi_epi16(v1, v3);
2263
a.val = _mm_unpacklo_epi16(u0, u1);
2264
b.val = _mm_unpackhi_epi16(u0, u1);
2265
c.val = _mm_unpacklo_epi16(u2, u3);
2266
d.val = _mm_unpackhi_epi16(u2, u3);
inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
{
    __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
    __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4));

    __m128i v2 = _mm_unpacklo_epi32(v0, v1);
    __m128i v3 = _mm_unpackhi_epi32(v0, v1);

    a.val = _mm_unpacklo_epi32(v2, v3);
    b.val = _mm_unpackhi_epi32(v2, v3);
}
2283
__m128i t00 = _mm_loadu_si128((
const
__m128i*)ptr);
2284
__m128i t01 = _mm_loadu_si128((
const
__m128i*)(ptr + 4));
2285
__m128i t02 = _mm_loadu_si128((
const
__m128i*)(ptr + 8));
2287
__m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
2288
__m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
2289
__m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));
2291
a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
2292
b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
2293
c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
2298
v_uint32x4
s0(_mm_loadu_si128((
const
__m128i*)ptr));
2299
v_uint32x4
s1(_mm_loadu_si128((
const
__m128i*)(ptr + 4)));
2300
v_uint32x4
s2(_mm_loadu_si128((
const
__m128i*)(ptr + 8)));
2301
v_uint32x4
s3(_mm_loadu_si128((
const
__m128i*)(ptr + 12)));
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
    __m128 u0 = _mm_loadu_ps(ptr);
    __m128 u1 = _mm_loadu_ps(ptr + 4);

    a.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0));
    b.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(3, 1, 3, 1));
}
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
{
    __m128 t0 = _mm_loadu_ps(ptr + 0);
    __m128 t1 = _mm_loadu_ps(ptr + 4);
    __m128 t2 = _mm_loadu_ps(ptr + 8);

    __m128 at12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 1, 0, 2));
    a.val = _mm_shuffle_ps(t0, at12, _MM_SHUFFLE(2, 0, 3, 0));

    __m128 bt01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 0, 0, 1));
    __m128 bt12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 2, 0, 3));
    b.val = _mm_shuffle_ps(bt01, bt12, _MM_SHUFFLE(2, 0, 2, 0));

    __m128 ct01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 1, 0, 2));
    c.val = _mm_shuffle_ps(ct01, t2, _MM_SHUFFLE(3, 0, 2, 0));
}
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b,
                                v_float32x4& c, v_float32x4& d)
{
    __m128 t0 = _mm_loadu_ps(ptr +  0);
    __m128 t1 = _mm_loadu_ps(ptr +  4);
    __m128 t2 = _mm_loadu_ps(ptr +  8);
    __m128 t3 = _mm_loadu_ps(ptr + 12);
    __m128 t02lo = _mm_unpacklo_ps(t0, t2);
    __m128 t13lo = _mm_unpacklo_ps(t1, t3);
    __m128 t02hi = _mm_unpackhi_ps(t0, t2);
    __m128 t13hi = _mm_unpackhi_ps(t1, t3);
    a.val = _mm_unpacklo_ps(t02lo, t13lo);
    b.val = _mm_unpackhi_ps(t02lo, t13lo);
    c.val = _mm_unpacklo_ps(t02hi, t13hi);
    d.val = _mm_unpackhi_ps(t02hi, t13hi);
}
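// --- Usage sketch (editor addition) ---
// The float overloads above follow the same pattern; e.g. four xyzw points can
// be split into x/y/z/w registers in one call (hypothetical helper name).
static inline void example_split_xyzw_f32(const float* xyzw, v_float32x4& x,
                                          v_float32x4& y, v_float32x4& z, v_float32x4& w)
{
    v_load_deinterleave(xyzw, x, y, z, w); // reads 16 floats: x y z w x y z w ...
}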
2350
__m128i t0 = _mm_loadu_si128((
const
__m128i*)ptr);
2351
__m128i t1 = _mm_loadu_si128((
const
__m128i*)(ptr + 2));
2359
__m128i t0 = _mm_loadu_si128((
const
__m128i*)ptr);
2360
__m128i t1 = _mm_loadu_si128((
const
__m128i*)(ptr + 2));
2361
__m128i t2 = _mm_loadu_si128((
const
__m128i*)(ptr + 4));
2363
t1 = _mm_shuffle_epi32(t1, 0x4e);
2366
b =
v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
2373
__m128i t0 = _mm_loadu_si128((
const
__m128i*)ptr);
2374
__m128i t1 = _mm_loadu_si128((
const
__m128i*)(ptr + 2));
2375
__m128i t2 = _mm_loadu_si128((
const
__m128i*)(ptr + 4));
2376
__m128i t3 = _mm_loadu_si128((
const
__m128i*)(ptr + 6));
inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                               hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_si128((__m128i*)(ptr), v0);
        _mm_stream_si128((__m128i*)(ptr + 16), v1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_si128((__m128i*)(ptr), v0);
        _mm_store_si128((__m128i*)(ptr + 16), v1);
    }
    else
    {
        _mm_storeu_si128((__m128i*)(ptr), v0);
        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    }
}
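// --- Usage sketch (editor addition) ---
// Re-interleaving two byte planes; STORE_ALIGNED / STORE_ALIGNED_NOCACHE require
// a 16-byte aligned destination, so the default unaligned mode is used here.
// The helper name is hypothetical.
static inline void example_merge_pairs_u8(uchar* interleaved,
                                          const v_uint8x16& ch0, const v_uint8x16& ch1)
{
    v_store_interleave(interleaved, ch0, ch1, hal::STORE_UNALIGNED); // writes 32 bytes
}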
2410
const
v_uint8x16& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2413
const
__m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2414
const
__m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2415
const
__m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2416
__m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
2417
__m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
2418
__m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
2420
const
__m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2421
const
__m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2422
__m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
2423
__m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
2424
__m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
2426
const
__m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
2427
const
__m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
2428
const
__m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
2430
__m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
2431
t0 = _mm_alignr_epi8(c.val, t0, 5);
2432
__m128i v0 = _mm_shuffle_epi8(t0, m0);
2434
__m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
2435
t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
2436
__m128i v1 = _mm_shuffle_epi8(t1, m1);
2438
__m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
2439
t2 = _mm_alignr_epi8(t2, a.val, 11);
2440
__m128i v2 = _mm_shuffle_epi8(t2, m2);
2442
__m128i z = _mm_setzero_si128();
2443
__m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
2444
__m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
2445
__m128i c0 = _mm_unpacklo_epi8(c.val, z);
2446
__m128i c1 = _mm_unpackhi_epi8(c.val, z);
2448
__m128i p00 = _mm_unpacklo_epi16(ab0, c0);
2449
__m128i p01 = _mm_unpackhi_epi16(ab0, c0);
2450
__m128i p02 = _mm_unpacklo_epi16(ab1, c1);
2451
__m128i p03 = _mm_unpackhi_epi16(ab1, c1);
2453
__m128i p10 = _mm_unpacklo_epi32(p00, p01);
2454
__m128i p11 = _mm_unpackhi_epi32(p00, p01);
2455
__m128i p12 = _mm_unpacklo_epi32(p02, p03);
2456
__m128i p13 = _mm_unpackhi_epi32(p02, p03);
2458
__m128i p20 = _mm_unpacklo_epi64(p10, p11);
2459
__m128i p21 = _mm_unpackhi_epi64(p10, p11);
2460
__m128i p22 = _mm_unpacklo_epi64(p12, p13);
2461
__m128i p23 = _mm_unpackhi_epi64(p12, p13);
2463
p20 = _mm_slli_si128(p20, 1);
2464
p22 = _mm_slli_si128(p22, 1);
2466
__m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
2467
__m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
2468
__m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
2469
__m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);
2471
__m128i p40 = _mm_unpacklo_epi64(p30, p31);
2472
__m128i p41 = _mm_unpackhi_epi64(p30, p31);
2473
__m128i p42 = _mm_unpacklo_epi64(p32, p33);
2474
__m128i p43 = _mm_unpackhi_epi64(p32, p33);
2476
__m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
2477
__m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
2478
__m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
2481
if( mode == hal::STORE_ALIGNED_NOCACHE )
2483
_mm_stream_si128((__m128i*)(ptr), v0);
2484
_mm_stream_si128((__m128i*)(ptr + 16), v1);
2485
_mm_stream_si128((__m128i*)(ptr + 32), v2);
2487
else
if( mode == hal::STORE_ALIGNED )
2489
_mm_store_si128((__m128i*)(ptr), v0);
2490
_mm_store_si128((__m128i*)(ptr + 16), v1);
2491
_mm_store_si128((__m128i*)(ptr + 32), v2);
2495
_mm_storeu_si128((__m128i*)(ptr), v0);
2496
_mm_storeu_si128((__m128i*)(ptr + 16), v1);
2497
_mm_storeu_si128((__m128i*)(ptr + 32), v2);
2503
hal::StoreMode mode = hal::STORE_UNALIGNED)
2509
__m128i u0 = _mm_unpacklo_epi8(a.val, c.val);
2510
__m128i u1 = _mm_unpackhi_epi8(a.val, c.val);
2511
__m128i u2 = _mm_unpacklo_epi8(b.val, d.val);
2512
__m128i u3 = _mm_unpackhi_epi8(b.val, d.val);
2514
__m128i v0 = _mm_unpacklo_epi8(u0, u2);
2515
__m128i v1 = _mm_unpackhi_epi8(u0, u2);
2516
__m128i v2 = _mm_unpacklo_epi8(u1, u3);
2517
__m128i v3 = _mm_unpackhi_epi8(u1, u3);
2519
if( mode == hal::STORE_ALIGNED_NOCACHE )
2521
_mm_stream_si128((__m128i*)(ptr), v0);
2522
_mm_stream_si128((__m128i*)(ptr + 16), v1);
2523
_mm_stream_si128((__m128i*)(ptr + 32), v2);
2524
_mm_stream_si128((__m128i*)(ptr + 48), v3);
2526
else
if( mode == hal::STORE_ALIGNED )
2528
_mm_store_si128((__m128i*)(ptr), v0);
2529
_mm_store_si128((__m128i*)(ptr + 16), v1);
2530
_mm_store_si128((__m128i*)(ptr + 32), v2);
2531
_mm_store_si128((__m128i*)(ptr + 48), v3);
2535
_mm_storeu_si128((__m128i*)(ptr), v0);
2536
_mm_storeu_si128((__m128i*)(ptr + 16), v1);
2537
_mm_storeu_si128((__m128i*)(ptr + 32), v2);
2538
_mm_storeu_si128((__m128i*)(ptr + 48), v3);
2543
hal::StoreMode mode = hal::STORE_UNALIGNED)
2545
__m128i v0 = _mm_unpacklo_epi16(a.val, b.val);
2546
__m128i v1 = _mm_unpackhi_epi16(a.val, b.val);
2548
if( mode == hal::STORE_ALIGNED_NOCACHE )
2550
_mm_stream_si128((__m128i*)(ptr), v0);
2551
_mm_stream_si128((__m128i*)(ptr + 8), v1);
2553
else
if( mode == hal::STORE_ALIGNED )
2555
_mm_store_si128((__m128i*)(ptr), v0);
2556
_mm_store_si128((__m128i*)(ptr + 8), v1);
2560
_mm_storeu_si128((__m128i*)(ptr), v0);
2561
_mm_storeu_si128((__m128i*)(ptr + 8), v1);
2567
hal::StoreMode mode = hal::STORE_UNALIGNED)
2570
const
__m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2571
const
__m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2572
const
__m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2573
__m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
2574
__m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
2575
__m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
2577
__m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
2578
__m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
2579
__m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
2581
__m128i z = _mm_setzero_si128();
2582
__m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
2583
__m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
2584
__m128i c0 = _mm_unpacklo_epi16(c.val, z);
2585
__m128i c1 = _mm_unpackhi_epi16(c.val, z);
2587
__m128i p10 = _mm_unpacklo_epi32(ab0, c0);
2588
__m128i p11 = _mm_unpackhi_epi32(ab0, c0);
2589
__m128i p12 = _mm_unpacklo_epi32(ab1, c1);
2590
__m128i p13 = _mm_unpackhi_epi32(ab1, c1);
2592
__m128i p20 = _mm_unpacklo_epi64(p10, p11);
2593
__m128i p21 = _mm_unpackhi_epi64(p10, p11);
2594
__m128i p22 = _mm_unpacklo_epi64(p12, p13);
2595
__m128i p23 = _mm_unpackhi_epi64(p12, p13);
2597
p20 = _mm_slli_si128(p20, 2);
2598
p22 = _mm_slli_si128(p22, 2);
2600
__m128i p30 = _mm_unpacklo_epi64(p20, p21);
2601
__m128i p31 = _mm_unpackhi_epi64(p20, p21);
2602
__m128i p32 = _mm_unpacklo_epi64(p22, p23);
2603
__m128i p33 = _mm_unpackhi_epi64(p22, p23);
2605
__m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
2606
__m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
2607
__m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
2609
if( mode == hal::STORE_ALIGNED_NOCACHE )
2611
_mm_stream_si128((__m128i*)(ptr), v0);
2612
_mm_stream_si128((__m128i*)(ptr + 8), v1);
2613
_mm_stream_si128((__m128i*)(ptr + 16), v2);
2615
else
if( mode == hal::STORE_ALIGNED )
2617
_mm_store_si128((__m128i*)(ptr), v0);
2618
_mm_store_si128((__m128i*)(ptr + 8), v1);
2619
_mm_store_si128((__m128i*)(ptr + 16), v2);
2623
_mm_storeu_si128((__m128i*)(ptr), v0);
2624
_mm_storeu_si128((__m128i*)(ptr + 8), v1);
2625
_mm_storeu_si128((__m128i*)(ptr + 16), v2);
2631
hal::StoreMode mode = hal::STORE_UNALIGNED)
2637
__m128i u0 = _mm_unpacklo_epi16(a.val, c.val);
2638
__m128i u1 = _mm_unpackhi_epi16(a.val, c.val);
2639
__m128i u2 = _mm_unpacklo_epi16(b.val, d.val);
2640
__m128i u3 = _mm_unpackhi_epi16(b.val, d.val);
2642
__m128i v0 = _mm_unpacklo_epi16(u0, u2);
2643
__m128i v1 = _mm_unpackhi_epi16(u0, u2);
2644
__m128i v2 = _mm_unpacklo_epi16(u1, u3);
2645
__m128i v3 = _mm_unpackhi_epi16(u1, u3);
2647
if( mode == hal::STORE_ALIGNED_NOCACHE )
2649
_mm_stream_si128((__m128i*)(ptr), v0);
2650
_mm_stream_si128((__m128i*)(ptr + 8), v1);
2651
_mm_stream_si128((__m128i*)(ptr + 16), v2);
2652
_mm_stream_si128((__m128i*)(ptr + 24), v3);
2654
else
if( mode == hal::STORE_ALIGNED )
2656
_mm_store_si128((__m128i*)(ptr), v0);
2657
_mm_store_si128((__m128i*)(ptr + 8), v1);
2658
_mm_store_si128((__m128i*)(ptr + 16), v2);
2659
_mm_store_si128((__m128i*)(ptr + 24), v3);
2663
_mm_storeu_si128((__m128i*)(ptr), v0);
2664
_mm_storeu_si128((__m128i*)(ptr + 8), v1);
2665
_mm_storeu_si128((__m128i*)(ptr + 16), v2);
2666
_mm_storeu_si128((__m128i*)(ptr + 24), v3);
2671
hal::StoreMode mode = hal::STORE_UNALIGNED)
2673
__m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
2674
__m128i v1 = _mm_unpackhi_epi32(a.val, b.val);
2676
if( mode == hal::STORE_ALIGNED_NOCACHE )
2678
_mm_stream_si128((__m128i*)(ptr), v0);
2679
_mm_stream_si128((__m128i*)(ptr + 4), v1);
2681
else
if( mode == hal::STORE_ALIGNED )
2683
_mm_store_si128((__m128i*)(ptr), v0);
2684
_mm_store_si128((__m128i*)(ptr + 4), v1);
2688
_mm_storeu_si128((__m128i*)(ptr), v0);
2689
_mm_storeu_si128((__m128i*)(ptr + 4), v1);
2694
const
v_uint32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2696
v_uint32x4
z = v_setzero_u32(), u0, u1, u2, u3;
2699
__m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
2700
__m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
2701
__m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));
2703
if( mode == hal::STORE_ALIGNED_NOCACHE )
2705
_mm_stream_si128((__m128i*)(ptr), v0);
2706
_mm_stream_si128((__m128i*)(ptr + 4), v1);
2707
_mm_stream_si128((__m128i*)(ptr + 8), v2);
2709
else
if( mode == hal::STORE_ALIGNED )
2711
_mm_store_si128((__m128i*)(ptr), v0);
2712
_mm_store_si128((__m128i*)(ptr + 4), v1);
2713
_mm_store_si128((__m128i*)(ptr + 8), v2);
2717
_mm_storeu_si128((__m128i*)(ptr), v0);
2718
_mm_storeu_si128((__m128i*)(ptr + 4), v1);
2719
_mm_storeu_si128((__m128i*)(ptr + 8), v2);
2725
hal::StoreMode mode = hal::STORE_UNALIGNED)
2730
if( mode == hal::STORE_ALIGNED_NOCACHE )
2732
_mm_stream_si128((__m128i*)(ptr), v0.val);
2733
_mm_stream_si128((__m128i*)(ptr + 4), v1.val);
2734
_mm_stream_si128((__m128i*)(ptr + 8), v2.val);
2735
_mm_stream_si128((__m128i*)(ptr + 12), v3.val);
2737
else
if( mode == hal::STORE_ALIGNED )
2739
_mm_store_si128((__m128i*)(ptr), v0.val);
2740
_mm_store_si128((__m128i*)(ptr + 4), v1.val);
2741
_mm_store_si128((__m128i*)(ptr + 8), v2.val);
2742
_mm_store_si128((__m128i*)(ptr + 12), v3.val);
2746
_mm_storeu_si128((__m128i*)(ptr), v0.val);
2747
_mm_storeu_si128((__m128i*)(ptr + 4), v1.val);
2748
_mm_storeu_si128((__m128i*)(ptr + 8), v2.val);
2749
_mm_storeu_si128((__m128i*)(ptr + 12), v3.val);
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
                               hal::StoreMode mode = hal::STORE_UNALIGNED)
{
    __m128 v0 = _mm_unpacklo_ps(a.val, b.val);
    __m128 v1 = _mm_unpackhi_ps(a.val, b.val);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm_stream_ps(ptr, v0);
        _mm_stream_ps(ptr + 4, v1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm_store_ps(ptr, v0);
        _mm_store_ps(ptr + 4, v1);
    }
    else
    {
        _mm_storeu_ps(ptr, v0);
        _mm_storeu_ps(ptr + 4, v1);
    }
}
2778
const
v_float32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2780
__m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
2781
__m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
2782
__m128 v0 = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0));
2783
__m128 u2 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1));
2784
__m128 u3 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2));
2785
__m128 v1 = _mm_shuffle_ps(u2, u3, _MM_SHUFFLE(2, 0, 2, 0));
2786
__m128 u4 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2));
2787
__m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
2788
__m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));
2790
if( mode == hal::STORE_ALIGNED_NOCACHE )
2792
_mm_stream_ps(ptr, v0);
2793
_mm_stream_ps(ptr + 4, v1);
2794
_mm_stream_ps(ptr + 8, v2);
2796
else
if( mode == hal::STORE_ALIGNED )
2798
_mm_store_ps(ptr, v0);
2799
_mm_store_ps(ptr + 4, v1);
2800
_mm_store_ps(ptr + 8, v2);
2804
_mm_storeu_ps(ptr, v0);
2805
_mm_storeu_ps(ptr + 4, v1);
2806
_mm_storeu_ps(ptr + 8, v2);
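// --- Usage sketch (editor addition) ---
// Packing three float planes back into an interleaved RGB buffer with the
// 3-channel overload above (hypothetical helper name).
static inline void example_merge_rgb_f32(float* rgb, const v_float32x4& r,
                                         const v_float32x4& g, const v_float32x4& b)
{
    v_store_interleave(rgb, r, g, b); // writes 12 floats: r g b r g b ...
}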
2812
hal::StoreMode mode = hal::STORE_UNALIGNED)
2814
__m128 u0 = _mm_unpacklo_ps(a.val, c.val);
2815
__m128 u1 = _mm_unpacklo_ps(b.val, d.val);
2816
__m128 u2 = _mm_unpackhi_ps(a.val, c.val);
2817
__m128 u3 = _mm_unpackhi_ps(b.val, d.val);
2818
__m128 v0 = _mm_unpacklo_ps(u0, u1);
2819
__m128 v2 = _mm_unpacklo_ps(u2, u3);
2820
__m128 v1 = _mm_unpackhi_ps(u0, u1);
2821
__m128 v3 = _mm_unpackhi_ps(u2, u3);
2823
if( mode == hal::STORE_ALIGNED_NOCACHE )
2825
_mm_stream_ps(ptr, v0);
2826
_mm_stream_ps(ptr + 4, v1);
2827
_mm_stream_ps(ptr + 8, v2);
2828
_mm_stream_ps(ptr + 12, v3);
2830
else
if( mode == hal::STORE_ALIGNED )
2832
_mm_store_ps(ptr, v0);
2833
_mm_store_ps(ptr + 4, v1);
2834
_mm_store_ps(ptr + 8, v2);
2835
_mm_store_ps(ptr + 12, v3);
2839
_mm_storeu_ps(ptr, v0);
2840
_mm_storeu_ps(ptr + 4, v1);
2841
_mm_storeu_ps(ptr + 8, v2);
2842
_mm_storeu_ps(ptr + 12, v3);
2847
hal::StoreMode mode = hal::STORE_UNALIGNED)
2849
__m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2850
__m128i v1 = _mm_unpackhi_epi64(a.val, b.val);
2852
if( mode == hal::STORE_ALIGNED_NOCACHE )
2854
_mm_stream_si128((__m128i*)(ptr), v0);
2855
_mm_stream_si128((__m128i*)(ptr + 2), v1);
2857
else
if( mode == hal::STORE_ALIGNED )
2859
_mm_store_si128((__m128i*)(ptr), v0);
2860
_mm_store_si128((__m128i*)(ptr + 2), v1);
2864
_mm_storeu_si128((__m128i*)(ptr), v0);
2865
_mm_storeu_si128((__m128i*)(ptr + 2), v1);
2870
const
v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2872
__m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2873
__m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
2874
__m128i v2 = _mm_unpackhi_epi64(b.val, c.val);
2876
if( mode == hal::STORE_ALIGNED_NOCACHE )
2878
_mm_stream_si128((__m128i*)(ptr), v0);
2879
_mm_stream_si128((__m128i*)(ptr + 2), v1);
2880
_mm_stream_si128((__m128i*)(ptr + 4), v2);
2882
else
if( mode == hal::STORE_ALIGNED )
2884
_mm_store_si128((__m128i*)(ptr), v0);
2885
_mm_store_si128((__m128i*)(ptr + 2), v1);
2886
_mm_store_si128((__m128i*)(ptr + 4), v2);
2890
_mm_storeu_si128((__m128i*)(ptr), v0);
2891
_mm_storeu_si128((__m128i*)(ptr + 2), v1);
2892
_mm_storeu_si128((__m128i*)(ptr + 4), v2);
2898
hal::StoreMode mode = hal::STORE_UNALIGNED)
2900
__m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2901
__m128i v1 = _mm_unpacklo_epi64(c.val, d.val);
2902
__m128i v2 = _mm_unpackhi_epi64(a.val, b.val);
2903
__m128i v3 = _mm_unpackhi_epi64(c.val, d.val);
2905
if( mode == hal::STORE_ALIGNED_NOCACHE )
2907
_mm_stream_si128((__m128i*)(ptr), v0);
2908
_mm_stream_si128((__m128i*)(ptr + 2), v1);
2909
_mm_stream_si128((__m128i*)(ptr + 4), v2);
2910
_mm_stream_si128((__m128i*)(ptr + 6), v3);
2912
else
if( mode == hal::STORE_ALIGNED )
2914
_mm_store_si128((__m128i*)(ptr), v0);
2915
_mm_store_si128((__m128i*)(ptr + 2), v1);
2916
_mm_store_si128((__m128i*)(ptr + 4), v2);
2917
_mm_store_si128((__m128i*)(ptr + 6), v3);
2921
_mm_storeu_si128((__m128i*)(ptr), v0);
2922
_mm_storeu_si128((__m128i*)(ptr + 2), v1);
2923
_mm_storeu_si128((__m128i*)(ptr + 4), v2);
2924
_mm_storeu_si128((__m128i*)(ptr + 6), v3);
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
    _Tpvec1 a1, b1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
    _Tpvec1 a1, b1, c1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
    _Tpvec1 a1, b1, c1, d1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, const _Tpvec0& d0, \
                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}
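// --- Editor's note (sketch, not upstream code) ---
// The macro above forwards signed/double lane types to the unsigned
// implementations via v_reinterpret_as_*. Assuming it is instantiated for
// v_int16x8 (as upstream OpenCV does), a caller simply writes:
static inline void example_deinterleave_s16(const short* ptr, v_int16x8& a, v_int16x8& b)
{
    v_load_deinterleave(ptr, a, b); // wrapper reinterprets to/from the ushort overload
}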
inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val)));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val, 8)));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
}

inline v_float64x2 v_cvt_f64(const v_int64x2& v)
{
    // constants encoded as doubles
    __m128i magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
    __m128i magic_i_all  = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
    __m128d magic_d_all  = _mm_castsi128_pd(magic_i_all);

#if CV_SSE4_1
    __m128i magic_i_lo   = _mm_set1_epi64x(0x4330000000000000); // 2^52
    // blend the 32 least significant bits of v with magic_i_lo
    __m128i v_lo = _mm_blend_epi16(v.val, magic_i_lo, 0xcc);
#else
    __m128i magic_i_lo   = _mm_set1_epi32(0x43300000);          // high word of 2^52
    __m128i v_lo = _mm_unpacklo_epi32(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(0, 0, 2, 0)), magic_i_lo);
#endif
    // extract the 32 most significant bits of v
    __m128i v_hi = _mm_srli_epi64(v.val, 32);
    // flip the msb of v_hi and blend in the magic exponent
    v_hi = _mm_xor_si128(v_hi, magic_i_hi32);
    // compute in double precision
    __m128d v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all);
    // (v_hi - magic_d_all) + v_lo; do not assume associativity of FP addition
    __m128d result = _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo));
    return v_float64x2(result);
}
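// --- Editor's note on the int64 -> double trick above (sketch, not upstream code) ---
// Each 64-bit lane is split into an unsigned low half biased by 2^52 and a signed
// high half biased by 2^84 (with its sign bit pre-flipped); subtracting the combined
// bias and summing the two doubles reconstructs the value exactly. Scalar equivalent:
static inline double example_cvt_s64_to_f64(int64 v)
{
    return (double)(unsigned)(v & 0xffffffff)
         + (double)(int)(v >> 32) * 4294967296.0; // 2^32
}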
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
#if defined(_MSC_VER)
    return v_int8x16(_mm_setr_epi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
                                   tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
#else
    return v_int8x16(_mm_setr_epi64(
        _mm_setr_pi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]]),
        _mm_setr_pi8(tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]])
    ));
#endif
}
3060
inline
v_int8x16
v_lut_pairs(
const
schar* tab,
const
int* idx)
3062
#if defined(_MSC_VER)
3063
return
v_int8x16(_mm_setr_epi16(*(
const
short*)(tab + idx[0]), *(
const
short*)(tab + idx[1]), *(
const
short*)(tab + idx[2]), *(
const
short*)(tab + idx[3]),
3064
*(
const
short*)(tab + idx[4]), *(
const
short*)(tab + idx[5]), *(
const
short*)(tab + idx[6]), *(
const
short*)(tab + idx[7])));
3067
_mm_setr_pi16(*(
const
short*)(tab + idx[0]), *(
const
short*)(tab + idx[1]), *(
const
short*)(tab + idx[2]), *(
const
short*)(tab + idx[3])),
3068
_mm_setr_pi16(*(
const
short*)(tab + idx[4]), *(
const
short*)(tab + idx[5]), *(
const
short*)(tab + idx[6]), *(
const
short*)(tab + idx[7]))
3072
inline
v_int8x16
v_lut_quads(
const
schar* tab,
const
int* idx)
3074
#if defined(_MSC_VER)
3075
return
v_int8x16(_mm_setr_epi32(*(
const
int*)(tab + idx[0]), *(
const
int*)(tab + idx[1]),
3076
*(
const
int*)(tab + idx[2]), *(
const
int*)(tab + idx[3])));
3079
_mm_setr_pi32(*(
const
int*)(tab + idx[0]), *(
const
int*)(tab + idx[1])),
3080
_mm_setr_pi32(*(
const
int*)(tab + idx[2]), *(
const
int*)(tab + idx[3]))
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }
3088
inline
v_int16x8
v_lut(
const
short* tab,
const
int* idx)
3090
#if defined(_MSC_VER)
3091
return
v_int16x8(_mm_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
3092
tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
3095
_mm_setr_pi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]),
3096
_mm_setr_pi16(tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])
3100
inline
v_int16x8
v_lut_pairs(
const
short* tab,
const
int* idx)
3102
#if defined(_MSC_VER)
3103
return
v_int16x8(_mm_setr_epi32(*(
const
int*)(tab + idx[0]), *(
const
int*)(tab + idx[1]),
3104
*(
const
int*)(tab + idx[2]), *(
const
int*)(tab + idx[3])));
3107
_mm_setr_pi32(*(
const
int*)(tab + idx[0]), *(
const
int*)(tab + idx[1])),
3108
_mm_setr_pi32(*(
const
int*)(tab + idx[2]), *(
const
int*)(tab + idx[3]))
3112
inline
v_int16x8
v_lut_quads(
const
short* tab,
const
int* idx)
3114
return
v_int16x8(_mm_set_epi64x(*(
const
int64_t*)(tab + idx[1]), *(
const
int64_t*)(tab + idx[0])));
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
#if defined(_MSC_VER)
    return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]],
                                    tab[idx[2]], tab[idx[3]]));
#else
    return v_int32x4(_mm_setr_epi64(
        _mm_setr_pi32(tab[idx[0]], tab[idx[1]]),
        _mm_setr_pi32(tab[idx[2]], tab[idx[3]])
    ));
#endif
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    return v_int32x4(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
    return v_int32x4(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
}
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
    return v_int64x2(_mm_set_epi64x(tab[idx[1]], tab[idx[0]]));
}
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
    return v_int64x2(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
    return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); }
inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }

inline v_float64x2 v_lut(const double* tab, const int* idx)
{
    return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
}
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_loadu_si128((const __m128i*)(tab + idx[0])))); }
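// --- Usage sketch (editor addition) ---
// Gathering four arbitrary table entries into one register with v_lut
// (hypothetical helper name; indices must stay within the table).
static inline v_float32x4 example_gather_f32(const float* table, const int* indices)
{
    return v_lut(table, indices); // table[indices[0]], ..., table[indices[3]]
}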
3170
int
CV_DECL_ALIGNED(32) idx[4];
3172
return
v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
3177
return
v_reinterpret_as_u32(v_lut((
const
int
*)tab, idxvec));
3182
int
CV_DECL_ALIGNED(32) idx[4];
3184
return
v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
3191
return
v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
3202
int
CV_DECL_ALIGNED(32) idx[4];
3204
__m128 z = _mm_setzero_ps();
3205
__m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0]));
3206
__m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2]));
3207
xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1]));
3208
xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3]));
3209
__m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23);
3210
__m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23);
3219
__m128d xy0 = _mm_loadu_pd(tab + idx[0]);
3220
__m128d xy1 = _mm_loadu_pd(tab + idx[1]);
3228
return
v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200)));
3230
__m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3231
a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0));
3232
a = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
3233
return
v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
3236
inline
v_uint8x16
v_interleave_pairs(
const
v_uint8x16& vec) {
return
v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
3240
return
v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400)));
3242
__m128i a = _mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3243
return
v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
3246
inline
v_uint8x16
v_interleave_quads(
const
v_uint8x16& vec) {
return
v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
3251
return
v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100)));
3253
__m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3254
return
v_int16x8(_mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0)));
3257
inline
v_uint16x8
v_interleave_pairs(
const
v_uint16x8& vec) {
return
v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
3261
return
v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100)));
3263
return
v_int16x8(_mm_unpacklo_epi16(vec.val, _mm_unpackhi_epi64(vec.val, vec.val)));
3266
inline
v_uint16x8
v_interleave_quads(
const
v_uint16x8& vec) {
return
v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
3270
return
v_int32x4(_mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
3272
inline
v_uint32x4
v_interleave_pairs(
const
v_uint32x4& vec) {
return
v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
3273
inline
v_float32x4
v_interleave_pairs(
const
v_float32x4& vec) {
return
v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
3278
return
v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100)));
3280
__m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF);
3281
__m128i a = _mm_srli_si128(_mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))), 1);
3282
return
v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2));
3285
inline
v_uint8x16
v_pack_triplets(
const
v_uint8x16& vec) {
return
v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
3290
return
v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100)));
3292
return
v_int16x8(_mm_srli_si128(_mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(2, 1, 0, 3)), 2));
3295
inline
v_uint16x8
v_pack_triplets(
const
v_uint16x8& vec) {
return
v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
3305
return
(uchar)_mm_extract_epi8(v.val, i);
3307
return
v_rotate_right<i>(v).get0();
3314
return
(schar)v_extract_n<i>(v_reinterpret_as_u8(v));
3320
return
(ushort)_mm_extract_epi16(v.val, i);
3326
return
(
short)v_extract_n<i>(v_reinterpret_as_u16(v));
3333
return
(uint)_mm_extract_epi32(v.val, i);
3335
return
v_rotate_right<i>(v).get0();
3342
return
(
int)v_extract_n<i>(v_reinterpret_as_u32(v));
3348
#ifdef CV__SIMD_NATIVE_mm_extract_epi64
3349
return
(uint64)_v128_extract_epi64<i>(v.val);
3351
return
v_rotate_right<i>(v).get0();
3358
return
(int64)v_extract_n<i>(v_reinterpret_as_u64(v));
3364
union
{ uint iv;
float
fv; } d;
3365
d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
3372
union
{ uint64 iv;
double
dv; } d;
3373
d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
3380
return
v_int32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
3386
return
v_uint32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
3392
return
v_float32x4(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE((
char)i,(
char)i,(
char)i,(
char)i)));
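// --- Usage sketch (editor addition) ---
// v_extract_n pulls a single lane with a compile-time index, v_broadcast_element
// splats it across the register (hypothetical helper name).
static inline v_float32x4 example_splat_lane2(const v_float32x4& v, float* out_lane)
{
    *out_lane = v_extract_n<2>(v);      // scalar copy of lane 2
    return v_broadcast_element<2>(v);   // lane 2 replicated into all four lanes
}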
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
#if CV_FP16
    return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
#else
    const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000);
    const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
    const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000));
    __m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr));
    __m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask);
    __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta);
    __m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf));

    t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e)));
    __m128i zmask = _mm_cmpeq_epi32(e, z);
    __m128i ft = v_select_si128(zmask, zt, t);
    return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign)));
#endif
}
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
#if CV_FP16
    __m128i fp16_value = _mm_cvtps_ph(v.val, 0);
    _mm_storel_epi64((__m128i*)ptr, fp16_value);
#else
    const __m128i signmask = _mm_set1_epi32(0x80000000);
    const __m128i rval = _mm_set1_epi32(0x3f000000);

    __m128i t = _mm_castps_si128(v.val);
    __m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16);
    t = _mm_andnot_si128(signmask, t);

    __m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t);
    __m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000));
    __m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00));
    __m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t);
    __m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval)));
    tt = _mm_sub_epi32(tt, rval);
    __m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1));
    __m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff));
    nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13);
    t = v_select_si128(tinymask, tt, nt);
    t = v_select_si128(finitemask, t, naninf);
    t = _mm_or_si128(t, sign);
    t = _mm_packs_epi32(t, t);
    _mm_storel_epi64((__m128i*)ptr, t);
#endif
}
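// --- Usage sketch (editor addition) ---
// Round-tripping four half-precision values through float32 registers with the
// conversions defined above (hypothetical helper name).
static inline void example_fp16_roundtrip(const float16_t* src, float16_t* dst)
{
    v_float32x4 f = v_load_expand(src); // 4 x float16 -> 4 x float32
    v_pack_store(dst, f);               // 4 x float32 -> 4 x float16
}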
inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END