#ifndef OPENCV_HAL_INTRIN_NEON_HPP
#define OPENCV_HAL_INTRIN_NEON_HPP

#include "opencv2/core/utility.hpp"

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#if defined(__aarch64__) || defined(_M_ARM64)
#define CV_SIMD128_64F 1
#else
#define CV_SIMD128_64F 0
#endif

#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
#define CV_NEON_AARCH64 1
#else
#define CV_NEON_AARCH64 0
#endif
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
{ c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); }
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
{ c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); }

#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
{ _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
{ _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
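
// Both _v128_unzip flavors above deinterleave a pair of vectors: c receives the
// even-indexed lanes of (a, b) and d the odd-indexed lanes. The AArch64 variant uses
// the dedicated vuzp1q/vuzp2q instructions, while the fallback extracts the same two
// results from the val[0]/val[1] halves of a single vuzpq call.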
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
template <typename T> static inline \
_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
template <typename T> static inline \
float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }

#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix)
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) \
OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \
OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \
OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)

#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) \
OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)

#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) \
OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix)

OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8, u8)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16, int8x8, s8)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1, f64)
typedef uchar lane_type;
enum { nlanes = 16 };

explicit v_uint8x16(uint8x16_t v) : val(v) {}
v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
           uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
return vgetq_lane_u8(val, 0);

typedef schar lane_type;
enum { nlanes = 16 };

explicit v_int8x16(int8x16_t v) : val(v) {}
v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
          schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
return vgetq_lane_s8(val, 0);

typedef ushort lane_type;
explicit v_uint16x8(uint16x8_t v) : val(v) {}
v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
return vgetq_lane_u16(val, 0);

typedef short lane_type;
explicit v_int16x8(int16x8_t v) : val(v) {}
v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
return vgetq_lane_s16(val, 0);

typedef unsigned lane_type;
explicit v_uint32x4(uint32x4_t v) : val(v) {}
v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
unsigned v[] = {v0, v1, v2, v3};
unsigned get0() const
return vgetq_lane_u32(val, 0);

typedef int lane_type;
explicit v_int32x4(int32x4_t v) : val(v) {}
v_int32x4(int v0, int v1, int v2, int v3)
int v[] = {v0, v1, v2, v3};
return vgetq_lane_s32(val, 0);

typedef float lane_type;
explicit v_float32x4(float32x4_t v) : val(v) {}
v_float32x4(float v0, float v1, float v2, float v3)
float v[] = {v0, v1, v2, v3};
return vgetq_lane_f32(val, 0);

typedef uint64 lane_type;
explicit v_uint64x2(uint64x2_t v) : val(v) {}
v_uint64x2(uint64 v0, uint64 v1)
uint64 v[] = {v0, v1};
return vgetq_lane_u64(val, 0);

typedef int64 lane_type;
explicit v_int64x2(int64x2_t v) : val(v) {}
v_int64x2(int64 v0, int64 v1)
int64 v[] = {v0, v1};
return vgetq_lane_s64(val, 0);

typedef double lane_type;
explicit v_float64x2(float64x2_t v) : val(v) {}
v_float64x2(double v0, double v1)
double v[] = {v0, v1};
return vgetq_lane_f64(val, 0);
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }

OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
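
// Illustrative use of the generated initializers and casts (a sketch, not from the
// original header; variable names are made up):
//   v_uint8x16 ones = v_setall_u8((uchar)1);      // broadcast 1 into all 16 lanes
//   v_uint8x16 zero = v_setzero_u8();             // all lanes zero
//   v_int8x16  s    = v_reinterpret_as_s8(ones);  // same bits, viewed as signed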
#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
hreg a1 = mov(a.val), b1 = mov(b.val); \
return _Tpvec(vcombine_##suffix(a1, b1)); \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
hreg a1 = mov(a.val); \
vst1_##suffix(ptr, a1); \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
hreg a1 = rshr(a.val, n); \
hreg b1 = rshr(b.val, n); \
return _Tpvec(vcombine_##suffix(a1, b1)); \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
hreg a1 = rshr(a.val, n); \
vst1_##suffix(ptr, a1); \

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)

OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
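
// For each (_Tpwvec -> _Tpvec) pair instantiated above the macro emits four helpers,
// e.g. for u16 -> u8: v_pack(a, b) saturates and narrows two v_uint16x8 into one
// v_uint8x16, v_pack_store(ptr, a) stores the narrowed half-vector, and the
// v_rshr_pack<n>/v_rshr_pack_store<n> templates apply a rounding right shift by n
// (vqrshrn_n_u16) before narrowing.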
uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val));

uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val));
uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val));
return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd)));

uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val));
uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val));
uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val));
uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val));

uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd));
uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh));
return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh)));
float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
res = vmlaq_lane_f32(res, m1.val, vl, 1);
res = vmlaq_lane_f32(res, m2.val, vh, 0);
res = vmlaq_lane_f32(res, m3.val, vh, 1);

float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
res = vmlaq_lane_f32(res, m1.val, vl, 1);
res = vmlaq_lane_f32(res, m2.val, vh, 0);
res = vaddq_f32(res, a.val);
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
return _Tpvec(intrin(a.val, b.val)); \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
a.val = intrin(a.val, b.val); \

OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
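
// Note: the 8- and 16-bit integer operators above are bound to the saturating
// vqaddq/vqsubq intrinsics, so for example v_setall_u8((uchar)250) + v_setall_u8((uchar)10)
// yields 255 in every lane instead of wrapping; the wrapping forms are provided
// separately below as v_add_wrap/v_sub_wrap/v_mul_wrap.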
float32x4_t reciprocal = vrecpeq_f32(b.val);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);

float32x4_t reciprocal = vrecpeq_f32(b.val);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
a.val = vmulq_f32(a.val, reciprocal);

#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }

c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
d.val = vmull_high_s8(a.val, b.val);
d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));

c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
d.val = vmull_high_u8(a.val, b.val);
d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));

c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
d.val = vmull_high_s16(a.val, b.val);
d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));

c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
d.val = vmull_high_u16(a.val, b.val);
d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));

c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
d.val = vmull_high_u32(a.val, b.val);
d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));

vshrn_n_s32(vmull_s16( vget_low_s16(a.val), vget_low_s16(b.val)), 16),
vmull_high_s16(a.val, b.val)
vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val))

vshrn_n_u32(vmull_u16( vget_low_u16(a.val), vget_low_u16(b.val)), 16),
vmull_high_u16(a.val, b.val)
vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val))
int16x8_t uzp1, uzp2;
_v128_unzip(a.val, b.val, uzp1, uzp2);
int16x4_t a0 = vget_low_s16(uzp1);
int16x4_t b0 = vget_high_s16(uzp1);
int16x4_t a1 = vget_low_s16(uzp2);
int16x4_t b1 = vget_high_s16(uzp2);
int32x4_t p = vmull_s16(a0, b0);

int16x8_t uzp1, uzp2;
_v128_unzip(a.val, b.val, uzp1, uzp2);
int16x4_t a0 = vget_low_s16(uzp1);
int16x4_t b0 = vget_high_s16(uzp1);
int16x4_t a1 = vget_low_s16(uzp2);
int16x4_t b1 = vget_high_s16(uzp2);
int32x4_t p = vmlal_s16(c.val, a0, b0);

int32x4_t uzp1, uzp2;
_v128_unzip(a.val, b.val, uzp1, uzp2);
int32x2_t a0 = vget_low_s32(uzp1);
int32x2_t b0 = vget_high_s32(uzp1);
int32x2_t a1 = vget_low_s32(uzp2);
int32x2_t b1 = vget_high_s32(uzp2);
int64x2_t p = vmull_s32(a0, b0);

int32x4_t uzp1, uzp2;
_v128_unzip(a.val, b.val, uzp1, uzp2);
int32x2_t a0 = vget_low_s32(uzp1);
int32x2_t b0 = vget_high_s32(uzp1);
int32x2_t a1 = vget_low_s32(uzp2);
int32x2_t b1 = vget_high_s32(uzp2);
int64x2_t p = vmlal_s32(c.val, a0, b0);
return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));

const uint8x16_t zero   = vreinterpretq_u8_u32(vdupq_n_u32(0));
const uint8x16_t mask   = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));

uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)),
                            vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero)));
uint16x8_t odd  = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8),
                            vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8));

uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)),
                          vreinterpretq_u32_u16(vbslq_u16(mask32, odd, zero32)));
uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
                          vshrq_n_u32(vreinterpretq_u32_u16(odd), 16));

return v_uint32x4(vdotq_u32(c.val, a.val, b.val));

return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));

int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
int16x8_t uzp1, uzp2;
_v128_unzip(p0, p1, uzp1, uzp2);
int16x8_t sum = vaddq_s16(uzp1, uzp2);
int16x4_t uzpl1, uzpl2;
_v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
return v_int32x4(vaddl_s16(uzpl1, uzpl2));

return v_int32x4(vdotq_s32(c.val, a.val, b.val));

const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0));
const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));

uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)),
                            vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero)));
uint32x4_t odd  = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16),
                            vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16));
uint32x4_t uzp1, uzp2;
_v128_unzip(even, odd, uzp1, uzp2);
uint64x2_t s0 = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1));
uint64x2_t s1 = vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2));

int32x4_t p0 = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
int32x4_t p1 = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));

int32x4_t uzp1, uzp2;
_v128_unzip(p0, p1, uzp1, uzp2);
int32x4_t sum = vaddq_s32(uzp1, uzp2);

int32x2_t uzpl1, uzpl2;
_v128_unzip(vget_low_s32(sum), vget_high_s32(sum), uzpl1, uzpl2);
return v_int64x2(vaddl_s32(uzpl1, uzpl2));
int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
return v_int32x4(vmlal_high_s16(p, a.val, b.val));

int16x4_t a0 = vget_low_s16(a.val);
int16x4_t a1 = vget_high_s16(a.val);
int16x4_t b0 = vget_low_s16(b.val);
int16x4_t b1 = vget_high_s16(b.val);
int32x4_t p = vmull_s16(a0, b0);

int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
return v_int32x4(vmlal_high_s16(p, a.val, b.val));

int16x4_t a0 = vget_low_s16(a.val);
int16x4_t a1 = vget_high_s16(a.val);
int16x4_t b0 = vget_low_s16(b.val);
int16x4_t b1 = vget_high_s16(b.val);
int32x4_t p = vmlal_s16(c.val, a0, b0);

int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
return v_int64x2(vmlal_high_s32(p, a.val, b.val));

int32x2_t a0 = vget_low_s32(a.val);
int32x2_t a1 = vget_high_s32(a.val);
int32x2_t b0 = vget_low_s32(b.val);
int32x2_t b1 = vget_high_s32(b.val);
int64x2_t p = vmull_s32(a0, b0);

int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
return v_int64x2(vmlal_high_s32(p, a.val, b.val));

int32x2_t a0 = vget_low_s32(a.val);
int32x2_t a1 = vget_high_s32(a.val);
int32x2_t b0 = vget_low_s32(b.val);
int32x2_t b1 = vget_high_s32(b.val);
int64x2_t p = vmlal_s32(c.val, a0, b0);
return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));

uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));

return v_uint32x4(vdotq_u32(c.val, a.val, b.val));

return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));

int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));

return v_int32x4(vdotq_s32(c.val, a.val, b.val));

uint32x4_t p0 = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
uint32x4_t p1 = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
uint64x2_t s0 = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0));
uint64x2_t s1 = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1));

int32x4_t prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
prod = vmlal_s16(prod, vget_high_s16(a.val), vget_high_s16(b.val));
return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \

OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \

OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)

return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));

return one / v_sqrt(x);

float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
float32x4_t e = vrsqrteq_f32(x1);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);

float32x4_t e = vrsqrteq_f32(x.val);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);

#define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }

#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \

OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)

return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));

return one / v_sqrt(x);
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
return _Tpvec(intrin(a.val, b.val)); \

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
inline int64x2_t vmvnq_s64(int64x2_t a)
int64x2_t vx = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF));
return veorq_s64(a, vx);

inline uint64x2_t vmvnq_u64(uint64x2_t a)
uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
return veorq_u64(a, vx);
#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }

OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
{ return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)

{ return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); }
{ return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); }

#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
return _Tpvec2(cast(intrin(a.val, b.val))); \

v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));

return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));

return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
return v_float32x4(vmlaq_f32(c.val, a.val, b.val));

return v_int32x4(vmlaq_s32(c.val, a.val, b.val));

return v_fma(a, b, c);
return v_fma(a, b, c);

v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));

return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));

return v_float64x2(vfmaq_f64(c.val, a.val, b.val));

return v_fma(a, b, c);
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
1191
inline _Tpvec operator << (const _Tpvec& a, int n) \
1192
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
1193
inline _Tpvec operator >> (const _Tpvec& a, int n) \
1194
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
1195
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1196
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
1197
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1198
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
1199
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
1200
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
1202OPENCV_HAL_IMPL_NEON_SHIFT_OP(
v_uint8x16, u8, schar, s8)
1203OPENCV_HAL_IMPL_NEON_SHIFT_OP(
v_int8x16, s8, schar, s8)
1204OPENCV_HAL_IMPL_NEON_SHIFT_OP(
v_uint16x8, u16,
short, s16)
1205OPENCV_HAL_IMPL_NEON_SHIFT_OP(
v_int16x8, s16,
short, s16)
1206OPENCV_HAL_IMPL_NEON_SHIFT_OP(
v_uint32x4, u32,
int, s32)
1207OPENCV_HAL_IMPL_NEON_SHIFT_OP(
v_int32x4, s32,
int, s32)
1208OPENCV_HAL_IMPL_NEON_SHIFT_OP(
v_uint64x2, u64, int64, s64)
1209OPENCV_HAL_IMPL_NEON_SHIFT_OP(
v_int64x2, s64, int64, s64)
#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }

OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
#if defined(__clang__) && defined(__aarch64__)
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
uint64 v = *(unaligned_uint64*)ptr; \
return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \

#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }

OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
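
// Minimal load/store sketch using the helpers generated above (pointer names are
// illustrative only):
//   v_float32x4 x = v_load(src_ptr);   // 16-byte load via vld1q_f32
//   v_store(dst_ptr, x);               // full-vector store via vst1q_f32
//   v_store_low(dst2_ptr, x);          // store only the two low lanes (vget_low_f32)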
uint16_t t0 = vaddlvq_u8(a.val);

uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);

int16_t t0 = vaddlvq_s8(a.val);

int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
return vget_lane_s32(vpadd_s32(t1, t1), 0);

uint32_t t0 = vaddlvq_u16(a.val);

uint32x4_t t0 = vpaddlq_u16(a.val);
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);

int32_t t0 = vaddlvq_s16(a.val);

int32x4_t t0 = vpaddlq_s16(a.val);
int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
return vget_lane_s32(vpadd_s32(t1, t1), 0);
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
return v##vectorfunc##vq_##suffix(a.val); \

#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
_Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
a0 = vp##vectorfunc##_##suffix(a0, a0); \
a0 = vp##vectorfunc##_##suffix(a0, a0); \
return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \

OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, max, max, s8)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8)

#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
return v##vectorfunc##vq_##suffix(a.val); \

#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
_Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
a0 = vp##vectorfunc##_##suffix(a0, a0); \
return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \

OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
return v##vectorfunc##vq_##suffix(a.val); \

#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
_Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \

OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
return vaddvq_u64(a.val);
return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0);

return vaddvq_s64(a.val);
return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0);

return vaddvq_f64(a.val);

float32x4_t ab = vpaddq_f32(a.val, b.val);
float32x4_t cd = vpaddq_f32(c.val, d.val);

float32x4x2_t ab = vtrnq_f32(a.val, b.val);
float32x4x2_t cd = vtrnq_f32(c.val, d.val);

float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]);
float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]);

float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1));
float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));
uint8x16_t t0 = vabdq_u8(a.val, b.val);
uint16_t t1 = vaddlvq_u8(t0);

uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);

uint8x16_t t0 = vreinterpretq_u8_s8(vabdq_s8(a.val, b.val));
uint16_t t1 = vaddlvq_u8(t0);

uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);

uint16x8_t t0 = vabdq_u16(a.val, b.val);
uint32_t t1 = vaddlvq_u16(t0);

uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);

uint16x8_t t0 = vreinterpretq_u16_s16(vabdq_s16(a.val, b.val));
uint32_t t1 = vaddlvq_u16(t0);

uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);

uint32x4_t t0 = vabdq_u32(a.val, b.val);
uint32_t t1 = vaddvq_u32(t0);

uint32x4_t t0 = vabdq_u32(a.val, b.val);
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);

uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
uint32_t t1 = vaddvq_u32(t0);

uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);

float32x4_t t0 = vabdq_f32(a.val, b.val);
return vaddvq_f32(t0);

float32x4_t t0 = vabdq_f32(a.val, b.val);
float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
return vget_lane_f32(vpadd_f32(t1, t1), 0);
{ return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }

const int8x16_t signPosition = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7};
const uint8x16_t byteOrder = {0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15};
uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), signPosition);
uint8x16_t v1 = vqtbl1q_u8(v0, byteOrder);
uint32_t t0 = vaddlvq_u16(vreinterpretq_u16_u8(v1));

int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);

{ return v_signmask(v_reinterpret_as_u8(a)); }

const int16x8_t signPosition = {0,1,2,3,4,5,6,7};
uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), signPosition);
uint32_t t0 = vaddlvq_u16(v0);

int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);

{ return v_signmask(v_reinterpret_as_u16(a)); }
const int32x4_t signPosition = {0,1,2,3};
uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), signPosition);
uint32_t t0 = vaddvq_u32(v0);

int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
uint64x2_t v1 = vpaddlq_u32(v0);
return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);

{ return v_signmask(v_reinterpret_as_u32(a)); }
{ return v_signmask(v_reinterpret_as_u32(a)); }

const int64x2_t signPosition = {0,1};
uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition);
uint64_t t0 = vaddvq_u64(v0);

int64x1_t m0 = vdup_n_s64(0);
uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);

{ return v_signmask(v_reinterpret_as_u64(a)); }
{ return v_signmask(v_reinterpret_as_u64(a)); }
#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
return (vminvq_##suffix(a.val) >> shift) != 0; \
inline bool v_check_any(const v_##_Tpvec& a) \
return (vmaxvq_##suffix(a.val) >> shift) != 0; \

#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
_Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
inline bool v_check_any(const v_##_Tpvec& a) \
_Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \

OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)

uint64x2_t v0 = vshrq_n_u64(a.val, 63);
return (vgetq_lane_u64(v0, 0) & vgetq_lane_u64(v0, 1)) == 1;

uint64x2_t v0 = vshrq_n_u64(a.val, 63);
return (vgetq_lane_u64(v0, 0) | vgetq_lane_u64(v0, 1)) != 0;
#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \

OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
b1.val = vmovl_high_##suffix(a.val); \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
return _Tpwvec(vmovl_high_##suffix(a.val)); \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \

#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
uint8x8_t v0 = vcreate_u8(*(unaligned_uint*)ptr);
uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));

typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
int8x8_t v0 = vcreate_s8(*(unaligned_uint*)ptr);
int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
#if defined(__aarch64__) || defined(_M_ARM64)
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
b0.val = vzip1q_##suffix(a0.val, a1.val); \
b1.val = vzip2q_##suffix(a0.val, a1.val); \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \

#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
_Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
b0.val = p.val[0]; \
b1.val = p.val[1]; \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \

OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
uint8x16_t vec = vrev64q_u8(a.val);
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

uint16x8_t vec = vrev64q_u16(a.val);
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

uint32x4_t vec = vrev64q_u32(a.val);
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

uint64x2_t vec = a.val;
uint64x1_t vec_lo = vget_low_u64(vec);
uint64x1_t vec_hi = vget_high_u64(vec);
return v_uint64x2(vcombine_u64(vec_hi, vec_lo));

{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \

OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)

#define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }

OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64)

#define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) \
template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { _Tp t = v_extract_n<i>(v); return v_setall_##suffix(t); }

OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
float32x4_t a_ = a.val;
__asm__ ("fcvtns %0.4s, %1.4s"

static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
    v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));

int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));

int32x4_t a1 = vcvtq_s32_f32(a.val);
uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));

int32x4_t a1 = vcvtq_s32_f32(a.val);
uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));

{ return v_int32x4(vcvtq_s32_f32(a.val)); }
static const int32x2_t zero = vdup_n_s32(0);
return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));

return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val))));

static const int32x2_t zero = vdup_n_s32(0);
int64x2_t a1 = vcvtq_s64_f64(a.val);
uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));

static const int32x2_t zero = vdup_n_s32(0);
int64x2_t a1 = vcvtq_s64_f64(a.val);
uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));

static const int32x2_t zero = vdup_n_s32(0);
return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
_Tpvec##_t t0 = vreinterpretq_##suffix##32_##suffix##64( \
    vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
                        vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
_Tpvec##_t t1 = vreinterpretq_##suffix##32_##suffix##64( \
    vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
                        vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
_Tpvec##_t t2 = vreinterpretq_##suffix##32_##suffix##64( \
    vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
                        vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
_Tpvec##_t t3 = vreinterpretq_##suffix##32_##suffix##64( \
    vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
                        vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
b0.val = vtrn1q_##suffix##32(t0, t1); \
b1.val = vtrn2q_##suffix##32(t0, t1); \
b2.val = vtrn1q_##suffix##32(t2, t3); \
b3.val = vtrn2q_##suffix##32(t2, t3); \

OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f)
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
_Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
_Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \

OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
_Tpvec##x2_t v = vld2q_##suffix(ptr); \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
_Tpvec##x3_t v = vld3q_##suffix(ptr); \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
_Tpvec##x4_t v = vld4q_##suffix(ptr); \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                hal::StoreMode =hal::STORE_UNALIGNED) \
vst2q_##suffix(ptr, v); \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, hal::StoreMode =hal::STORE_UNALIGNED) \
vst3q_##suffix(ptr, v); \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
                                hal::StoreMode =hal::STORE_UNALIGNED ) \
vst4q_##suffix(ptr, v); \
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
tp##x1_t a0 = vld1_##suffix(ptr); \
tp##x1_t b0 = vld1_##suffix(ptr + 1); \
tp##x1_t a1 = vld1_##suffix(ptr + 2); \
tp##x1_t b1 = vld1_##suffix(ptr + 3); \
a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
                                 v_##tp##x2& b, v_##tp##x2& c ) \
tp##x1_t a0 = vld1_##suffix(ptr); \
tp##x1_t b0 = vld1_##suffix(ptr + 1); \
tp##x1_t c0 = vld1_##suffix(ptr + 2); \
tp##x1_t a1 = vld1_##suffix(ptr + 3); \
tp##x1_t b1 = vld1_##suffix(ptr + 4); \
tp##x1_t c1 = vld1_##suffix(ptr + 5); \
a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
                                 v_##tp##x2& c, v_##tp##x2& d ) \
tp##x1_t a0 = vld1_##suffix(ptr); \
tp##x1_t b0 = vld1_##suffix(ptr + 1); \
tp##x1_t c0 = vld1_##suffix(ptr + 2); \
tp##x1_t d0 = vld1_##suffix(ptr + 3); \
tp##x1_t a1 = vld1_##suffix(ptr + 4); \
tp##x1_t b1 = vld1_##suffix(ptr + 5); \
tp##x1_t c1 = vld1_##suffix(ptr + 6); \
tp##x1_t d1 = vld1_##suffix(ptr + 7); \
a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
                                hal::StoreMode =hal::STORE_UNALIGNED) \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
                                const v_##tp##x2& b, const v_##tp##x2& c, \
                                hal::StoreMode =hal::STORE_UNALIGNED) \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
                                const v_##tp##x2& c, const v_##tp##x2& d, \
                                hal::StoreMode =hal::STORE_UNALIGNED) \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
#endif

OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
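
// Usage sketch (illustrative helper, not part of the API): swap the R and B
// channels of packed 8-bit BGR pixels with the interleaved load/store wrappers
// generated above; both buffers hold 3*npixels bytes.
inline void example_swap_rb(const uchar* src, uchar* dst, int npixels)
{
    int i = 0;
    for( ; i <= npixels - 16; i += 16 )
    {
        v_uint8x16 b, g, r;
        v_load_deinterleave(src + 3*i, b, g, r); // 48 interleaved bytes -> three planes
        v_store_interleave(dst + 3*i, r, g, b);  // store with R and B exchanged
    }
    for( ; i < npixels; i++ )
    {
        dst[3*i] = src[3*i+2]; dst[3*i+1] = src[3*i+1]; dst[3*i+2] = src[3*i];
    }
}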
#if CV_SIMD128_64F
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    float32x2_t zero = vdup_n_f32(0.0f);
    return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
}
#endif // CV_SIMD128_64F
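
#if CV_SIMD128_64F
// Usage sketch (illustrative helper, not part of the API): widen four floats to
// two double registers, then repack them into a single float register with the
// converters above.
inline v_float32x4 example_f64_roundtrip(const v_float32x4& a)
{
    v_float64x2 lo = v_cvt_f64(a);      // lanes 0,1 as double
    v_float64x2 hi = v_cvt_f64_high(a); // lanes 2,3 as double
    return v_cvt_f32(lo, hi);           // repack into one float register
}
#endif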
////////////// Lookup table access ////////////////////

inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]],
        tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
        tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]],
        tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]
    };
    return v_int8x16(vld1q_s8(elems));
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1],
        tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1],
        tab[idx[4]], tab[idx[4]+1], tab[idx[5]], tab[idx[5]+1],
        tab[idx[6]], tab[idx[6]+1], tab[idx[7]], tab[idx[7]+1]
    };
    return v_int8x16(vld1q_s8(elems));
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3],
        tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3],
        tab[idx[2]], tab[idx[2]+1], tab[idx[2]+2], tab[idx[2]+3],
        tab[idx[3]], tab[idx[3]+1], tab[idx[3]+2], tab[idx[3]+3]
    };
    return v_int8x16(vld1q_s8(elems));
}
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
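
// Usage sketch (illustrative helper, not part of the API): gather 16 bytes from a
// table. v_lut consumes 16 independent indices, while v_lut_quads consumes only
// idx[0..3] and fetches four consecutive bytes per index.
inline void example_gather_u8(const uchar* tab, const int* idx, uchar* dst)
{
    v_uint8x16 one_each  = v_lut(tab, idx);        // 16 single-element fetches
    v_uint8x16 four_each = v_lut_quads(tab, idx);  // 4 fetches of 4 consecutive bytes
    v_store(dst, one_each);
    v_store(dst + 16, four_each);
}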
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =
    {
        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
        tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]
    };
    return v_int16x8(vld1q_s16(elems));
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =
    {
        tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1],
        tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1]
    };
    return v_int16x8(vld1q_s16(elems));
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1])));
}
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
    int CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]
    };
    return v_int32x4(vld1q_s32(elems));
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1])));
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
    return v_int32x4(vld1q_s32(tab + idx[0]));
}
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
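
// Usage sketch (illustrative helper, not part of the API): for 4-lane vectors the
// three flavours consume a different number of indices. v_lut uses idx[0..3],
// v_lut_pairs uses idx[0..1] (two consecutive elements each), and v_lut_quads
// uses idx[0] only (four consecutive elements).
inline void example_gather_s32(const int* tab, const int* idx, int* dst)
{
    v_store(dst,     v_lut(tab, idx));        // { tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] }
    v_store(dst + 4, v_lut_pairs(tab, idx));  // { tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1] }
    v_store(dst + 8, v_lut_quads(tab, idx));  // { tab[idx[0]], ..., tab[idx[0]+3] }
}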
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
    return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]])));
}
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
    return v_int64x2(vld1q_s64(tab + idx[0]));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]
    };
    return v_float32x4(vld1q_f32(elems));
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
    typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64;

    uint64 CV_DECL_ALIGNED(32) elems[2] =
    {
        *(unaligned_uint64*)(tab + idx[0]),
        *(unaligned_uint64*)(tab + idx[1])
    };
    return v_float32x4(vreinterpretq_f32_u64(vld1q_u64(elems)));
}
inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
{
    return v_float32x4(vld1q_f32(tab + idx[0]));
}
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)],
        tab[vgetq_lane_s32(idxvec.val, 2)],
        tab[vgetq_lane_s32(idxvec.val, 3)]
    };
    return v_int32x4(vld1q_s32(elems));
}

inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    unsigned CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)],
        tab[vgetq_lane_s32(idxvec.val, 2)],
        tab[vgetq_lane_s32(idxvec.val, 3)]
    };
    return v_uint32x4(vld1q_u32(elems));
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)],
        tab[vgetq_lane_s32(idxvec.val, 2)],
        tab[vgetq_lane_s32(idxvec.val, 3)]
    };
    return v_float32x4(vld1q_f32(elems));
}

inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
    y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
}
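
// Usage sketch (illustrative helper, not part of the API): gathers driven by a
// register of indices, as used by remap-style kernels. v_lut_deinterleave splits
// interleaved (even, odd) element pairs into two registers.
inline void example_gather_f32(const float* tab, const v_int32x4& idxvec,
                               v_float32x4& v, v_float32x4& even, v_float32x4& odd)
{
    v = v_lut(tab, idxvec);                     // tab[idx[k]] for k = 0..3
    v_lut_deinterleave(tab, idxvec, even, odd); // even = tab[idx[k]], odd = tab[idx[k]+1]
}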
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    // per 8-byte half: {a0 a1 a2 a3 a4 a5 a6 a7} -> {a0 a2 a1 a3 a4 a6 a5 a7}
    return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200))));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }

inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    // per 8-byte half: {a0 a1 a2 a3 a4 a5 a6 a7} -> {a0 a4 a1 a5 a2 a6 a3 a7}
    return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400))));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    // per 4-lane half: {a0 a1 a2 a3} -> {a0 a2 a1 a3}
    return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)))));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }

inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val));
    return v_int16x8(vcombine_s16(res.val[0], res.val[1]));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }

inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val));
    return v_int32x4(vcombine_s32(res.val[0], res.val[1]));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
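
// Usage sketch (illustrative helper, not part of the API): lane permutations
// performed by the helpers above. For a v_int32x4 {a0,a1,a2,a3},
// v_interleave_pairs yields {a0,a2,a1,a3}; for a v_int16x8 {a0..a7} it yields
// {a0,a2,a1,a3, a4,a6,a5,a7}.
inline v_int32x4 example_interleave_pairs_s32(const int* ptr)
{
    return v_interleave_pairs(v_load(ptr)); // load 4 ints, swap the middle pair
}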
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    // keep lanes {0,1,2, 4,5,6, 8,9,10, 12,13,14}; the last four lanes are unspecified
    return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    // keep lanes {0,1,2, 4,5,6}; the last two lanes are unspecified
    return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2)));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
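
// Usage sketch (illustrative helper, not part of the API): v_pack_triplets drops
// every fourth lane, e.g. compacting RGBX-expanded bytes {r,g,b,x, r,g,b,x, ...}
// into {r,g,b, r,g,b, ...}; only the first 12 lanes of the result are meaningful.
inline void example_rgbx_to_rgb(const schar* src, schar* dst)
{
    schar buf[16];
    v_store(buf, v_pack_triplets(v_load(src)));
    for( int i = 0; i < 12; i++ ) dst[i] = buf[i];
}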
#if CV_SIMD128_64F
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        tab[idx[0]],
        tab[idx[1]]
    };
    return v_float64x2(vld1q_f64(elems));
}

inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
{
    return v_float64x2(vld1q_f64(tab + idx[0]));
}

inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        tab[vgetq_lane_s32(idxvec.val, 0)],
        tab[vgetq_lane_s32(idxvec.val, 1)]
    };
    return v_float64x2(vld1q_f64(elems));
}

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    x = v_float64x2(tab[idx[0]], tab[idx[1]]);
    y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
#endif // CV_SIMD128_64F
////// FP16 support ///////
#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    float16x4_t v =
    #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
        (float16x4_t)vld1_s16((const short*)ptr);
    #else
        vld1_f16((const __fp16*)ptr);
    #endif
    return v_float32x4(vcvt_f32_f16(v));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    float16x4_t hv = vcvt_f16_f32(v.val);

    #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
        vst1_s16((short*)ptr, (int16x4_t)hv);
    #else
        vst1_f16((__fp16*)ptr, hv);
    #endif
}
#else
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    const int N = 4;
    float buf[N];
    for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
    return v_load(buf);
}
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    const int N = 4;
    float buf[N];
    v_store(buf, v);
    for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
}
#endif
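
// Usage sketch (illustrative helper, not part of the API): both the CV_FP16 and
// the scalar fallback paths above expose the same interface, so half-precision
// buffers can be processed uniformly.
inline void example_scale_f16(const float16_t* src, float16_t* dst, float scale)
{
    v_float32x4 v = v_load_expand(src);          // four float16 values widened to float
    v_pack_store(dst, v * v_setall_f32(scale));  // scale and round back to float16
}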
inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END