#ifndef OPENCV_HAL_INTRIN_MSA_HPP
#define OPENCV_HAL_INTRIN_MSA_HPP

#include "opencv2/core/utility.hpp"

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128_64F 1
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(v16u8 v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = msa_ld1q_u8(v);
    }

    uchar get0() const
    {
        return msa_getq_lane_u8(val, 0);
    }

    v16u8 val;
};
struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(v16i8 v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = msa_ld1q_s8(v);
    }

    schar get0() const
    {
        return msa_getq_lane_s8(val, 0);
    }

    v16i8 val;
};
struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(v8u16 v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = msa_ld1q_u16(v);
    }

    ushort get0() const
    {
        return msa_getq_lane_u16(val, 0);
    }

    v8u16 val;
};
struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(v8i16 v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = msa_ld1q_s16(v);
    }

    short get0() const
    {
        return msa_getq_lane_s16(val, 0);
    }

    v8i16 val;
};
struct v_uint32x4
{
    typedef unsigned int lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(v4u32 v) : val(v) {}
    v_uint32x4(unsigned int v0, unsigned int v1, unsigned int v2, unsigned int v3)
    {
        unsigned int v[] = {v0, v1, v2, v3};
        val = msa_ld1q_u32(v);
    }

    unsigned int get0() const
    {
        return msa_getq_lane_u32(val, 0);
    }

    v4u32 val;
};
struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(v4i32 v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = msa_ld1q_s32(v);
    }

    int get0() const
    {
        return msa_getq_lane_s32(val, 0);
    }

    v4i32 val;
};
struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(v4f32 v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = msa_ld1q_f32(v);
    }

    float get0() const
    {
        return msa_getq_lane_f32(val, 0);
    }

    v4f32 val;
};
struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(v2u64 v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = msa_ld1q_u64(v);
    }

    uint64 get0() const
    {
        return msa_getq_lane_u64(val, 0);
    }

    v2u64 val;
};
struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(v2i64 v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = msa_ld1q_s64(v);
    }

    int64 get0() const
    {
        return msa_getq_lane_s64(val, 0);
    }

    v2i64 val;
};
struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(v2f64 v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = msa_ld1q_f64(v);
    }

    double get0() const
    {
        return msa_getq_lane_f64(val, 0);
    }

    v2f64 val;
};
#define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(MSA_TPV_REINTERPRET(v8i16, v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(MSA_TPV_REINTERPRET(v4u32, v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(MSA_TPV_REINTERPRET(v4i32, v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(MSA_TPV_REINTERPRET(v2u64, v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(MSA_TPV_REINTERPRET(v2i64, v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, v.val)); } \
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, v.val)); }

OPENCV_HAL_IMPL_MSA_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_MSA_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_MSA_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_MSA_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_MSA_INIT(uint32x4, unsigned int, u32)
OPENCV_HAL_IMPL_MSA_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_MSA_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_MSA_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_MSA_INIT(float32x4, float, f32)
OPENCV_HAL_IMPL_MSA_INIT(float64x2, double, f64)
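// Usage sketch (illustrative, not part of the original header): broadcasting
// a scalar and reinterpreting lanes with the initializers generated above.
//
//   v_uint8x16 ones = v_setall_u8(1);            // 16 lanes, each = 1
//   v_int8x16  s    = v_reinterpret_as_s8(ones); // same bits, signed view
//   v_uint8x16 zero = v_setzero_u8();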
#define OPENCV_HAL_IMPL_MSA_PACK(_Tpvec, _Tpwvec, pack, mov, rshr) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    return _Tpvec(mov(a.val, b.val)); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    return _Tpvec(rshr(a.val, b.val, n)); \
}

#define OPENCV_HAL_IMPL_MSA_PACK_STORE(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = mov(a.val); \
    msa_st1_##suffix(ptr, a1); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = rshr(a.val, n); \
    msa_st1_##suffix(ptr, a1); \
}

OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_uint16x8, pack, msa_qmovn_u16, msa_qrshrn_n_u16)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int8x16, schar, v8i8, s8, v_int16x8, pack, msa_qmovn_s16, msa_qrshrn_n_s16)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_uint32x4, pack, msa_qmovn_u32, msa_qrshrn_n_u32)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int16x8, short, v4i16, s16, v_int32x4, pack, msa_qmovn_s32, msa_qrshrn_n_s32)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint32x4, unsigned, v2u32, u32, v_uint64x2, pack, msa_movn_u64, msa_rshrn_n_u64)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int32x4, int, v2i32, s32, v_int64x2, pack, msa_movn_s64, msa_rshrn_n_s64)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_int16x8, pack_u, msa_qmovun_s16, msa_qrshrun_n_s16)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_int32x4, pack_u, msa_qmovun_s32, msa_qrshrun_n_s32)
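// Usage sketch (illustrative, not from the original header): narrowing eight
// 16-bit values to 8-bit with saturation, plus the rounding-shift variant
// that divides by 2^4 before narrowing.
//
//   v_uint16x8 a = v_load(src16);   // eight ushort values
//   v_pack_store(dst8, a);          // saturate u16 -> u8
//   v_rshr_pack_store<4>(dst8, a);  // (x + 8) >> 4, then saturate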
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint8x16(msa_pack_u16(a.val, b.val));
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    return v_uint8x16(msa_pack_u16(msa_pack_u32(a.val, b.val), msa_pack_u32(c.val, d.val)));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    v8u16 abcd = msa_pack_u32(msa_pack_u64(a.val, b.val), msa_pack_u64(c.val, d.val));
    v8u16 efgh = msa_pack_u32(msa_pack_u64(e.val, f.val), msa_pack_u64(g.val, h.val));
    return v_uint8x16(msa_pack_u16(abcd, efgh));
}
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    v4f32 v0 = v.val;
    v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0);
    res = msa_mlaq_lane_f32(res, m1.val, v0, 1);
    res = msa_mlaq_lane_f32(res, m2.val, v0, 2);
    res = msa_mlaq_lane_f32(res, m3.val, v0, 3);
    return v_float32x4(res);
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    v4f32 v0 = v.val;
    v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0);
    res = msa_mlaq_lane_f32(res, m1.val, v0, 1);
    res = msa_mlaq_lane_f32(res, m2.val, v0, 2);
    res = msa_addq_f32(res, a.val);
    return v_float32x4(res);
}
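// Usage sketch (illustrative; M is a hypothetical row-major 4x4 float array):
// res = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3.
//
//   v_float32x4 m0 = v_load(M + 0), m1 = v_load(M + 4),
//               m2 = v_load(M + 8), m3 = v_load(M + 12);
//   v_float32x4 res = v_matmul(v, m0, m1, m2, m3);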
#define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64)
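// Usage sketch (illustrative): the 8/16-bit operators saturate (they map to
// msa_qaddq_*/msa_qsubq_*), while 32/64-bit integer and float operators wrap
// or behave as ordinary IEEE arithmetic.
//
//   v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//   v_uint8x16 c = a + b;   // every lane saturates to 255, not 44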
#define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpwvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    v16i8 a_lo, a_hi, b_lo, b_hi;

    ILVRL_B2_SB(a.val, msa_dupq_n_s8(0), a_lo, a_hi);
    ILVRL_B2_SB(b.val, msa_dupq_n_s8(0), b_lo, b_hi);
    c.val = msa_mulq_s16(msa_paddlq_s8(a_lo), msa_paddlq_s8(b_lo));
    d.val = msa_mulq_s16(msa_paddlq_s8(a_hi), msa_paddlq_s8(b_hi));
}

inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    v16u8 a_lo, a_hi, b_lo, b_hi;

    ILVRL_B2_UB(a.val, msa_dupq_n_u8(0), a_lo, a_hi);
    ILVRL_B2_UB(b.val, msa_dupq_n_u8(0), b_lo, b_hi);
    c.val = msa_mulq_u16(msa_paddlq_u8(a_lo), msa_paddlq_u8(b_lo));
    d.val = msa_mulq_u16(msa_paddlq_u8(a_hi), msa_paddlq_u8(b_hi));
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    v8i16 a_lo, a_hi, b_lo, b_hi;

    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi);
    ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi);
    c.val = msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo));
    d.val = msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi));
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    v8u16 a_lo, a_hi, b_lo, b_hi;

    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi);
    ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi);
    c.val = msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo));
    d.val = msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi));
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    v4u32 a_lo, a_hi, b_lo, b_hi;

    ILVRL_W2_UW(a.val, msa_dupq_n_u32(0), a_lo, a_hi);
    ILVRL_W2_UW(b.val, msa_dupq_n_u32(0), b_lo, b_hi);
    c.val = msa_mulq_u64(msa_paddlq_u32(a_lo), msa_paddlq_u32(b_lo));
    d.val = msa_mulq_u64(msa_paddlq_u32(a_hi), msa_paddlq_u32(b_hi));
}

inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    v8i16 a_lo, a_hi, b_lo, b_hi;

    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi);
    ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi);

    return v_int16x8(msa_packr_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo)),
                                   msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi)), 16));
}

inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    v8u16 a_lo, a_hi, b_lo, b_hi;

    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi);
    ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi);

    return v_uint16x8(msa_packr_u32(msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo)),
                                    msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)), 16));
}
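// Usage sketch (illustrative): widening 16x16 -> 32-bit products; c receives
// the low-half lanes, d the high-half lanes, and v_mul_hi keeps only the top
// 16 bits of each product.
//
//   v_uint16x8 a = v_setall_u16(1000), b = v_setall_u16(70);
//   v_uint32x4 c, d;
//   v_mul_expand(a, b, c, d);       // each lane = 70000, no overflow
//   v_uint16x8 hi = v_mul_hi(a, b); // upper 16 bits of each product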
// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(msa_dotp_s_w(a.val, b.val)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_int32x4(msa_dpadd_s_w(c.val, a.val, b.val)); }

// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{ return v_int64x2(msa_dotp_s_d(a.val, b.val)); }
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_int64x2(msa_dpadd_s_d(c.val, a.val, b.val)); }

// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
    v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
    v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
    v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
    v4u32 prod = msa_dotp_u_w(even_a, even_b);
    return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
}

inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
    v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
    v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
    v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
    v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
    v4u32 prod = msa_dpadd_u_w(c.val, even_a, even_b);
    return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
}

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    v8i16 prod = msa_dotp_s_h(a.val, b.val);
    return v_int32x4(msa_hadd_s32(prod, prod));
}

// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
    v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
    v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
    v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
    v2u64 prod = msa_dotp_u_d(even_a, even_b);
    return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
}

inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
    v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
    v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
    v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
    v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
    v2u64 prod = msa_dpadd_u_d(c.val, even_a, even_b);
    return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
}

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    v4i32 prod = msa_dotp_s_w(a.val, b.val);
    return v_int64x2(msa_hadd_s64(prod, prod));
}
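// Usage sketch (illustrative): pairwise 16-bit dot product accumulating into
// 32-bit lanes, with an optional accumulator argument.
//
//   v_int16x8 a = v_setall_s16(3), b = v_setall_s16(4);
//   v_int32x4 d  = v_dotprod(a, b);    // each lane = 3*4 + 3*4 = 24
//   v_int32x4 d2 = v_dotprod(a, b, d); // 24 + 24 = 48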
#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \
}

OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint8x16, v16u8, u8)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int8x16, v16i8, s8)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint16x8, v8u16, u16)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int16x8, v8i16, s16)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint32x4, v4u32, u32)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int32x4, v4i32, s32)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64)
#define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
}
#define OPENCV_HAL_IMPL_MSA_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
inline _Tpuvec v_abs(const _Tpsvec& a) \
{ \
    return v_reinterpret_as_##usuffix(_Tpsvec(msa_absq_##ssuffix(a.val))); \
}

#define OPENCV_HAL_IMPL_MSA_BASIC_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a) \
{ \
    return _Tpvec(intrin(a.val)); \
}

OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_abs, msa_absq_f32)
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_abs, msa_absq_f64)
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_sqrt, msa_sqrtq_f32)
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64)
#define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
}
#define OPENCV_HAL_IMPL_MSA_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_min, msa_minq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_max, msa_maxq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_min, msa_minq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_max, msa_maxq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_min, msa_minq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_max, msa_maxq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_min, msa_minq_s16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_max, msa_maxq_s16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_min, msa_minq_u32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_max, msa_maxq_u32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_min, msa_minq_s32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_max, msa_maxq_s32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_min, msa_minq_f32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_max, msa_maxq_f32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64)
#define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); }

OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int8x16, v16i8, s8, u8)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint16x8, v8u16, u16, u16)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int16x8, v8i16, s16, u16)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint32x4, v4u32, u32, u32)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int32x4, v4i32, s32, u32)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float32x4, v4f32, f32, u32)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint64x2, v2u64, u64, u64)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int64x2, v2i64, s64, u64)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float64x2, v2f64, f64, u64)

inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ceqq_f32(a.val, a.val))); }
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ceqq_f64(a.val, a.val))); }
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_add_wrap, msa_addq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_add_wrap, msa_addq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_add_wrap, msa_addq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_add_wrap, msa_addq_s16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_sub_wrap, msa_subq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_sub_wrap, msa_subq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_sub_wrap, msa_subq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_sub_wrap, msa_subq_s16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_mul_wrap, msa_mulq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_mul_wrap, msa_mulq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_mul_wrap, msa_mulq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_mul_wrap, msa_mulq_s16)
#define OPENCV_HAL_IMPL_MSA_BIN_FUNC2(_Tpvec, _Tpvec2, _Tpv, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec2(MSA_TPV_REINTERPRET(_Tpv, intrin(a.val, b.val))); \
}
inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
    return v_sqrt(x);
}

inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_float32x4(msa_mlaq_f32(c.val, a.val, b.val));
}

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(msa_mlaq_s32(c.val, a.val, b.val));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_fma(a, b, c);
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    v_float64x2 x(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
    return v_sqrt(x);
}

inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(msa_mlaq_f64(c.val, a.val, b.val));
}

inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_fma(a, b, c);
}
#define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(msa_shrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(msa_rshrq_n_##suffix(a.val, n)); }

OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int8x16, s8, schar, s8)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int64x2, s64, int64, s64)
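// Usage sketch (illustrative): run-time shifts use the operators; compile-time
// shift counts can use the template forms, where v_rshr rounds.
//
//   v_uint16x8 a = v_setall_u16(19);
//   v_uint16x8 b = a >> 2;       // 4
//   v_uint16x8 c = v_rshr<2>(a); // (19 + 2) >> 2 = 5, rounded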
#define OPENCV_HAL_IMPL_MSA_ROTATE_OP(_Tpvec, _Tpv, _Tpvs, suffix) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##suffix(0), n))); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(msa_dupq_n_##suffix(0), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ \
    return a; \
} \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), n))); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, b.val), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ \
    CV_UNUSED(b); \
    return a; \
}

OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint8x16, v16u8, v16i8, s8)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int8x16, v16i8, v16i8, s8)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint16x8, v8u16, v8i16, s16)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int16x8, v8i16, v8i16, s16)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint32x4, v4u32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int32x4, v4i32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float32x4, v4f32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint64x2, v2u64, v2i64, s64)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int64x2, v2i64, v2i64, s64)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float64x2, v2f64, v2i64, s64)
#define OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr), msa_dup_n_##suffix((_Tp)0))); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr0), msa_ld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ msa_st1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ msa_st1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ msa_st1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ msa_st1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ \
    int n = _Tpvec::nlanes; \
    for( int i = 0; i < (n/2); i++ ) \
        ptr[i] = a.val[i]; \
} \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    int n = _Tpvec::nlanes; \
    for( int i = 0; i < (n/2); i++ ) \
        ptr[i] = a.val[i+(n/2)]; \
}

OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float64x2, double, f64)
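// Usage sketch (illustrative; assumes n is a multiple of 4): a typical
// load/process/store loop over a float array.
//
//   for( int i = 0; i < n; i += v_float32x4::nlanes )
//   {
//       v_float32x4 x = v_load(src + i);
//       v_store(dst + i, x * x);
//   }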
inline v_uint8x16 v_reverse(const v_uint8x16& a)
{
    v_uint8x16 c = v_uint8x16((v16u8)__builtin_msa_vshf_b((v16i8)((v2i64){0x08090A0B0C0D0E0F, 0x0001020304050607}), msa_dupq_n_s8(0), (v16i8)a.val));
    return c;
}

inline v_int8x16 v_reverse(const v_int8x16& a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x8 v_reverse(const v_uint16x8& a)
{
    v_uint16x8 c = v_uint16x8((v8u16)__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000500060007, 0x0000000100020003}), msa_dupq_n_s16(0), (v8i16)a.val));
    return c;
}

inline v_int16x8 v_reverse(const v_int16x8& a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x4 v_reverse(const v_uint32x4& a)
{
    v_uint32x4 c;
    c.val[0] = a.val[3];
    c.val[1] = a.val[2];
    c.val[2] = a.val[1];
    c.val[3] = a.val[0];
    return c;
}

inline v_int32x4 v_reverse(const v_int32x4& a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x4 v_reverse(const v_float32x4& a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x2 v_reverse(const v_uint64x2& a)
{
    v_uint64x2 c;
    c.val[0] = a.val[1];
    c.val[1] = a.val[0];
    return c;
}

inline v_int64x2 v_reverse(const v_int64x2& a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x2 v_reverse(const v_float64x2& a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(func, cfunc) \
inline unsigned short v_reduce_##func(const v_uint16x8& a) \
{ \
    v8u16 a_lo, a_hi; \
    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); \
    v4u32 b = msa_##func##q_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(a_hi)); \
    v4u32 b_lo, b_hi; \
    ILVRL_W2_UW(b, msa_dupq_n_u32(0), b_lo, b_hi); \
    v2u64 c = msa_##func##q_u64(msa_paddlq_u32(b_lo), msa_paddlq_u32(b_hi)); \
    return (unsigned short)cfunc(c[0], c[1]); \
}

#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(func, cfunc) \
inline short v_reduce_##func(const v_int16x8& a) \
{ \
    v8i16 a_lo, a_hi; \
    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); \
    v4i32 b = msa_##func##q_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(a_hi)); \
    v4i32 b_lo, b_hi; \
    ILVRL_W2_SW(b, msa_dupq_n_s32(0), b_lo, b_hi); \
    v2i64 c = msa_##func##q_s64(msa_paddlq_s32(b_lo), msa_paddlq_s32(b_hi)); \
    return (short)cfunc(c[0], c[1]); \
}
#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(_Tpvec, scalartype, func, cfunc) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    return (scalartype)cfunc(cfunc(a.val[0], a.val[1]), cfunc(a.val[2], a.val[3])); \
}

#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(_Tpvec, scalartype, _Tpvec2, func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpvec2 a1, a2; \
    v_expand(a, a1, a2); \
    return (scalartype)v_reduce_##func(v_##func(a1, a2)); \
}
#define OPENCV_HAL_IMPL_MSA_REDUCE_SUM(_Tpvec, scalartype, suffix) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    return (scalartype)msa_sum_##suffix(a.val); \
}

OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint8x16, unsigned char, u8)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int8x16, char, s8)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint16x8, unsigned short, u16)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int16x8, short, s16)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int32x4, int, s32)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_float32x4, float, f32)

inline uint64 v_reduce_sum(const v_uint64x2& a)
{ return (uint64)(msa_getq_lane_u64(a.val, 0) + msa_getq_lane_u64(a.val, 1)); }

inline int64 v_reduce_sum(const v_int64x2& a)
{ return (int64)(msa_getq_lane_s64(a.val, 0) + msa_getq_lane_s64(a.val, 1)); }

inline double v_reduce_sum(const v_float64x2& a)
{
    return msa_getq_lane_f64(a.val, 0) + msa_getq_lane_f64(a.val, 1);
}
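// Usage sketch (illustrative; assumes n is a multiple of 4): horizontal sum
// via v_reduce_sum, accumulating in a vector first so the horizontal
// reduction stays out of the inner loop.
//
//   v_float32x4 acc = v_setzero_f32();
//   for( int i = 0; i < n; i += v_float32x4::nlanes )
//       acc += v_load(src + i);
//   float total = v_reduce_sum(acc);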
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    v4f32 u0 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val))),
                            MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val))));
    v4f32 u1 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val))),
                            MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val))));

    return v_float32x4(msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))),
                                    MSA_TPV_REINTERPRET(v4f32, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0)))));
}
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
    v16u8 t0 = msa_abdq_u8(a.val, b.val);
    v8u16 t1 = msa_paddlq_u8(t0);
    v4u32 t2 = msa_paddlq_u16(t1);
    return msa_sum_u32(t2);
}

inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
    v16u8 t0 = MSA_TPV_REINTERPRET(v16u8, msa_abdq_s8(a.val, b.val));
    v8u16 t1 = msa_paddlq_u8(t0);
    v4u32 t2 = msa_paddlq_u16(t1);
    return msa_sum_u32(t2);
}

inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
    v8u16 t0 = msa_abdq_u16(a.val, b.val);
    v4u32 t1 = msa_paddlq_u16(t0);
    return msa_sum_u32(t1);
}

inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
    v8u16 t0 = MSA_TPV_REINTERPRET(v8u16, msa_abdq_s16(a.val, b.val));
    v4u32 t1 = msa_paddlq_u16(t0);
    return msa_sum_u32(t1);
}

inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
    v4u32 t0 = msa_abdq_u32(a.val, b.val);
    return msa_sum_u32(t0);
}

inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
    v4u32 t0 = MSA_TPV_REINTERPRET(v4u32, msa_abdq_s32(a.val, b.val));
    return msa_sum_u32(t0);
}

inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
    v4f32 t0 = msa_abdq_f32(a.val, b.val);
    return msa_sum_f32(t0);
}
#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(_Tpvec) \
inline v_uint8x16 v_popcount(const _Tpvec& a) \
{ \
    v16u8 t = MSA_TPV_REINTERPRET(v16u8, msa_cntq_s8(MSA_TPV_REINTERPRET(v16i8, a.val))); \
    return v_uint8x16(t); \
}

OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_uint8x16)
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_int8x16)

#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(_Tpvec) \
inline v_uint16x8 v_popcount(const _Tpvec& a) \
{ \
    v8u16 t = MSA_TPV_REINTERPRET(v8u16, msa_cntq_s16(MSA_TPV_REINTERPRET(v8i16, a.val))); \
    return v_uint16x8(t); \
}

OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_uint16x8)
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_int16x8)

#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
    v4u32 t = MSA_TPV_REINTERPRET(v4u32, msa_cntq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))); \
    return v_uint32x4(t); \
}

OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_uint32x4)
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_int32x4)

#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(_Tpvec) \
inline v_uint64x2 v_popcount(const _Tpvec& a) \
{ \
    v2u64 t = MSA_TPV_REINTERPRET(v2u64, msa_cntq_s64(MSA_TPV_REINTERPRET(v2i64, a.val))); \
    return v_uint64x2(t); \
}

OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_uint64x2)
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_int64x2)
inline int v_signmask(const v_uint8x16& a)
{
    v8i8 m0 = msa_create_s8(CV_BIG_UINT(0x0706050403020100));
    v16u8 v0 = msa_shlq_u8(msa_shrq_n_u8(a.val, 7), msa_combine_s8(m0, m0));
    v8u16 v1 = msa_paddlq_u8(v0);
    v4u32 v2 = msa_paddlq_u16(v1);
    v2u64 v3 = msa_paddlq_u32(v2);
    return (int)msa_getq_lane_u64(v3, 0) + ((int)msa_getq_lane_u64(v3, 1) << 8);
}

inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }

inline int v_signmask(const v_uint16x8& a)
{
    v4i16 m0 = msa_create_s16(CV_BIG_UINT(0x0003000200010000));
    v8u16 v0 = msa_shlq_u16(msa_shrq_n_u16(a.val, 15), msa_combine_s16(m0, m0));
    v4u32 v1 = msa_paddlq_u16(v0);
    v2u64 v2 = msa_paddlq_u32(v1);
    return (int)msa_getq_lane_u64(v2, 0) + ((int)msa_getq_lane_u64(v2, 1) << 4);
}

inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }

inline int v_signmask(const v_uint32x4& a)
{
    v2i32 m0 = msa_create_s32(CV_BIG_UINT(0x0000000100000000));
    v4u32 v0 = msa_shlq_u32(msa_shrq_n_u32(a.val, 31), msa_combine_s32(m0, m0));
    v2u64 v1 = msa_paddlq_u32(v0);
    return (int)msa_getq_lane_u64(v1, 0) + ((int)msa_getq_lane_u64(v1, 1) << 2);
}

inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }

inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }

inline int v_signmask(const v_uint64x2& a)
{
    v2u64 v0 = msa_shrq_n_u64(a.val, 63);
    return (int)msa_getq_lane_u64(v0, 0) + ((int)msa_getq_lane_u64(v0, 1) << 1);
}

inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }

inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#define OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(_Tpvec, _Tpvec2, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
    _Tpvec2 v0 = msa_shrq_n_##suffix(msa_mvnq_##suffix(a.val), shift); \
    v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \
    return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
    _Tpvec2 v0 = msa_shrq_n_##suffix(a.val, shift); \
    v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \
    return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) != 0; \
}

OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint8x16, v16u8, u8, 7)
OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint16x8, v8u16, u16, 15)
OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint32x4, v4u32, u32, 31)
OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint64x2, v2u64, u64, 63)
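// Usage sketch (illustrative): comparison masks feed v_check_all/v_check_any,
// e.g. for an early-exit scan.
//
//   v_uint8x16 x = v_load(ptr);
//   if( v_check_any(x > v_setall_u8(250)) )
//   {
//       // at least one lane exceeds 250
//   }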
#define OPENCV_HAL_IMPL_MSA_SELECT(_Tpvec, _Tpv, _Tpvu) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_bslq_u8(MSA_TPV_REINTERPRET(_Tpvu, mask.val), \
                  MSA_TPV_REINTERPRET(_Tpvu, b.val), MSA_TPV_REINTERPRET(_Tpvu, a.val)))); \
}

OPENCV_HAL_IMPL_MSA_SELECT(v_uint8x16, v16u8, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_int8x16, v16i8, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_uint16x8, v8u16, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_int16x8, v8i16, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_uint32x4, v4u32, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_int32x4, v4i32, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_float32x4, v4f32, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_float64x2, v2f64, v16u8)
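// Usage sketch (illustrative): branchless clamp-to-zero using a comparison
// mask and v_select (lanes come from a where the mask is set, else from b).
//
//   v_float32x4 v = v_load(src);
//   v_float32x4 r = v_select(v > v_setzero_f32(), v, v_setzero_f32());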
#define OPENCV_HAL_IMPL_MSA_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix, ssuffix, _Tpv, _Tpvs) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
    _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
    b0.val = msa_paddlq_##suffix(a_lo); \
    b1.val = msa_paddlq_##suffix(a_hi); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
    return _Tpwvec(msa_paddlq_##suffix(a_lo)); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
    return _Tpwvec(msa_paddlq_##suffix(a_hi)); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(msa_movl_##suffix(msa_ld1_##suffix(ptr))); \
}

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    return v_uint32x4((v4u32){ptr[0], ptr[1], ptr[2], ptr[3]});
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    return v_int32x4((v4i32){ptr[0], ptr[1], ptr[2], ptr[3]});
}
#define OPENCV_HAL_IMPL_MSA_UNPACKS(_Tpvec, _Tpv, _Tpvs, ssuffix) \
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
{ \
    b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
    b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
} \
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
} \
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
} \
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
{ \
    c.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
    d.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
}

OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint8x16, v16u8, v16i8, s8)
OPENCV_HAL_IMPL_MSA_UNPACKS(v_int8x16, v16i8, v16i8, s8)
OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint16x8, v8u16, v8i16, s16)
OPENCV_HAL_IMPL_MSA_UNPACKS(v_int16x8, v8i16, v8i16, s16)
OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint32x4, v4u32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_UNPACKS(v_int32x4, v4i32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_UNPACKS(v_float32x4, v4f32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_UNPACKS(v_float64x2, v2f64, v2i64, s64)
#define OPENCV_HAL_IMPL_MSA_EXTRACT(_Tpvec, _Tpv, _Tpvs, suffix) \
template <int s> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), s))); \
}

OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint8x16, v16u8, v16i8, s8)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_int8x16, v16i8, v16i8, s8)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint16x8, v8u16, v8i16, s16)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_int16x8, v8i16, v8i16, s16)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint32x4, v4u32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_int32x4, v4i32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint64x2, v2u64, v2i64, s64)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_int64x2, v2i64, v2i64, s64)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_float32x4, v4f32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_float64x2, v2f64, v2i64, s64)
inline v_int32x4 v_round(const v_float32x4& a)
{
    return v_int32x4(msa_cvttintq_s32_f32(a.val));
}

inline v_int32x4 v_floor(const v_float32x4& a)
{
    v4i32 a1 = msa_cvttintq_s32_f32(a.val);
    return v_int32x4(msa_addq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(msa_cvtfintq_f32_s32(a1), a.val))));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    v4i32 a1 = msa_cvttintq_s32_f32(a.val);
    return v_int32x4(msa_subq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(a.val, msa_cvtfintq_f32_s32(a1)))));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{
    return v_int32x4(msa_cvttruncq_s32_f32(a.val));
}

inline v_int32x4 v_round(const v_float64x2& a)
{
    return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_dupq_n_s64(0)));
}

inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
    return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_cvttintq_s64_f64(b.val)));
}

inline v_int32x4 v_floor(const v_float64x2& a)
{
    v2f64 a1 = msa_cvtrintq_f64(a.val);
    return v_int32x4(msa_pack_s64(msa_addq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a1, a.val))), msa_dupq_n_s64(0)));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    v2f64 a1 = msa_cvtrintq_f64(a.val);
    return v_int32x4(msa_pack_s64(msa_subq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a.val, a1))), msa_dupq_n_s64(0)));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{
    return v_int32x4(msa_pack_s64(msa_cvttruncq_s64_f64(a.val), msa_dupq_n_s64(0)));
}
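// Usage sketch (illustrative): the four float->int conversions differ only in
// rounding mode.
//
//   v_float32x4 v = v_float32x4(1.5f, -1.5f, 2.7f, -2.7f);
//   v_int32x4 r = v_round(v); // 2, -2, 3, -3 (nearest)
//   v_int32x4 f = v_floor(v); // 1, -2, 2, -3
//   v_int32x4 c = v_ceil(v);  // 2, -1, 3, -2
//   v_int32x4 t = v_trunc(v); // 1, -1, 2, -2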
#define OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(_Tpvec, _Tpv, _Tpvs, ssuffix) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                           const _Tpvec& a2, const _Tpvec& a3, \
                           _Tpvec& b0, _Tpvec& b1, \
                           _Tpvec& b2, _Tpvec& b3) \
{ \
    _Tpv t00 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
    _Tpv t01 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
    _Tpv t10 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
    _Tpv t11 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
    b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \
    b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \
    b2.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \
    b3.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \
}

OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_uint32x4, v4u32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_int32x4, v4i32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_float32x4, v4f32, v4i32, s32)
#define OPENCV_HAL_IMPL_MSA_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
    msa_ld2q_##suffix(ptr, &a.val, &b.val); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
    msa_ld3q_##suffix(ptr, &a.val, &b.val, &c.val); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    msa_ld4q_##suffix(ptr, &a.val, &b.val, &c.val, &d.val); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    msa_st2q_##suffix(ptr, a.val, b.val); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    msa_st3q_##suffix(ptr, a.val, b.val, c.val); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
    msa_st4q_##suffix(ptr, a.val, b.val, c.val, d.val); \
}

OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(float32x4, float, f32)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(int64x2, int64, s64)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(float64x2, double, f64)
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    return v_float32x4(msa_cvtfq_f32_f64(a.val, msa_dupq_n_f64(0.0f)));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    return v_float32x4(msa_cvtfq_f32_f64(a.val, b.val));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(msa_cvtflq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    return v_float64x2(msa_cvtfhq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
}
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
        tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]],
        tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]],
        tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]
    };
    return v_int8x16(msa_ld1q_s8(elems));
}

inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1],
        tab[idx[2]], tab[idx[2] + 1], tab[idx[3]], tab[idx[3] + 1],
        tab[idx[4]], tab[idx[4] + 1], tab[idx[5]], tab[idx[5] + 1],
        tab[idx[6]], tab[idx[6] + 1], tab[idx[7]], tab[idx[7] + 1]
    };
    return v_int8x16(msa_ld1q_s8(elems));
}

inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[0] + 2], tab[idx[0] + 3],
        tab[idx[1]], tab[idx[1] + 1], tab[idx[1] + 2], tab[idx[1] + 3],
        tab[idx[2]], tab[idx[2] + 1], tab[idx[2] + 2], tab[idx[2] + 3],
        tab[idx[3]], tab[idx[3] + 1], tab[idx[3] + 2], tab[idx[3] + 3]
    };
    return v_int8x16(msa_ld1q_s8(elems));
}
inline
v_uint8x16
v_lut(
const
uchar* tab,
const
int* idx) {
return
v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
1581
inline
v_uint8x16
v_lut_pairs(
const
uchar* tab,
const
int* idx) {
return
v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
1582
inline
v_uint8x16
v_lut_quads(
const
uchar* tab,
const
int* idx) {
return
v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =
    {
        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
        tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]
    };
    return v_int16x8(msa_ld1q_s16(elems));
}

inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1],
        tab[idx[2]], tab[idx[2] + 1], tab[idx[3]], tab[idx[3] + 1]
    };
    return v_int16x8(msa_ld1q_s16(elems));
}

inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    return v_int16x8(msa_combine_s16(msa_ld1_s16(tab + idx[0]), msa_ld1_s16(tab + idx[1])));
}
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
    int CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]
    };
    return v_int32x4(msa_ld1q_s32(elems));
}

inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    return v_int32x4(msa_combine_s32(msa_ld1_s32(tab + idx[0]), msa_ld1_s32(tab + idx[1])));
}

inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
    return v_int32x4(msa_ld1q_s32(tab + idx[0]));
}
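// Usage sketch (illustrative): gathering four table entries by index, and the
// pairs variant that fetches consecutive runs per index.
//
//   int tab[256]; int idx[4] = {3, 40, 7, 0};
//   v_int32x4 g = v_lut(tab, idx);       // tab[3], tab[40], tab[7], tab[0]
//   v_int32x4 p = v_lut_pairs(tab, idx); // tab[3], tab[4], tab[40], tab[41]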
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
    return v_int64x2(msa_combine_s64(msa_create_s64(tab[idx[0]]), msa_create_s64(tab[idx[1]])));
}

inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
    return v_int64x2(msa_ld1q_s64(tab + idx[0]));
}

inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]
    };
    return v_float32x4(msa_ld1q_f32(elems));
}

inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
    uint64 CV_DECL_ALIGNED(32) elems[2] =
    {
        *(uint64*)(tab + idx[0]),
        *(uint64*)(tab + idx[1])
    };
    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ld1q_u64(elems)));
}

inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
{
    return v_float32x4(msa_ld1q_f32(tab + idx[0]));
}
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}

inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    unsigned CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[msa_getq_lane_s32(idxvec.val, 0)],
        tab[msa_getq_lane_s32(idxvec.val, 1)],
        tab[msa_getq_lane_s32(idxvec.val, 2)],
        tab[msa_getq_lane_s32(idxvec.val, 3)]
    };
    return v_uint32x4(msa_ld1q_u32(elems));
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}

inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    v4f32 xy02 = msa_combine_f32(msa_ld1_f32(tab + idx[0]), msa_ld1_f32(tab + idx[2]));
    v4f32 xy13 = msa_combine_f32(msa_ld1_f32(tab + idx[1]), msa_ld1_f32(tab + idx[3]));
    x = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02))));
    y = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02))));
}
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0705060403010200, 0x0F0D0E0C0B090A08}), msa_dupq_n_s8(0), vec.val));
    return c;
}

inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }

inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0703060205010400, 0x0F0B0E0A0D090C08}), msa_dupq_n_s8(0), vec.val));
    return c;
}

inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0003000100020000, 0x0007000500060004}), msa_dupq_n_s16(0), vec.val));
    return c;
}

inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec)
{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }

inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0005000100040000, 0x0007000300060002}), msa_dupq_n_s16(0), vec.val));
    return c;
}

inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec)
{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }

inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    v_int32x4 c;
    c.val[0] = vec.val[0];
    c.val[1] = vec.val[2];
    c.val[2] = vec.val[1];
    c.val[3] = vec.val[3];
    return c;
}

inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }

inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0908060504020100, 0x131211100E0D0C0A}), msa_dupq_n_s8(0), vec.val));
    return c;
}

inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000200010000, 0x0009000800060005}), msa_dupq_n_s16(0), vec.val));
    return c;
}

inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        tab[idx[0]],
        tab[idx[1]]
    };
    return v_float64x2(msa_ld1q_f64(elems));
}

inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
{
    return v_float64x2(msa_ld1q_f64(tab + idx[0]));
}

inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_float64x2(tab[idx[0]], tab[idx[1]]);
}

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    v2f64 xy0 = msa_ld1q_f64(tab + idx[0]);
    v2f64 xy1 = msa_ld1q_f64(tab + idx[1]);
    x = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvevq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
    y = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
}
template<int i, typename _Tp>
inline typename _Tp::lane_type v_extract_n(const _Tp& a)
{
    return v_rotate_right<i>(a).get0();
}

template<int i>
inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
{
    return v_setall_u32(v_extract_n<i>(a));
}

template<int i>
inline v_int32x4 v_broadcast_element(const v_int32x4& a)
{
    return v_setall_s32(v_extract_n<i>(a));
}

template<int i>
inline v_float32x4 v_broadcast_element(const v_float32x4& a)
{
    return v_setall_f32(v_extract_n<i>(a));
}
#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
#ifndef msa_ld1_f16
    v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr);
#else
    v4f16 v = msa_ld1_f16((const __fp16*)ptr);
#endif
    return v_float32x4(msa_cvt_f32_f16(v));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    v4f16 hv = msa_cvt_f16_f32(v.val);

#ifndef msa_st1_f16
    msa_st1_s16((short*)ptr, (int16x4_t)hv);
#else
    msa_st1_f16((__fp16*)ptr, hv);
#endif
}
#else
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    float buf[4];
    for( int i = 0; i < 4; i++ )
        buf[i] = (float)ptr[i];
    return v_load(buf);
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    float buf[4];
    v_store(buf, v);
    for( int i = 0; i < 4; i++ )
        ptr[i] = (float16_t)buf[i];
}
#endif

inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

#endif // OPENCV_HAL_INTRIN_MSA_HPP