OpenCV 4.5.3 (Japanese machine translation)
intrin_msa.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4
5 #ifndef OPENCV_HAL_INTRIN_MSA_HPP
6 #define OPENCV_HAL_INTRIN_MSA_HPP
7
8 #include <algorithm>
9 #include "opencv2/core/utility.hpp"
10
11 namespace cv
12{
13
15CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
16
17 #define CV_SIMD128 1
18
19 //MSA implements 128-bit wide vector registers shared with the 64-bit wide floating-point unit registers.
20 //MSA and the FPU cannot both be present unless the FPU has 64-bit floating-point registers.
21 #define CV_SIMD128_64F 1
22
23 struct v_uint8x16
24{
25 typedef uchar lane_type;
26 enum { nlanes = 16 };
27
28 v_uint8x16() {}
29 explicit v_uint8x16(v16u8 v) : val(v) {}
30 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
31 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
32 {
33 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
34 val = msa_ld1q_u8(v);
35 }
36
37 uchar get0() const
38 {
39 return msa_getq_lane_u8(val, 0);
40 }
41
42 v16u8 val;
43};
44
45 struct v_int8x16
46{
47 typedef schar lane_type;
48 enum { nlanes = 16 };
49
50 v_int8x16() {}
51 explicit v_int8x16(v16i8 v) : val(v) {}
52 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
53 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
54 {
55 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
56 val = msa_ld1q_s8(v);
57 }
58
59 schar get0() const
60 {
61 return msa_getq_lane_s8(val, 0);
62 }
63
64 v16i8 val;
65};
66
67 struct v_uint16x8
68{
69 typedef ushort lane_type;
70 enum { nlanes = 8 };
71
72 v_uint16x8() {}
73 explicit v_uint16x8(v8u16 v) : val(v) {}
74 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
75 {
76 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
77 val = msa_ld1q_u16(v);
78 }
79
80 ushort get0() const
81 {
82 return msa_getq_lane_u16(val, 0);
83 }
84
85 v8u16 val;
86};
87
88 struct v_int16x8
89{
90 typedef short lane_type;
91 enum { nlanes = 8 };
92
93 v_int16x8() {}
94 explicit v_int16x8(v8i16 v) : val(v) {}
95 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
96 {
97 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
98 val = msa_ld1q_s16(v);
99 }
100
101 short get0() const
102 {
103 return msa_getq_lane_s16(val, 0);
104 }
105
106 v8i16 val;
107};
108
109 struct v_uint32x4
110{
111 typedef unsigned int lane_type;
112 enum { nlanes = 4 };
113
114 v_uint32x4() {}
115 explicit v_uint32x4(v4u32 v) : val(v) {}
116 v_uint32x4(unsigned int v0, unsigned int v1, unsigned int v2, unsigned int v3)
117 {
118 unsigned int v[] = {v0, v1, v2, v3};
119 val = msa_ld1q_u32(v);
120 }
121
122 unsigned int get0() const
123 {
124 return msa_getq_lane_u32(val, 0);
125 }
126
127 v4u32 val;
128};
129
130 struct v_int32x4
131{
132 typedef int lane_type;
133 enum { nlanes = 4 };
134
135 v_int32x4() {}
136 explicit v_int32x4(v4i32 v) : val(v) {}
137 v_int32x4(int v0, int v1, int v2, int v3)
138 {
139 int v[] = {v0, v1, v2, v3};
140 val = msa_ld1q_s32(v);
141 }
142
143 int get0() const
144 {
145 return msa_getq_lane_s32(val, 0);
146 }
147
148 v4i32 val;
149};
150
151 struct v_float32x4
152{
153 typedef float lane_type;
154 enum { nlanes = 4 };
155
156 v_float32x4() {}
157 explicit v_float32x4(v4f32 v) : val(v) {}
158 v_float32x4(float v0, float v1, float v2, float v3)
159 {
160 float v[] = {v0, v1, v2, v3};
161 val = msa_ld1q_f32(v);
162 }
163
164 float get0() const
165 {
166 return msa_getq_lane_f32(val, 0);
167 }
168
169 v4f32 val;
170};
171
172 struct v_uint64x2
173{
174 typedef uint64 lane_type;
175 enum { nlanes = 2 };
176
177 v_uint64x2() {}
178 explicit v_uint64x2(v2u64 v) : val(v) {}
179 v_uint64x2(uint64 v0, uint64 v1)
180 {
181 uint64 v[] = {v0, v1};
182 val = msa_ld1q_u64(v);
183 }
184
185 uint64 get0() const
186 {
187 return msa_getq_lane_u64(val, 0);
188 }
189
190 v2u64 val;
191};
192
193 struct v_int64x2
194{
195 typedef int64 lane_type;
196 enum { nlanes = 2 };
197
198 v_int64x2() {}
199 explicit v_int64x2(v2i64 v) : val(v) {}
200 v_int64x2(int64 v0, int64 v1)
201 {
202 int64 v[] = {v0, v1};
203 val = msa_ld1q_s64(v);
204 }
205
206 int64 get0() const
207 {
208 return msa_getq_lane_s64(val, 0);
209 }
210
211 v2i64 val;
212};
213
214 struct v_float64x2
215{
216 typedef double lane_type;
217 enum { nlanes = 2 };
218
219 v_float64x2() {}
220 explicit v_float64x2(v2f64 v) : val(v) {}
221 v_float64x2(double v0, double v1)
222 {
223 double v[] = {v0, v1};
224 val = msa_ld1q_f64(v);
225 }
226
227 double get0() const
228 {
229 return msa_getq_lane_f64(val, 0);
230 }
231
232 v2f64 val;
233};
234
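A minimal usage sketch (illustrative only, not part of the original header): each v_* struct wraps one 128-bit MSA register; the constructor loads explicit lane values, get0() reads lane 0, and nlanes gives the lane count.

    v_float32x4 v(1.0f, 2.0f, 3.0f, 4.0f);   // fills one v4f32 register with {1,2,3,4}
    float first = v.get0();                   // 1.0f (lane 0)
    int lanes = v_float32x4::nlanes;          // 4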
235 #define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \
236 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \
237 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \
238 inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \
239 inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \
240 inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \
241 inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(MSA_TPV_REINTERPRET(v8i16, v.val)); } \
242 inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(MSA_TPV_REINTERPRET(v4u32, v.val)); } \
243 inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(MSA_TPV_REINTERPRET(v4i32, v.val)); } \
244 inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(MSA_TPV_REINTERPRET(v2u64, v.val)); } \
245 inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(MSA_TPV_REINTERPRET(v2i64, v.val)); } \
246 inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, v.val)); } \
247 inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, v.val)); }
248
249OPENCV_HAL_IMPL_MSA_INIT(uint8x16, uchar, u8)
250OPENCV_HAL_IMPL_MSA_INIT(int8x16, schar, s8)
251OPENCV_HAL_IMPL_MSA_INIT(uint16x8, ushort, u16)
252OPENCV_HAL_IMPL_MSA_INIT(int16x8, short, s16)
253OPENCV_HAL_IMPL_MSA_INIT(uint32x4, unsigned int, u32)
254OPENCV_HAL_IMPL_MSA_INIT(int32x4, int, s32)
255OPENCV_HAL_IMPL_MSA_INIT(uint64x2, uint64, u64)
256OPENCV_HAL_IMPL_MSA_INIT(int64x2, int64, s64)
257OPENCV_HAL_IMPL_MSA_INIT(float32x4, float, f32)
258OPENCV_HAL_IMPL_MSA_INIT(float64x2, double, f64)
259
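The OPENCV_HAL_IMPL_MSA_INIT expansions above provide the usual initializers and bit-exact reinterpretation casts; a short sketch (illustrative only):

    v_uint8x16 zeros = v_setzero_u8();               // all 16 lanes = 0
    v_int16x8  fives = v_setall_s16((short)5);       // all 8 lanes = 5
    v_uint16x8 bits  = v_reinterpret_as_u16(fives);  // same 128 bits, viewed as unsigned lanes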
260 #define OPENCV_HAL_IMPL_MSA_PACK(_Tpvec, _Tpwvec, pack, mov, rshr) \
261 inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
262 { \
263 return _Tpvec(mov(a.val, b.val)); \
264 } \
265 template<int n> inline \
266 _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
267 { \
268 return _Tpvec(rshr(a.val, b.val, n)); \
269 }
270
271OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_uint16x8, pack, msa_qpack_u16, msa_qrpackr_u16)
272OPENCV_HAL_IMPL_MSA_PACK(v_int8x16, v_int16x8, pack, msa_qpack_s16, msa_qrpackr_s16)
273OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_uint32x4, pack, msa_qpack_u32, msa_qrpackr_u32)
274OPENCV_HAL_IMPL_MSA_PACK(v_int16x8, v_int32x4, pack, msa_qpack_s32, msa_qrpackr_s32)
275OPENCV_HAL_IMPL_MSA_PACK(v_uint32x4, v_uint64x2, pack, msa_pack_u64, msa_rpackr_u64)
276OPENCV_HAL_IMPL_MSA_PACK(v_int32x4, v_int64x2, pack, msa_pack_s64, msa_rpackr_s64)
277OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_int16x8, pack_u, msa_qpacku_s16, msa_qrpackru_s16)
278OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_int32x4, pack_u, msa_qpacku_s32, msa_qrpackru_s32)
279
280 #define OPENCV_HAL_IMPL_MSA_PACK_STORE(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
281 inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
282 { \
283 hreg a1 = mov(a.val); \
284 msa_st1_##suffix(ptr, a1); \
285 } \
286 template<int n> inline \
287 void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
288 { \
289 hreg a1 = rshr(a.val, n); \
290 msa_st1_##suffix(ptr, a1); \
291 }
292
293OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_uint16x8, pack, msa_qmovn_u16, msa_qrshrn_n_u16)
294OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int8x16, schar, v8i8, s8, v_int16x8, pack, msa_qmovn_s16, msa_qrshrn_n_s16)
295OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_uint32x4, pack, msa_qmovn_u32, msa_qrshrn_n_u32)
296OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int16x8, short, v4i16, s16, v_int32x4, pack, msa_qmovn_s32, msa_qrshrn_n_s32)
297OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint32x4, unsigned, v2u32, u32, v_uint64x2, pack, msa_movn_u64, msa_rshrn_n_u64)
298OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int32x4, int, v2i32, s32, v_int64x2, pack, msa_movn_s64, msa_rshrn_n_s64)
299OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_int16x8, pack_u, msa_qmovun_s16, msa_qrshrun_n_s16)
300OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_int32x4, pack_u, msa_qmovun_s32, msa_qrshrun_n_s32)
301
302 // pack boolean
303 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
304{
305 return v_uint8x16(msa_pack_u16(a.val, b.val));
306}
307
308 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
309 const v_uint32x4& c, const v_uint32x4& d)
310{
311 return v_uint8x16(msa_pack_u16(msa_pack_u32(a.val, b.val), msa_pack_u32(c.val, d.val)));
312}
313
314 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
315 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
316 const v_uint64x2& g, const v_uint64x2& h)
317{
318 v8u16 abcd = msa_pack_u32(msa_pack_u64(a.val, b.val), msa_pack_u64(c.val, d.val));
319 v8u16 efgh = msa_pack_u32(msa_pack_u64(e.val, f.val), msa_pack_u64(g.val, h.val));
320 return v_uint8x16(msa_pack_u16(abcd, efgh));
321}
322
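A sketch of the narrowing operations defined above (illustrative only): v_pack narrows with saturation, and v_rshr_pack<n> applies a rounding right shift before the saturating narrow.

    v_int32x4 a = v_setall_s32(100000), b = v_setall_s32(-7);
    v_int16x8 p = v_pack(a, b);          // 100000 saturates to 32767
    v_int16x8 r = v_rshr_pack<2>(a, b);  // (x + 2) >> 2 per lane, then saturating narrow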
323 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
324 const v_float32x4& m1, const v_float32x4& m2,
325 const v_float32x4& m3)
326{
327 v4f32 v0 = v.val;
328 v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0);
329 res = msa_mlaq_lane_f32(res, m1.val, v0, 1);
330 res = msa_mlaq_lane_f32(res, m2.val, v0, 2);
331 res = msa_mlaq_lane_f32(res, m3.val, v0, 3);
332 return v_float32x4(res);
333}
334
335 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
336 const v_float32x4& m1, const v_float32x4& m2,
337 const v_float32x4& a)
338{
339 v4f32 v0 = v.val;
340 v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0);
341 res = msa_mlaq_lane_f32(res, m1.val, v0, 1);
342 res = msa_mlaq_lane_f32(res, m2.val, v0, 2);
343 res = msa_addq_f32(res, a.val);
344 return v_float32x4(res);
345}
346
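From the code above, m0..m3 act as matrix columns: the result is v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3. A small wrapper as an illustration (the function name is ours, not OpenCV's):

    inline v_float32x4 transform4(const v_float32x4& v,
                                  const v_float32x4& c0, const v_float32x4& c1,
                                  const v_float32x4& c2, const v_float32x4& c3)
    {
        return v_matmul(v, c0, c1, c2, c3);  // column-major 4x4 matrix times vector
    }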
347 #define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \
348 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
349 { \
350 return _Tpvec(intrin(a.val, b.val)); \
351 } \
352 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
353 { \
354 a.val = intrin(a.val, b.val); \
355 return a; \
356 }
357
358OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8)
359OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
360OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8)
361OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8)
362OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16)
363OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16)
364OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16)
365OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16)
366OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32)
367OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32)
368OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32)
369OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32)
370OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32)
371OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32)
372OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32)
373OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32)
374OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32)
375OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64)
376OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64)
377OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64)
378OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64)
379OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32)
380OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64)
381OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64)
382OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64)
383OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64)
384
385 // saturating multiply 8-bit, 16-bit
386 #define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \
387 inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
388 { \
389 _Tpwvec c, d; \
390 v_mul_expand(a, b, c, d); \
391 return v_pack(c, d); \
392 } \
393 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
394 {a = a * b; return a; }
395
396OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8)
397OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8)
398OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int16x8, v_int32x4)
399OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint16x8, v_uint32x4)
400
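So for 8-bit and 16-bit lanes, operator* widens, multiplies, and narrows back with saturation; a sketch (illustrative only):

    v_uint8x16 x = v_setall_u8(200), y = v_setall_u8(2);
    v_uint8x16 p = x * y;   // 200*2 = 400 exceeds 8 bits, so each lane saturates to 255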
401 // Multiply and expand
402 inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
403 v_int16x8& c, v_int16x8& d)
404{
405 v16i8 a_lo, a_hi, b_lo, b_hi;
406
407 ILVRL_B2_SB(a.val, msa_dupq_n_s8(0), a_lo, a_hi);
408 ILVRL_B2_SB(b.val, msa_dupq_n_s8(0), b_lo, b_hi);
409 c.val = msa_mulq_s16(msa_paddlq_s8(a_lo), msa_paddlq_s8(b_lo));
410 d.val = msa_mulq_s16(msa_paddlq_s8(a_hi), msa_paddlq_s8(b_hi));
411}
412
413 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
414 v_uint16x8& c, v_uint16x8& d)
415{
416 v16u8 a_lo, a_hi, b_lo, b_hi;
417
418 ILVRL_B2_UB(a.val, msa_dupq_n_u8(0), a_lo, a_hi);
419 ILVRL_B2_UB(b.val, msa_dupq_n_u8(0), b_lo, b_hi);
420 c.val = msa_mulq_u16(msa_paddlq_u8(a_lo), msa_paddlq_u8(b_lo));
421 d.val = msa_mulq_u16(msa_paddlq_u8(a_hi), msa_paddlq_u8(b_hi));
422}
423
424 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
425 v_int32x4& c, v_int32x4& d)
426{
427 v8i16 a_lo, a_hi, b_lo, b_hi;
428
429 ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi);
430 ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi);
431 c.val = msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo));
432 d.val = msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi));
433}
434
435 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
436 v_uint32x4& c, v_uint32x4& d)
437{
438 v8u16 a_lo, a_hi, b_lo, b_hi;
439
440 ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi);
441 ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi);
442 c.val = msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo));
443 d.val = msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi));
444}
445
446 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
447 v_uint64x2& c, v_uint64x2& d)
448{
449 v4u32 a_lo, a_hi, b_lo, b_hi;
450
451 ILVRL_W2_UW(a.val, msa_dupq_n_u32(0), a_lo, a_hi);
452 ILVRL_W2_UW(b.val, msa_dupq_n_u32(0), b_lo, b_hi);
453 c.val = msa_mulq_u64(msa_paddlq_u32(a_lo), msa_paddlq_u32(b_lo));
454 d.val = msa_mulq_u64(msa_paddlq_u32(a_hi), msa_paddlq_u32(b_hi));
455}
456
457 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
458{
459 v8i16 a_lo, a_hi, b_lo, b_hi;
460
461 ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi);
462 ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi);
463
464 return v_int16x8(msa_packr_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo)),
465 msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi)), 16));
466}
467
468 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
469{
470 v8u16 a_lo, a_hi, b_lo, b_hi;
471
472 ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi);
473 ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi);
474
475 return v_uint16x8(msa_packr_u32(msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo)),
476 msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)), 16));
477}
478
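A sketch of the widening multiplies defined above (illustrative only): v_mul_expand returns the full products of the low and high halves, while v_mul_hi keeps only the upper 16 bits of each product.

    v_int16x8 a = v_setall_s16(1000), b = v_setall_s16(1000);
    v_int32x4 lo, hi;
    v_mul_expand(a, b, lo, hi);      // every 32-bit lane holds 1000000
    v_int16x8 top = v_mul_hi(a, b);  // 1000000 >> 16 = 15 in every lane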
479 //////// Dot Product ////////
480
481 // 16 >> 32
482 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
483{ return v_int32x4(msa_dotp_s_w(a.val, b.val)); }
484 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
485{ return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val)); }
486
487 // 32 >> 64
488 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
489{ return v_int64x2(msa_dotp_s_d(a.val, b.val)); }
490 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
491{ return v_int64x2(msa_dpadd_s_d(c.val , a.val, b.val)); }
492
493 // 8 >> 32
494 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
495{
496 v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
497 v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
498 v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
499 v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
500 v4u32 prod = msa_dotp_u_w(even_a, even_b);
501 return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
502}
503 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
504{
505 v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
506 v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
507 v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
508 v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
509 v4u32 prod = msa_dpadd_u_w(c.val, even_a, even_b);
510 return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
511}
512
513 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
514{
515 v8i16 prod = msa_dotp_s_h(a.val, b.val);
516 return v_int32x4(msa_hadd_s32(prod, prod));
517}
518 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
519 const v_int32x4& c)
520{ return v_dotprod_expand(a, b) + c; }
521
522 // 16 >> 64
523 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
524{
525 v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
526 v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
527 v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
528 v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
529 v2u64 prod = msa_dotp_u_d(even_a, even_b);
530 return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
531}
532 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
533 const v_uint64x2& c)
534{
535 v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
536 v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
537 v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
538 v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
539 v2u64 prod = msa_dpadd_u_d(c.val, even_a, even_b);
540 return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
541}
542
543 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
544{
545 v4i32 prod = msa_dotp_s_w(a.val, b.val);
546 return v_int64x2(msa_hadd_s64(prod, prod));
547}
548 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
549{ return v_dotprod_expand(a, b) + c; }
550
551 // 32 >> 64f
552 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
553{ return v_cvt_f64(v_dotprod(a, b)); }
554 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
555{ return v_dotprod_expand(a, b) + c; }
556
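A sketch of the dot-product forms above (illustrative only): adjacent lane pairs are multiplied and summed into the wider type, optionally adding an accumulator.

    v_int16x8 a = v_setall_s16(3), b = v_setall_s16(4);
    v_int32x4 d   = v_dotprod(a, b);     // each lane = 3*4 + 3*4 = 24
    v_int32x4 acc = v_dotprod(a, b, d);  // 24 + 24 = 48, previous result used as accumulator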
557
558 //////// Fast Dot Product ////////
559
560 // 16 >> 32
561 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
562{ return v_dotprod(a, b); }
563 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
564{ return v_dotprod(a, b, c); }
565
566 // 32 >> 64
567 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
568{ return v_dotprod(a, b); }
569 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
570{ return v_dotprod(a, b, c); }
571
572 // 8 >> 32
573 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
574{ return v_dotprod_expand(a, b); }
575 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
576{ return v_dotprod_expand(a, b, c); }
577 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
578{ return v_dotprod_expand(a, b); }
579 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
580{ return v_dotprod_expand(a, b, c); }
581
582 // 16 >> 64
583 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
584{ return v_dotprod_expand(a, b); }
585 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
586{ return v_dotprod_expand(a, b, c); }
587 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
588{ return v_dotprod_expand(a, b); }
589 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
590{ return v_dotprod_expand(a, b, c); }
591
592 // 32 >> 64f
593 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
594{ return v_dotprod_expand(a, b); }
595 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
596{ return v_dotprod_expand(a, b, c); }
597
598 #define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
599 OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \
600 OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \
601 OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \
602 inline _Tpvec operator ~ (const _Tpvec& a) \
603 { \
604 return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \
605 }
606
607OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint8x16, v16u8, u8)
608OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int8x16, v16i8, s8)
609OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint16x8, v8u16, u16)
610OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int16x8, v8i16, s16)
611OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint32x4, v4u32, u32)
612OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int32x4, v4i32, s32)
613OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64)
614OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64)
615
616 #define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \
617 inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
618 { \
619 return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \
620 } \
621 inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
622 { \
623 a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \
624 return a; \
625 }
626
627OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32)
628OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32)
629OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32)
630
631 inline v_float32x4 operator ~ (const v_float32x4& a)
632{
633 return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
634}
635
636 /* v_abs */
637 #define OPENCV_HAL_IMPL_MSA_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
638 inline _Tpuvec v_abs(const _Tpsvec& a) \
639 { \
640 return v_reinterpret_as_##usuffix(_Tpsvec(msa_absq_##ssuffix(a.val))); \
641 }
642
643OPENCV_HAL_IMPL_MSA_ABS(v_uint8x16, v_int8x16, u8, s8)
644OPENCV_HAL_IMPL_MSA_ABS(v_uint16x8, v_int16x8, u16, s16)
645OPENCV_HAL_IMPL_MSA_ABS(v_uint32x4, v_int32x4, u32, s32)
646
647 /* v_abs(float), v_sqrt, v_invsqrt */
648 #define OPENCV_HAL_IMPL_MSA_BASIC_FUNC(_Tpvec, func, intrin) \
649 inline _Tpvec func(const _Tpvec& a) \
650 { \
651 return _Tpvec(intrin(a.val)); \
652 }
653
654OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_abs, msa_absq_f32)
655OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_abs, msa_absq_f64)
656OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_sqrt, msa_sqrtq_f32)
657OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_invsqrt, msa_rsqrtq_f32)
658OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64)
659OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64)
660
661 #define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \
662 inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
663 { \
664 return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \
665 } \
666 inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
667 { \
668 a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \
669 return a; \
670 }
671
672OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64)
673OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64)
674OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64)
675
676 inline v_float64x2 operator ~ (const v_float64x2& a)
677{
678 return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
679}
680
681 // TODO: exp, log, sin, cos
682
683 #define OPENCV_HAL_IMPL_MSA_BIN_FUNC(_Tpvec, func, intrin) \
684 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
685 { \
686 return _Tpvec(intrin(a.val, b.val)); \
687 }
688
689OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_min, msa_minq_u8)
690OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_max, msa_maxq_u8)
691OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_min, msa_minq_s8)
692OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_max, msa_maxq_s8)
693OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_min, msa_minq_u16)
694OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_max, msa_maxq_u16)
695OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_min, msa_minq_s16)
696OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_max, msa_maxq_s16)
697OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_min, msa_minq_u32)
698OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_max, msa_maxq_u32)
699OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_min, msa_minq_s32)
700OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_max, msa_maxq_s32)
701OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_min, msa_minq_f32)
702OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_max, msa_maxq_f32)
703OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64)
704OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64)
705
706 #define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \
707 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
708 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \
709 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
710 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \
711 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
712 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \
713 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
714 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \
715 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
716 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \
717 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
718 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); }
719
720OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8)
721OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int8x16, v16i8, s8, u8)
722OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint16x8, v8u16, u16, u16)
723OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int16x8, v8i16, s16, u16)
724OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint32x4, v4u32, u32, u32)
725OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int32x4, v4i32, s32, u32)
726OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float32x4, v4f32, f32, u32)
727OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint64x2, v2u64, u64, u64)
728OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int64x2, v2i64, s64, u64)
729OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float64x2, v2f64, f64, u64)
730
731 inline v_float32x4 v_not_nan(const v_float32x4& a)
732{ return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ceqq_f32(a.val, a.val))); }
733 inline v_float64x2 v_not_nan(const v_float64x2& a)
734{ return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ceqq_f64(a.val, a.val))); }
735
736OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_add_wrap, msa_addq_u8)
737OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_add_wrap, msa_addq_s8)
738OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_add_wrap, msa_addq_u16)
739OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_add_wrap, msa_addq_s16)
740OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_sub_wrap, msa_subq_u8)
741OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_sub_wrap, msa_subq_s8)
742OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_sub_wrap, msa_subq_u16)
743OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_sub_wrap, msa_subq_s16)
744OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_mul_wrap, msa_mulq_u8)
745OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_mul_wrap, msa_mulq_s8)
746OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_mul_wrap, msa_mulq_u16)
747OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_mul_wrap, msa_mulq_s16)
748
749OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_absdiff, msa_abdq_u8)
750OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_absdiff, msa_abdq_u16)
751OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_absdiff, msa_abdq_u32)
752OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_absdiff, msa_abdq_f32)
753OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_absdiff, msa_abdq_f64)
754
755
756OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_absdiffs, msa_qabdq_s8)
757OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_absdiffs, msa_qabdq_s16)
758
759 #define OPENCV_HAL_IMPL_MSA_BIN_FUNC2(_Tpvec, _Tpvec2, _Tpv, func, intrin) \
760 inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
761 { \
762 return _Tpvec2(MSA_TPV_REINTERPRET(_Tpv, intrin(a.val, b.val))); \
763 }
764
765OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int8x16, v_uint8x16, v16u8, v_absdiff, msa_abdq_s8)
766OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int16x8, v_uint16x8, v8u16, v_absdiff, msa_abdq_s16)
767OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int32x4, v_uint32x4, v4u32, v_absdiff, msa_abdq_s32)
768
769 /* v_magnitude, v_sqr_magnitude, v_fma, v_muladd */
770 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
771{
772 v_float32x4 x(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
773 return v_sqrt(x);
774}
775
776 inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
777{
778 return v_float32x4(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
779}
780
781 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
782{
783 return v_float32x4(msa_mlaq_f32(c.val, a.val, b.val));
784}
785
786 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
787{
788 return v_int32x4(msa_mlaq_s32(c.val, a.val, b.val));
789}
790
791 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
792{
793 return v_fma(a, b, c);
794}
795
796 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
797{
798 return v_fma(a, b, c);
799}
800
801 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
802{
803 v_float64x2 x(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
804 return v_sqrt(x);
805}
806
807 inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
808{
809 return v_float64x2(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
810}
811
812 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
813{
814 return v_float64x2(msa_mlaq_f64(c.val, a.val, b.val));
815}
816
817 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
818{
819 return v_fma(a, b, c);
820}
821
822 // trade efficiency for convenience
823 #define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
824 inline _Tpvec operator << (const _Tpvec& a, int n) \
825 { return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
826 inline _Tpvec operator >> (const _Tpvec& a, int n) \
827 { return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
828 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
829 { return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \
830 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
831 { return _Tpvec(msa_shrq_n_##suffix(a.val, n)); } \
832 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
833 { return _Tpvec(msa_rshrq_n_##suffix(a.val, n)); }
834
835OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint8x16, u8, schar, s8)
836OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int8x16, s8, schar, s8)
837OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint16x8, u16, short, s16)
838OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int16x8, s16, short, s16)
839OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint32x4, u32, int, s32)
840OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int32x4, s32, int, s32)
841OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint64x2, u64, int64, s64)
842OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int64x2, s64, int64, s64)
843
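A sketch of the shift forms above (illustrative only): the operators take a run-time count, while v_shl/v_shr/v_rshr take the count as a template parameter and map to the immediate-form intrinsics.

    v_uint16x8 x = v_setall_u16(0x00F0);
    v_uint16x8 a = x << 4;        // run-time shift: 0x0F00 in every lane
    v_uint16x8 b = v_shr<4>(x);   // compile-time shift: 0x000F
    v_uint16x8 c = v_rshr<4>(x);  // rounding shift right: (0xF0 + 8) >> 4 = 0x000F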
844 /* v_rotate_right, v_rotate_left */
845 #define OPENCV_HAL_IMPL_MSA_ROTATE_OP(_Tpvec, _Tpv, _Tpvs, suffix) \
846 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
847 { \
848 return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##suffix(0), n))); \
849 } \
850 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
851 { \
852 return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(msa_dupq_n_##suffix(0), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
853 } \
854 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
855 { \
856 return a; \
857 } \
858 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
859 { \
860 return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), n))); \
861 } \
862 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
863 { \
864 return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, b.val), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
865 } \
866 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
867 { \
868 CV_UNUSED(b); \
869 return a; \
870 }
871
872OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint8x16, v16u8, v16i8, s8)
873OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int8x16, v16i8, v16i8, s8)
874OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint16x8, v8u16, v8i16, s16)
875OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int16x8, v8i16, v8i16, s16)
876OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint32x4, v4u32, v4i32, s32)
877OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int32x4, v4i32, v4i32, s32)
878OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float32x4, v4f32, v4i32, s32)
879OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint64x2, v2u64, v2i64, s64)
880OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int64x2, v2i64, v2i64, s64)
881OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float64x2, v2f64, v2i64, s64)
882
883 #define OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
884 inline _Tpvec v_load(const _Tp* ptr) \
885 { return _Tpvec(msa_ld1q_##suffix(ptr)); } \
886 inline _Tpvec v_load_aligned(const _Tp* ptr) \
887 { return _Tpvec(msa_ld1q_##suffix(ptr)); } \
888 inline _Tpvec v_load_low(const _Tp* ptr) \
889 { return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr), msa_dup_n_##suffix((_Tp)0))); } \
890 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
891 { return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr0), msa_ld1_##suffix(ptr1))); } \
892 inline void v_store(_Tp* ptr, const _Tpvec& a) \
893 { msa_st1q_##suffix(ptr, a.val); } \
894 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
895 { msa_st1q_##suffix(ptr, a.val); } \
896 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
897 { msa_st1q_##suffix(ptr, a.val); } \
898 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/ ) \
899 { msa_st1q_##suffix(ptr, a.val); } \
900 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
901 { \
902 int n = _Tpvec::nlanes; \
903 for( int i = 0; i < (n/2); i++ ) \
904 ptr[i] = a.val[i]; \
905 } \
906 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
907 { \
908 int n = _Tpvec::nlanes; \
909 for( int i = 0; i < (n/2); i++ ) \
910 ptr[i] = a.val[i+(n/2)]; \
911 }
912
913OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint8x16, uchar, u8)
914OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int8x16, schar, s8)
915OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint16x8, ushort, u16)
916OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int16x8, short, s16)
917OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint32x4, unsigned, u32)
918OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int32x4, int, s32)
919OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint64x2, uint64, u64)
920OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int64x2, int64, s64)
921OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float32x4, float, f32)
922OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float64x2, double, f64)
923
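A sketch of the load/store interface generated above (illustrative only); note that in this implementation the aligned and unaligned variants map to the same MSA load/store.

    float src[4] = {1.f, 2.f, 3.f, 4.f}, dst[4];
    v_float32x4 v = v_load(src);   // 16-byte vector load
    v_store(dst, v);               // store all four lanes
    v_store_low(dst, v);           // store only lanes 0..1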
924
925
926 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
927{
928 v_uint8x16 c = v_uint8x16((v16u8)__builtin_msa_vshf_b((v16i8)((v2i64){0x08090A0B0C0D0E0F, 0x0001020304050607}), msa_dupq_n_s8(0), (v16i8)a.val));
929 return c;
930}
931
932 inline v_int8x16 v_reverse(const v_int8x16 &a)
933{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
934
935 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
936{
937 v_uint16x8 c = v_uint16x8((v8u16)__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000500060007, 0x0000000100020003}), msa_dupq_n_s16(0), (v8i16)a.val));
938 return c;
939}
940
941 inline v_int16x8 v_reverse(const v_int16x8 &a)
942{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
943
944 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
945{
946 v_uint32x4 c;
947 c.val[0] = a.val[3];
948 c.val[1] = a.val[2];
949 c.val[2] = a.val[1];
950 c.val[3] = a.val[0];
951 return c;
952}
953
954 inline v_int32x4 v_reverse(const v_int32x4 &a)
955{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
956
957 inline v_float32x4 v_reverse(const v_float32x4 &a)
958{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
959
960 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
961{
962 v_uint64x2 c;
963 c.val[0] = a.val[1];
964 c.val[1] = a.val[0];
965 return c;
966}
967
968 inline v_int64x2 v_reverse(const v_int64x2 &a)
969{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
970
971 inline v_float64x2 v_reverse(const v_float64x2 &a)
972{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
973
974
975 #define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(func, cfunc) \
976 inline unsigned short v_reduce_##func(const v_uint16x8& a) \
977 { \
978 v8u16 a_lo, a_hi; \
979 ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); \
980 v4u32 b = msa_##func##q_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(a_hi)); \
981 v4u32 b_lo, b_hi; \
982 ILVRL_W2_UW(b, msa_dupq_n_u32(0), b_lo, b_hi); \
983 v2u64 c = msa_##func##q_u64(msa_paddlq_u32(b_lo), msa_paddlq_u32(b_hi)); \
984 return (unsigned short)cfunc(c[0], c[1]); \
985 }
986
987OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(max, std::max)
988OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(min, std::min)
989
990 #define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(func, cfunc) \
991 inline short v_reduce_##func(const v_int16x8& a) \
992 { \
993 v8i16 a_lo, a_hi; \
994 ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); \
995 v4i32 b = msa_##func##q_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(a_hi)); \
996 v4i32 b_lo, b_hi; \
997 ILVRL_W2_SW(b, msa_dupq_n_s32(0), b_lo, b_hi); \
998 v2i64 c = msa_##func##q_s64(msa_paddlq_s32(b_lo), msa_paddlq_s32(b_hi)); \
999 return (short)cfunc(c[0], c[1]); \
1000 }
1001
1002OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(max, std::max)
1003OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(min, std::min)
1004
1005 #define OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(_Tpvec, scalartype, func, cfunc) \
1006 inline scalartype v_reduce_##func(const _Tpvec& a) \
1007 { \
1008 return (scalartype)cfunc(cfunc(a.val[0], a.val[1]), cfunc(a.val[2], a.val[3])); \
1009 }
1010
1011OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
1012OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
1013OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, max, std::max)
1014OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, min, std::min)
1015OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, max, std::max)
1016OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, min, std::min)
1017
1018
1019 #define OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(_Tpvec, scalartype, _Tpvec2, func) \
1020 inline scalartype v_reduce_##func(const _Tpvec& a) \
1021 { \
1022 _Tpvec2 a1, a2; \
1023 v_expand(a, a1, a2); \
1024 return (scalartype)v_reduce_##func(v_##func(a1, a2)); \
1025 }
1026
1027OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_uint8x16, uchar, v_uint16x8, min)
1028OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_uint8x16, uchar, v_uint16x8, max)
1029OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_int8x16, char, v_int16x8, min)
1030OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_int8x16, char, v_int16x8, max)
1031
1032
1033
1034 #define OPENCV_HAL_IMPL_MSA_REDUCE_SUM(_Tpvec, scalartype, suffix) \
1035 inline scalartype v_reduce_sum(const _Tpvec& a) \
1036 { \
1037 return (scalartype)msa_sum_##suffix(a.val); \
1038 }
1039
1040OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint8x16, unsigned char, u8)
1041OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int8x16, char, s8)
1042OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint16x8, unsigned short, u16)
1043OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int16x8, short, s16)
1044OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint32x4, unsigned, u32)
1045OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int32x4, int, s32)
1046OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_float32x4, float, f32)
1047
1048 inline uint64 v_reduce_sum(const v_uint64x2& a)
1049{ return (uint64)(msa_getq_lane_u64(a.val, 0) + msa_getq_lane_u64(a.val, 1)); }
1050 inline int64 v_reduce_sum(const v_int64x2& a)
1051{ return (int64)(msa_getq_lane_s64(a.val, 0) + msa_getq_lane_s64(a.val, 1)); }
1052 inline double v_reduce_sum(const v_float64x2& a)
1053{
1054 return msa_getq_lane_f64(a.val, 0) + msa_getq_lane_f64(a.val, 1);
1055}
1056
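A sketch of the horizontal reductions above (illustrative only):

    v_float32x4 v(1.f, 2.f, 3.f, 4.f);
    float s = v_reduce_sum(v);   // 1+2+3+4 = 10
    float m = v_reduce_max(v);   // 4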
1057 /* v_reduce_sum4, v_reduce_sad */
1058 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1059 const v_float32x4& c, const v_float32x4& d)
1060{
1061 v4f32 u0 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val))),
1062 MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val)))); // a0+a1 b0+b1 a2+a3 b2+b3
1063 v4f32 u1 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val))),
1064 MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val)))); // c0+c1 d0+d1 c2+c3 d2+d3
1065
1066 return v_float32x4(msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))),
1067 MSA_TPV_REINTERPRET(v4f32, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0)))));
1068}
1069
1070 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1071{
1072 v16u8 t0 = msa_abdq_u8(a.val, b.val);
1073 v8u16 t1 = msa_paddlq_u8(t0);
1074 v4u32 t2 = msa_paddlq_u16(t1);
1075 return msa_sum_u32(t2);
1076}
1077 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1078{
1079 v16u8 t0 = MSA_TPV_REINTERPRET(v16u8, msa_abdq_s8(a.val, b.val));
1080 v8u16 t1 = msa_paddlq_u8(t0);
1081 v4u32 t2 = msa_paddlq_u16(t1);
1082 return msa_sum_u32(t2);
1083}
1084 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1085{
1086 v8u16 t0 = msa_abdq_u16(a.val, b.val);
1087 v4u32 t1 = msa_paddlq_u16(t0);
1088 return msa_sum_u32(t1);
1089}
1090 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1091{
1092 v8u16 t0 = MSA_TPV_REINTERPRET(v8u16, msa_abdq_s16(a.val, b.val));
1093 v4u32 t1 = msa_paddlq_u16(t0);
1094 return msa_sum_u32(t1);
1095}
1096 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1097{
1098 v4u32 t0 = msa_abdq_u32(a.val, b.val);
1099 return msa_sum_u32(t0);
1100}
1101 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1102{
1103 v4u32 t0 = MSA_TPV_REINTERPRET(v4u32, msa_abdq_s32(a.val, b.val));
1104 return msa_sum_u32(t0);
1105}
1106 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1107{
1108 v4f32 t0 = msa_abdq_f32(a.val, b.val);
1109 return msa_sum_f32(t0);
1110}
1111
1112 /* v_popcount */
1113 #define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(_Tpvec) \
1114 inline v_uint8x16 v_popcount(const _Tpvec& a) \
1115 { \
1116 v16u8 t = MSA_TPV_REINTERPRET(v16u8, msa_cntq_s8(MSA_TPV_REINTERPRET(v16i8, a.val))); \
1117 return v_uint8x16(t); \
1118 }
1119OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_uint8x16)
1120OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_int8x16)
1121
1122 #define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(_Tpvec) \
1123 inline v_uint16x8 v_popcount(const _Tpvec& a) \
1124 { \
1125 v8u16 t = MSA_TPV_REINTERPRET(v8u16, msa_cntq_s16(MSA_TPV_REINTERPRET(v8i16, a.val))); \
1126 return v_uint16x8(t); \
1127 }
1128OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_uint16x8)
1129OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_int16x8)
1130
1131 #define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(_Tpvec) \
1132 inline v_uint32x4 v_popcount(const _Tpvec& a) \
1133 { \
1134 v4u32 t = MSA_TPV_REINTERPRET(v4u32, msa_cntq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))); \
1135 return v_uint32x4(t); \
1136 }
1137OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_uint32x4)
1138OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_int32x4)
1139
1140 #define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(_Tpvec) \
1141 inline v_uint64x2 v_popcount(const _Tpvec& a) \
1142 { \
1143 v2u64 t = MSA_TPV_REINTERPRET(v2u64, msa_cntq_s64(MSA_TPV_REINTERPRET(v2i64, a.val))); \
1144 return v_uint64x2(t); \
1145 }
1146OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_uint64x2)
1147OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_int64x2)
1148
1149 inline int v_signmask(const v_uint8x16& a)
1150{
1151 v8i8 m0 = msa_create_s8(CV_BIG_UINT(0x0706050403020100));
1152 v16u8 v0 = msa_shlq_u8(msa_shrq_n_u8(a.val, 7), msa_combine_s8(m0, m0));
1153 v8u16 v1 = msa_paddlq_u8(v0);
1154 v4u32 v2 = msa_paddlq_u16(v1);
1155 v2u64 v3 = msa_paddlq_u32(v2);
1156 return (int)msa_getq_lane_u64(v3, 0) + ((int)msa_getq_lane_u64(v3, 1) << 8);
1157}
1158 inline int v_signmask(const v_int8x16& a)
1159{ return v_signmask(v_reinterpret_as_u8(a)); }
1160
1161 inline int v_signmask(const v_uint16x8& a)
1162{
1163 v4i16 m0 = msa_create_s16(CV_BIG_UINT(0x0003000200010000));
1164 v8u16 v0 = msa_shlq_u16(msa_shrq_n_u16(a.val, 15), msa_combine_s16(m0, m0));
1165 v4u32 v1 = msa_paddlq_u16(v0);
1166 v2u64 v2 = msa_paddlq_u32(v1);
1167 return (int)msa_getq_lane_u64(v2, 0) + ((int)msa_getq_lane_u64(v2, 1) << 4);
1168}
1169 inline int v_signmask(const v_int16x8& a)
1170{ return v_signmask(v_reinterpret_as_u16(a)); }
1171
1172 inline int v_signmask(const v_uint32x4& a)
1173{
1174 v2i32 m0 = msa_create_s32(CV_BIG_UINT(0x0000000100000000));
1175 v4u32 v0 = msa_shlq_u32(msa_shrq_n_u32(a.val, 31), msa_combine_s32(m0, m0));
1176 v2u64 v1 = msa_paddlq_u32(v0);
1177 return (int)msa_getq_lane_u64(v1, 0) + ((int)msa_getq_lane_u64(v1, 1) << 2);
1178}
1179 inline int v_signmask(const v_int32x4& a)
1180{ return v_signmask(v_reinterpret_as_u32(a)); }
1181 inline int v_signmask(const v_float32x4& a)
1182{ return v_signmask(v_reinterpret_as_u32(a)); }
1183
1184 inline int v_signmask(const v_uint64x2& a)
1185{
1186 v2u64 v0 = msa_shrq_n_u64(a.val, 63);
1187 return (int)msa_getq_lane_u64(v0, 0) + ((int)msa_getq_lane_u64(v0, 1) << 1);
1188}
1189 inline int v_signmask(const v_int64x2& a)
1190{ return v_signmask(v_reinterpret_as_u64(a)); }
1191 inline int v_signmask(const v_float64x2& a)
1192{ return v_signmask(v_reinterpret_as_u64(a)); }
1193
1194 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
1195 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
1196 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
1197 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
1198 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
1199 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
1200 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
1201 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
1202 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
1203 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
1204
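A sketch of the mask queries above (illustrative only): v_signmask packs the most significant bit of each lane into an integer, and v_scan_forward returns the index of the first set lane.

    v_int32x4 x(-1, 2, -3, 4);
    int mask  = v_signmask(x);      // lanes 0 and 2 are negative: 0b0101 = 5
    int first = v_scan_forward(x);  // 0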
1205 #define OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(_Tpvec, _Tpvec2, suffix, shift) \
1206 inline bool v_check_all(const v_##_Tpvec& a) \
1207 { \
1208 _Tpvec2 v0 = msa_shrq_n_##suffix(msa_mvnq_##suffix(a.val), shift); \
1209 v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \
1210 return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) == 0; \
1211 } \
1212 inline bool v_check_any(const v_##_Tpvec& a) \
1213 { \
1214 _Tpvec2 v0 = msa_shrq_n_##suffix(a.val, shift); \
1215 v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \
1216 return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) != 0; \
1217 }
1218
1219OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint8x16, v16u8, u8, 7)
1220OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint16x8, v8u16, u16, 15)
1221OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint32x4, v4u32, u32, 31)
1222OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint64x2, v2u64, u64, 63)
1223
1224 inline bool v_check_all(const v_int8x16& a)
1225{ return v_check_all(v_reinterpret_as_u8(a)); }
1226 inline bool v_check_all(const v_int16x8& a)
1227{ return v_check_all(v_reinterpret_as_u16(a)); }
1228 inline bool v_check_all(const v_int32x4& a)
1229{ return v_check_all(v_reinterpret_as_u32(a)); }
1230 inline bool v_check_all(const v_float32x4& a)
1231{ return v_check_all(v_reinterpret_as_u32(a)); }
1232
1233 inline bool v_check_any(const v_int8x16& a)
1234{ return v_check_any(v_reinterpret_as_u8(a)); }
1235 inline bool v_check_any(const v_int16x8& a)
1236{ return v_check_any(v_reinterpret_as_u16(a)); }
1237 inline bool v_check_any(const v_int32x4& a)
1238{ return v_check_any(v_reinterpret_as_u32(a)); }
1239 inline bool v_check_any(const v_float32x4& a)
1240{ return v_check_any(v_reinterpret_as_u32(a)); }
1241
1242 inline bool v_check_all(const v_int64x2& a)
1243{ return v_check_all(v_reinterpret_as_u64(a)); }
1244 inline bool v_check_all(const v_float64x2& a)
1245{ return v_check_all(v_reinterpret_as_u64(a)); }
1246 inline bool v_check_any(const v_int64x2& a)
1247{ return v_check_any(v_reinterpret_as_u64(a)); }
1248 inline bool v_check_any(const v_float64x2& a)
1249{ return v_check_any(v_reinterpret_as_u64(a)); }
1250
1251 /* v_select */
1252 #define OPENCV_HAL_IMPL_MSA_SELECT(_Tpvec, _Tpv, _Tpvu) \
1253 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1254 { \
1255 return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_bslq_u8(MSA_TPV_REINTERPRET(_Tpvu, mask.val), \
1256 MSA_TPV_REINTERPRET(_Tpvu, b.val), MSA_TPV_REINTERPRET(_Tpvu, a.val)))); \
1257 }
1258
1259OPENCV_HAL_IMPL_MSA_SELECT(v_uint8x16, v16u8, v16u8)
1260OPENCV_HAL_IMPL_MSA_SELECT(v_int8x16, v16i8, v16u8)
1261OPENCV_HAL_IMPL_MSA_SELECT(v_uint16x8, v8u16, v16u8)
1262OPENCV_HAL_IMPL_MSA_SELECT(v_int16x8, v8i16, v16u8)
1263OPENCV_HAL_IMPL_MSA_SELECT(v_uint32x4, v4u32, v16u8)
1264OPENCV_HAL_IMPL_MSA_SELECT(v_int32x4, v4i32, v16u8)
1265OPENCV_HAL_IMPL_MSA_SELECT(v_float32x4, v4f32, v16u8)
1266OPENCV_HAL_IMPL_MSA_SELECT(v_float64x2, v2f64, v16u8)
1267
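A sketch of v_select (illustrative only): the comparison operators produce all-ones/all-zeros lane masks, and v_select picks from the first operand where the mask is set.

    v_float32x4 a(1.f, 2.f, 3.f, 4.f), b = v_setzero_f32();
    v_float32x4 m = a > v_setall_f32(2.5f);  // mask: {0, 0, ~0, ~0}
    v_float32x4 r = v_select(m, a, b);       // {0, 0, 3, 4}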
1268 #define OPENCV_HAL_IMPL_MSA_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix, ssuffix, _Tpv, _Tpvs) \
1269 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1270 { \
1271 _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
1272 _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
1273 b0.val = msa_paddlq_##suffix(a_lo); \
1274 b1.val = msa_paddlq_##suffix(a_hi); \
1275 } \
1276 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1277 { \
1278 _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
1279 return _Tpwvec(msa_paddlq_##suffix(a_lo)); \
1280 } \
1281 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1282 { \
1283 _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
1284 return _Tpwvec(msa_paddlq_##suffix(a_hi)); \
1285 } \
1286 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1287 { \
1288 return _Tpwvec(msa_movl_##suffix(msa_ld1_##suffix(ptr))); \
1289 }
1290
1291OPENCV_HAL_IMPL_MSA_EXPAND(v_uint8x16, v_uint16x8, uchar, u8, s8, v16u8, v16i8)
1292OPENCV_HAL_IMPL_MSA_EXPAND(v_int8x16, v_int16x8, schar, s8, s8, v16i8, v16i8)
1293OPENCV_HAL_IMPL_MSA_EXPAND(v_uint16x8, v_uint32x4, ushort, u16, s16, v8u16, v8i16)
1294OPENCV_HAL_IMPL_MSA_EXPAND(v_int16x8, v_int32x4, short, s16, s16, v8i16, v8i16)
1295OPENCV_HAL_IMPL_MSA_EXPAND(v_uint32x4, v_uint64x2, uint, u32, s32, v4u32, v4i32)
1296OPENCV_HAL_IMPL_MSA_EXPAND(v_int32x4, v_int64x2, int, s32, s32, v4i32, v4i32)
1297
1298 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
1299{
1300 return v_uint32x4((v4u32){ptr[0], ptr[1], ptr[2], ptr[3]});
1301}
1302
1303 inline v_int32x4 v_load_expand_q(const schar* ptr)
1304{
1305 return v_int32x4((v4i32){ptr[0], ptr[1], ptr[2], ptr[3]});
1306}
1307
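A sketch of the widening expands and loads above (illustrative only):

    uchar buf[16] = {0};
    v_uint8x16 u = v_load(buf);
    v_uint16x8 lo, hi;
    v_expand(u, lo, hi);                   // 16 x u8 -> two vectors of 8 x u16
    v_uint16x8 w = v_load_expand(buf);     // load 8 bytes, widen to u16
    v_uint32x4 q = v_load_expand_q(buf);   // load 4 bytes, widen to u32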
1308 /* v_zip, v_combine_low, v_combine_high, v_recombine */
1309 #define OPENCV_HAL_IMPL_MSA_UNPACKS(_Tpvec, _Tpv, _Tpvs, ssuffix) \
1310 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
1311 { \
1312 b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
1313 b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
1314 } \
1315 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1316 { \
1317 return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
1318 } \
1319 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1320 { \
1321 return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
1322 } \
1323 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1324 { \
1325 c.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
1326 d.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
1327 }
1328
1329OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint8x16, v16u8, v16i8, s8)
1330OPENCV_HAL_IMPL_MSA_UNPACKS(v_int8x16, v16i8, v16i8, s8)
1331OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint16x8, v8u16, v8i16, s16)
1332OPENCV_HAL_IMPL_MSA_UNPACKS(v_int16x8, v8i16, v8i16, s16)
1333OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint32x4, v4u32, v4i32, s32)
1334OPENCV_HAL_IMPL_MSA_UNPACKS(v_int32x4, v4i32, v4i32, s32)
1335OPENCV_HAL_IMPL_MSA_UNPACKS(v_float32x4, v4f32, v4i32, s32)
1336OPENCV_HAL_IMPL_MSA_UNPACKS(v_float64x2, v2f64, v2i64, s64)
1337
1338 /* v_extract */
1339 #define OPENCV_HAL_IMPL_MSA_EXTRACT(_Tpvec, _Tpv, _Tpvs, suffix) \
1340 template <int s> \
1341 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
1342 { \
1343 return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), s))); \
1344 }
1345
1346OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint8x16, v16u8, v16i8, s8)
1347OPENCV_HAL_IMPL_MSA_EXTRACT(v_int8x16, v16i8, v16i8, s8)
1348OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint16x8, v8u16, v8i16, s16)
1349OPENCV_HAL_IMPL_MSA_EXTRACT(v_int16x8, v8i16, v8i16, s16)
1350OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint32x4, v4u32, v4i32, s32)
1351OPENCV_HAL_IMPL_MSA_EXTRACT(v_int32x4, v4i32, v4i32, s32)
1352OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint64x2, v2u64, v2i64, s64)
1353OPENCV_HAL_IMPL_MSA_EXTRACT(v_int64x2, v2i64, v2i64, s64)
1354OPENCV_HAL_IMPL_MSA_EXTRACT(v_float32x4, v4f32, v4i32, s32)
1355OPENCV_HAL_IMPL_MSA_EXTRACT(v_float64x2, v2f64, v2i64, s64)
1356
1357 /* v_round, v_floor, v_ceil, v_trunc */
1358 inline v_int32x4 v_round(const v_float32x4& a)
1359{
1360 return v_int32x4(msa_cvttintq_s32_f32(a.val));
1361}
1362
1363 inline v_int32x4 v_floor(const v_float32x4& a)
1364{
1365 v4i32 a1 = msa_cvttintq_s32_f32(a.val);
1366 return v_int32x4(msa_addq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(msa_cvtfintq_f32_s32(a1), a.val))));
1367}
1368
1369 inline v_int32x4 v_ceil(const v_float32x4& a)
1370{
1371 v4i32 a1 = msa_cvttintq_s32_f32(a.val);
1372 return v_int32x4(msa_subq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(a.val, msa_cvtfintq_f32_s32(a1)))));
1373}
1374
1375 inline v_int32x4 v_trunc(const v_float32x4& a)
1376{
1377 return v_int32x4(msa_cvttruncq_s32_f32(a.val));
1378}
1379
1380 inline v_int32x4 v_round(const v_float64x2& a)
1381{
1382 return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_dupq_n_s64(0)));
1383}
1384
1385 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1386{
1387 return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_cvttintq_s64_f64(b.val)));
1388}
1389
1390 inline v_int32x4 v_floor(const v_float64x2& a)
1391{
1392 v2f64 a1 = msa_cvtrintq_f64(a.val);
1393 return v_int32x4(msa_pack_s64(msa_addq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a1, a.val))), msa_dupq_n_s64(0)));
1394}
1395
1396 inline v_int32x4 v_ceil(const v_float64x2& a)
1397{
1398 v2f64 a1 = msa_cvtrintq_f64(a.val);
1399 return v_int32x4(msa_pack_s64(msa_subq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a.val, a1))), msa_dupq_n_s64(0)));
1400}
1401
1402 inline v_int32x4 v_trunc(const v_float64x2& a)
1403{
1404 return v_int32x4(msa_pack_s64(msa_cvttruncq_s64_f64(a.val), msa_dupq_n_s64(0)));
1405}
1406
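The four float-to-int conversions above differ only in rounding direction. A short comparative sketch (illustrative; assumes opencv2/core/hal/intrin.hpp):

#include "opencv2/core/hal/intrin.hpp"

// Store the same four floats converted with each rounding mode.
inline void round_all_modes(const float* src, int* nearest, int* down, int* up, int* toward_zero)
{
    cv::v_float32x4 v = cv::v_load(src);
    cv::v_store(nearest,     cv::v_round(v)); // round to nearest
    cv::v_store(down,        cv::v_floor(v)); // round toward -infinity
    cv::v_store(up,          cv::v_ceil(v));  // round toward +infinity
    cv::v_store(toward_zero, cv::v_trunc(v)); // discard the fractional part
}
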
1407 #define OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(_Tpvec, _Tpv, _Tpvs, ssuffix) \
1408 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1409 const _Tpvec& a2, const _Tpvec& a3, \
1410 _Tpvec& b0, _Tpvec& b1, \
1411 _Tpvec& b2, _Tpvec& b3) \
1412 { \
1413 _Tpv t00 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
1414 _Tpv t01 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
1415 _Tpv t10 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
1416 _Tpv t11 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
1417 b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \
1418 b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \
1419 b2.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \
1420 b3.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \
1421 }
1422
1423OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_uint32x4, v4u32, v4i32, s32)
1424OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_int32x4, v4i32, v4i32, s32)
1425OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_float32x4, v4f32, v4i32, s32)
1426
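A typical use of the 4x4 transpose above is turning four row loads into four column registers, e.g. in small matrix kernels. Illustrative sketch (not part of the header):

#include "opencv2/core/hal/intrin.hpp"

// Transpose a row-major 4x4 float tile held in registers.
inline void transpose_tile(const float* src, float* dst)
{
    cv::v_float32x4 r0 = cv::v_load(src +  0), r1 = cv::v_load(src +  4);
    cv::v_float32x4 r2 = cv::v_load(src +  8), r3 = cv::v_load(src + 12);
    cv::v_float32x4 c0, c1, c2, c3;
    cv::v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);
    cv::v_store(dst +  0, c0); cv::v_store(dst +  4, c1);
    cv::v_store(dst +  8, c2); cv::v_store(dst + 12, c3);
}
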
1427 #define OPENCV_HAL_IMPL_MSA_INTERLEAVED(_Tpvec, _Tp, suffix) \
1428 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
1429 { \
1430 msa_ld2q_##suffix(ptr, &a.val, &b.val); \
1431 } \
1432 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
1433 { \
1434 msa_ld3q_##suffix(ptr, &a.val, &b.val, &c.val); \
1435 } \
1436 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
1437 v_##_Tpvec& c, v_##_Tpvec& d) \
1438 { \
1439 msa_ld4q_##suffix(ptr, &a.val, &b.val, &c.val, &d.val); \
1440 } \
1441 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1442 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
1443 { \
1444 msa_st2q_##suffix(ptr, a.val, b.val); \
1445 } \
1446 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1447 const v_##_Tpvec& c, hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
1448 { \
1449 msa_st3q_##suffix(ptr, a.val, b.val, c.val); \
1450 } \
1451 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1452 const v_##_Tpvec& c, const v_##_Tpvec& d, \
1453 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED ) \
1454 { \
1455 msa_st4q_##suffix(ptr, a.val, b.val, c.val, d.val); \
1456 }
1457
1458OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint8x16, uchar, u8)
1459OPENCV_HAL_IMPL_MSA_INTERLEAVED(int8x16, schar, s8)
1460OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint16x8, ushort, u16)
1461OPENCV_HAL_IMPL_MSA_INTERLEAVED(int16x8, short, s16)
1462OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint32x4, unsigned, u32)
1463OPENCV_HAL_IMPL_MSA_INTERLEAVED(int32x4, int, s32)
1464OPENCV_HAL_IMPL_MSA_INTERLEAVED(float32x4, float, f32)
1465OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint64x2, uint64, u64)
1466OPENCV_HAL_IMPL_MSA_INTERLEAVED(int64x2, int64, s64)
1467OPENCV_HAL_IMPL_MSA_INTERLEAVED(float64x2, double, f64)
1468
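The de/interleaving loads and stores above map directly onto channel-split processing of packed pixel data. A hedged sketch of swapping the first and third channel over full 16-pixel blocks (tail handling omitted; names are illustrative):

#include "opencv2/core/hal/intrin.hpp"

// Swap channel 0 and channel 2 of packed 3-channel 8-bit pixels,
// 16 pixels per iteration; the remaining tail would be handled with scalar code.
inline void swap_channels_0_2(const uchar* src, uchar* dst, int npixels)
{
    for (int i = 0; i <= npixels - 16; i += 16)
    {
        cv::v_uint8x16 c0, c1, c2;
        cv::v_load_deinterleave(src + 3 * i, c0, c1, c2);
        cv::v_store_interleave(dst + 3 * i, c2, c1, c0);
    }
}
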
1469 /* v_cvt_f32, v_cvt_f64, v_cvt_f64_high */
1470 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
1471{
1472 return v_float32x4(msa_cvtfintq_f32_s32(a.val));
1473}
1474
1475 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
1476{
1477 return v_float32x4(msa_cvtfq_f32_f64(a.val, msa_dupq_n_f64(0.0f)));
1478}
1479
1480 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
1481{
1482 return v_float32x4(msa_cvtfq_f32_f64(a.val, b.val));
1483}
1484
1485 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
1486{
1487 return v_float64x2(msa_cvtflq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
1488}
1489
1490 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
1491{
1492 return v_float64x2(msa_cvtfhq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
1493}
1494
1495 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
1496{
1497 return v_float64x2(msa_cvtflq_f64_f32(a.val));
1498}
1499
1500 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
1501{
1502 return v_float64x2(msa_cvtfhq_f64_f32(a.val));
1503}
1504
1505 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
1506{
1507 return v_float64x2(msa_cvtfintq_f64_s64(a.val));
1508}
1509
1511 inline v_int8x16 v_lut(const schar* tab, const int* idx)
1512{
1513 schar CV_DECL_ALIGNED(32) elems[16] =
1514 {
1515 tab[idx[ 0]],
1516 tab[idx[ 1]],
1517 tab[idx[ 2]],
1518 tab[idx[ 3]],
1519 tab[idx[ 4]],
1520 tab[idx[ 5]],
1521 tab[idx[ 6]],
1522 tab[idx[ 7]],
1523 tab[idx[ 8]],
1524 tab[idx[ 9]],
1525 tab[idx[10]],
1526 tab[idx[11]],
1527 tab[idx[12]],
1528 tab[idx[13]],
1529 tab[idx[14]],
1530 tab[idx[15]]
1531 };
1532 return v_int8x16(msa_ld1q_s8(elems));
1533}
1534 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
1535{
1536 schar CV_DECL_ALIGNED(32) elems[16] =
1537 {
1538 tab[idx[0]],
1539 tab[idx[0] + 1],
1540 tab[idx[1]],
1541 tab[idx[1] + 1],
1542 tab[idx[2]],
1543 tab[idx[2] + 1],
1544 tab[idx[3]],
1545 tab[idx[3] + 1],
1546 tab[idx[4]],
1547 tab[idx[4] + 1],
1548 tab[idx[5]],
1549 tab[idx[5] + 1],
1550 tab[idx[6]],
1551 tab[idx[6] + 1],
1552 tab[idx[7]],
1553 tab[idx[7] + 1]
1554 };
1555 return v_int8x16(msa_ld1q_s8(elems));
1556}
1557 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
1558{
1559 schar CV_DECL_ALIGNED(32) elems[16] =
1560 {
1561 tab[idx[0]],
1562 tab[idx[0] + 1],
1563 tab[idx[0] + 2],
1564 tab[idx[0] + 3],
1565 tab[idx[1]],
1566 tab[idx[1] + 1],
1567 tab[idx[1] + 2],
1568 tab[idx[1] + 3],
1569 tab[idx[2]],
1570 tab[idx[2] + 1],
1571 tab[idx[2] + 2],
1572 tab[idx[2] + 3],
1573 tab[idx[3]],
1574 tab[idx[3] + 1],
1575 tab[idx[3] + 2],
1576 tab[idx[3] + 3]
1577 };
1578 return v_int8x16(msa_ld1q_s8(elems));
1579}
1580 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
1581 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
1582 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
1583
1584
1585 inline v_int16x8 v_lut(const short* tab, const int* idx)
1586{
1587 short CV_DECL_ALIGNED(32) elems[8] =
1588 {
1589 tab[idx[0]],
1590 tab[idx[1]],
1591 tab[idx[2]],
1592 tab[idx[3]],
1593 tab[idx[4]],
1594 tab[idx[5]],
1595 tab[idx[6]],
1596 tab[idx[7]]
1597 };
1598 return v_int16x8(msa_ld1q_s16(elems));
1599}
1600 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
1601{
1602 short CV_DECL_ALIGNED(32) elems[8] =
1603 {
1604 tab[idx[0]],
1605 tab[idx[0] + 1],
1606 tab[idx[1]],
1607 tab[idx[1] + 1],
1608 tab[idx[2]],
1609 tab[idx[2] + 1],
1610 tab[idx[3]],
1611 tab[idx[3] + 1]
1612 };
1613 return v_int16x8(msa_ld1q_s16(elems));
1614}
1615 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
1616{
1617 return v_int16x8(msa_combine_s16(msa_ld1_s16(tab + idx[0]), msa_ld1_s16(tab + idx[1])));
1618}
1619 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
1620 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
1621 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
1622
1623 inline v_int32x4 v_lut(const int* tab, const int* idx)
1624{
1625 int CV_DECL_ALIGNED(32) elems[4] =
1626 {
1627 tab[idx[0]],
1628 tab[idx[1]],
1629 tab[idx[2]],
1630 tab[idx[3]]
1631 };
1632 return v_int32x4(msa_ld1q_s32(elems));
1633}
1634 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
1635{
1636 return v_int32x4(msa_combine_s32(msa_ld1_s32(tab + idx[0]), msa_ld1_s32(tab + idx[1])));
1637}
1638 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
1639{
1640 return v_int32x4(msa_ld1q_s32(tab + idx[0]));
1641}
1642 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
1643 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
1644 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
1645
1646 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1647{
1648 return v_int64x2(msa_combine_s64(msa_create_s64(tab[idx[0]]), msa_create_s64(tab[idx[1]])));
1649}
1650 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
1651{
1652 return v_int64x2(msa_ld1q_s64(tab + idx[0]));
1653}
1654 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
1655 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
1656
1657 inline v_float32x4 v_lut(const float* tab, const int* idx)
1658{
1659 float CV_DECL_ALIGNED(32) elems[4] =
1660 {
1661 tab[idx[0]],
1662 tab[idx[1]],
1663 tab[idx[2]],
1664 tab[idx[3]]
1665 };
1666 return v_float32x4(msa_ld1q_f32(elems));
1667}
1668 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
1669{
1670 uint64 CV_DECL_ALIGNED(32) elems[2] =
1671 {
1672 *(uint64*)(tab + idx[0]),
1673 *(uint64*)(tab + idx[1])
1674 };
1675 return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ld1q_u64(elems)));
1676}
1677 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
1678{
1679 return v_float32x4(msa_ld1q_f32(tab + idx[0]));
1680}
1681
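The v_lut family above is a software gather: a scalar index array selects single elements, pairs, or quads from a table. Minimal illustrative sketch (the wrapper name is hypothetical):

#include "opencv2/core/hal/intrin.hpp"

// Gather four table entries through 32-bit indices:
// result = {tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}.
inline cv::v_float32x4 gather4(const float* tab, const int* idx)
{
    return cv::v_lut(tab, idx);
}
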
1682 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1683{
1684 int CV_DECL_ALIGNED(32) idx[4];
1685 v_store_aligned(idx, idxvec);
1686
1687 return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1688}
1689
1690inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1691{
1692 unsigned CV_DECL_ALIGNED(32) elems[4] =
1693 {
1694 tab[msa_getq_lane_s32(idxvec.val, 0)],
1695 tab[msa_getq_lane_s32(idxvec.val, 1)],
1696 tab[msa_getq_lane_s32(idxvec.val, 2)],
1697 tab[msa_getq_lane_s32(idxvec.val, 3)]
1698 };
1699 return v_uint32x4(msa_ld1q_u32(elems));
1700}
1701
1702 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1703{
1704 int CV_DECL_ALIGNED(32) idx[4];
1705 v_store_aligned(idx, idxvec);
1706
1707 return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1708}
1709
1710inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1711{
1712 int CV_DECL_ALIGNED(32) idx[4];
1713 v_store_aligned(idx, idxvec);
1714
1715 v4f32 xy02 = msa_combine_f32(msa_ld1_f32(tab + idx[0]), msa_ld1_f32(tab + idx[2]));
1716 v4f32 xy13 = msa_combine_f32(msa_ld1_f32(tab + idx[1]), msa_ld1_f32(tab + idx[3]));
1717 x = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02))));
1718 y = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02))));
1719}
1720
1721inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
1722{
1723 v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0705060403010200, 0x0F0D0E0C0B090A08}), msa_dupq_n_s8(0), vec.val));
1724 return c;
1725}
1726 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
1727{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1728 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
1729{
1730 v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0703060205010400, 0x0F0B0E0A0D090C08}), msa_dupq_n_s8(0), vec.val));
1731 return c;
1732}
1733 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1734
1735 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
1736{
1737 v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0003000100020000, 0x0007000500060004}), msa_dupq_n_s16(0), vec.val));
1738 return c;
1739}
1740
1741 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1742
1743 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
1744{
1745 v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0005000100040000, 0x0007000300060002}), msa_dupq_n_s16(0), vec.val));
1746 return c;
1747}
1748
1749 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1750
1751 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
1752{
1753 v_int32x4 c;
1754 c.val[0] = vec.val[0];
1755 c.val[1] = vec.val[2];
1756 c.val[2] = vec.val[1];
1757 c.val[3] = vec.val[3];
1758 return c;
1759}
1760
1761 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1762 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1763
1764 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
1765{
1766 v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0908060504020100, 0x131211100E0D0C0A}), msa_dupq_n_s8(0), vec.val));
1767 return c;
1768}
1769
1770 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1771
1772 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
1773{
1774 v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000200010000, 0x0009000800060005}), msa_dupq_n_s16(0), vec.val));
1775 return c;
1776}
1777
1778 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1779 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
1780 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
1781 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
1782
1783 inline v_float64x2 v_lut(const double* tab, const int* idx)
1784{
1785 double CV_DECL_ALIGNED(32) elems[2] =
1786 {
1787 tab[idx[0]],
1788 tab[idx[1]]
1789 };
1790 return v_float64x2(msa_ld1q_f64(elems));
1791}
1792
1793 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
1794{
1795 return v_float64x2(msa_ld1q_f64(tab + idx[0]));
1796}
1797
1798 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1799{
1800 int CV_DECL_ALIGNED(32) idx[4];
1801 v_store_aligned(idx, idxvec);
1802
1803 return v_float64x2(tab[idx[0]], tab[idx[1]]);
1804}
1805
1806inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1807{
1808 int CV_DECL_ALIGNED(32) idx[4];
1809 v_store_aligned(idx, idxvec);
1810
1811 v2f64 xy0 = msa_ld1q_f64(tab + idx[0]);
1812 v2f64 xy1 = msa_ld1q_f64(tab + idx[1]);
1813 x = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvevq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
1814 y = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
1815}
1816
1817template<int i, typename _Tp>
1818inline typename _Tp::lane_type v_extract_n(const _Tp& a)
1819{
1820 return v_rotate_right<i>(a).get0();
1821}
1822
1823 template<int i>
1824 inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
1825{
1826 return v_setall_u32(v_extract_n<i>(a));
1827}
1828 template<int i>
1829 inline v_int32x4 v_broadcast_element(const v_int32x4& a)
1830{
1831 return v_setall_s32(v_extract_n<i>(a));
1832}
1833 template<int i>
1834 inline v_float32x4 v_broadcast_element(const v_float32x4& a)
1835{
1836 return v_setall_f32(v_extract_n<i>(a));
1837}
1838
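v_extract_n<i> reads a single lane as a scalar and v_broadcast_element<i> splats that lane across the whole register. A short sketch (illustrative only):

#include "opencv2/core/hal/intrin.hpp"

// Read lane 1 as a scalar, and also broadcast it to all four lanes.
inline void lane_demo(const float* src, float* dst, float* one_lane)
{
    cv::v_float32x4 v = cv::v_load(src);
    *one_lane = cv::v_extract_n<1>(v);                // scalar value of lane 1
    cv::v_store(dst, cv::v_broadcast_element<1>(v));  // {v1, v1, v1, v1}
}
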
1840 #if CV_FP16
1841 inline v_float32x4 v_load_expand(const float16_t* ptr)
1842{
1843 #ifndef msa_ld1_f16
1844 v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr);
1845 #else
1846 v4f16 v = msa_ld1_f16((const __fp16*)ptr);
1847 #endif
1848 return v_float32x4(msa_cvt_f32_f16(v));
1849}
1850
1851 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
1852{
1853 v4f16 hv = msa_cvt_f16_f32(v.val);
1854
1855 #ifndef msa_st1_f16
1856 msa_st1_s16((short*)ptr, (int16x4_t)hv);
1857 #else
1858 msa_st1_f16((__fp16*)ptr, hv);
1859 #endif
1860}
1861 #else
1862 inline v_float32x4 v_load_expand(const float16_t* ptr)
1863{
1864 float buf[4];
1865 for( int i = 0; i < 4; i++ )
1866 buf[i] = (float)ptr[i];
1867 return v_load(buf);
1868}
1869
1870 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
1871{
1872 float buf[4];
1873 v_store(buf, v);
1874 for( int i = 0; i < 4; i++ )
1875 ptr[i] = (float16_t)buf[i];
1876}
1877 #endif
1878
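Either branch above exposes the same interface: v_load_expand widens four packed float16_t values into a float32 register and v_pack_store narrows back. Illustrative sketch, assuming cv::float16_t from the core headers and a made-up helper name:

#include "opencv2/core/hal/intrin.hpp"

// Scale four half-precision values in single precision and store them back as half.
inline void scale_fp16x4(const cv::float16_t* src, cv::float16_t* dst, float s)
{
    cv::v_float32x4 v = cv::v_load_expand(src); // 4 x fp16 -> 4 x fp32
    v = v * cv::v_setall_f32(s);
    cv::v_pack_store(dst, v);                   // 4 x fp32 -> 4 x fp16
}
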
1879 inline void v_cleanup() {}
1880
1881CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
1882
1884
1885}
1886
1887 #endif