#ifndef OPENCV_HAL_INTRIN_RISCVV_HPP
#define OPENCV_HAL_INTRIN_RISCVV_HPP

#include "opencv2/core/utility.hpp"

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128_64F 1
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(vuint8m1_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = (vuint8m1_t)vle_v_u8m1((unsigned char*)v, 16);
    }
    uchar get0() const
    {
        return vmv_x_s_u8m1_u8(val, 16);
    }

    vuint8m1_t val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(vint8m1_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = (vint8m1_t)vle_v_i8m1((schar*)v, 16);
    }
    schar get0() const
    {
        return vmv_x_s_i8m1_i8(val, 16);
    }

    vint8m1_t val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(vuint16m1_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = (vuint16m1_t)vle_v_u16m1((unsigned short*)v, 8);
    }
    ushort get0() const
    {
        return vmv_x_s_u16m1_u16(val, 8);
    }

    vuint16m1_t val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(vint16m1_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = (vint16m1_t)vle_v_i16m1((signed short*)v, 8);
    }
    short get0() const
    {
        return vmv_x_s_i16m1_i16(val, 8);
    }

    vint16m1_t val;
};
struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(vuint32m1_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = (vuint32m1_t)vle_v_u32m1((unsigned int*)v, 4);
    }
    unsigned get0() const
    {
        return vmv_x_s_u32m1_u32(val, 4);
    }

    vuint32m1_t val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(vint32m1_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = (vint32m1_t)vle_v_i32m1((signed int*)v, 4);
    }
    int get0() const
    {
        return vmv_x_s_i32m1_i32(val, 4);
    }

    vint32m1_t val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(vfloat32m1_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = (vfloat32m1_t)vle_v_f32m1((float*)v, 4);
    }
    float get0() const
    {
        return vfmv_f_s_f32m1_f32(val, 4);
    }

    vfloat32m1_t val;
};
struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(vuint64m1_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = (vuint64m1_t)vle_v_u64m1((unsigned long*)v, 2);
    }
    uint64 get0() const
    {
        return vmv_x_s_u64m1_u64(val, 2);
    }

    vuint64m1_t val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(vint64m1_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = (vint64m1_t)vle_v_i64m1((long*)v, 2);
    }
    int64 get0() const
    {
        return vmv_x_s_i64m1_i64(val, 2);
    }

    vint64m1_t val;
};

struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(vfloat64m1_t v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = (vfloat64m1_t)vle_v_f64m1((double*)v, 2);
    }
    double get0() const
    {
        return vfmv_f_s_f64m1_f64(val, 2);
    }

    vfloat64m1_t val;
};
#define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \
222
inline _Tp##m1_t vreinterpretq_##suffix##_##suffix(_Tp##m1_t v) { return v; } \
223
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \
224
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \
225
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \
226
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8((vint16m1_t)(v.val)); } \
227
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \
228
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \
229
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \
230
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2((vint64m1_t)(v.val)); } \
231
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4((vfloat32m1_t)(v.val)); }\
232
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2((vfloat64m1_t)(v.val)); }
OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8)
OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, s8)
OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16)
OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, s16)
OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32)
OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, s32)
OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64)
OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, s64)
OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64)
OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32)
#define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num((v##_Tp##m1_t){0}); } \
inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }

OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2)
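
// Usage sketch (illustrative): v_setzero_*/v_setall_* build broadcast vectors, while
// v_reinterpret_as_* bit-casts between lane types of the same 128-bit register, e.g.
//   v_int32x4  ones = v_setall_s32(1);
//   v_uint32x4 bits = v_reinterpret_as_u32(ones);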
#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val, num)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val, num); \
    return a; \
}
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vsadd_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vssub_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vsadd_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vssub_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
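
// Note (illustrative): the 8- and 16-bit operators above map to saturating intrinsics
// (vsadd/vssub), so e.g. v_setall_u8(250) + v_setall_u8(10) yields all-255 lanes rather
// than wrapping; the wrapping variants are exposed separately as v_add_wrap/v_sub_wrap.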
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
    a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
    return a;
}

OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)

inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
}
inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
{
    a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
    return a;
}
#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

#define OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(_Tpvec, func, intrin, num) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val, num)); \
}
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    return v_float32x4(vfrdiv_vf_f32m1(vfsqrt_v_f32m1(x.val, 4), 1, 4));
}

inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
    return v_sqrt(x);
}

inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_float32x4(vfmacc_vv_f32m1(c.val, a.val, b.val, 4));
}

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(vmacc_vv_i32m1(c.val, a.val, b.val, 4));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_fma(a, b, c);
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}
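
// Usage sketch (illustrative): v_fma/v_muladd compute a*b + c per lane, e.g.
//   v_float32x4 r = v_fma(x, v_setall_f32(2.f), bias);   // r = 2*x + bias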
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);
    res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);
    res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);
    res = vfmacc_vf_f32m1(res, v.val[3], m3.val, 4);
    return v_float32x4(res);
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);
    res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);
    res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);
    res = vfadd_vv_f32m1(res, a.val, 4);
    return v_float32x4(res);
}
inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    return v_float64x2(vfrdiv_vf_f64m1(vfsqrt_v_f64m1(x.val, 2), 1, 2));
}

inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
    return v_sqrt(x);
}

inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(vfmacc_vv_f64m1(c.val, a.val, b.val, 2));
}

inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_fma(a, b, c);
}
#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(vnot_v_##suffix(a.val, num)); \
}

OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint8x16, u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint16x8, u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint32x4, u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint64x2, u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int8x16, i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int16x8, i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2)
#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4)); \
    return a; \
}

OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4((vfloat32m1_t)(vnot_v_i32m1((vint32m1_t)(a.val), 4)));
}
#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a.val = vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2)); \
    return a; \
}

OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2((vfloat64m1_t)(vnot_v_i64m1((vint64m1_t)(a.val), 2)));
}
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    return v_int16x8(vmulh_vv_i16m1(a.val, b.val, 8));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(vmulhu_vv_u16m1(a.val, b.val, 8));
}
inline v_uint32x4 v_abs(v_int32x4 x)
{
    vbool32_t mask = vmslt_vx_i32m1_b32(x.val, 0, 4);
    return v_uint32x4((vuint32m1_t)vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4));
}

inline v_uint16x8 v_abs(v_int16x8 x)
{
    vbool16_t mask = vmslt_vx_i16m1_b16(x.val, 0, 8);
    return v_uint16x8((vuint16m1_t)vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8));
}

inline v_uint8x16 v_abs(v_int8x16 x)
{
    vbool8_t mask = vmslt_vx_i8m1_b8(x.val, 0, 16);
    return v_uint8x16((vuint8m1_t)vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16));
}

inline v_float32x4 v_abs(v_float32x4 x)
{
    return (v_float32x4)vfsgnjx_vv_f32m1(x.val, x.val, 4);
}

inline v_float64x2 v_abs(v_float64x2 x)
{
    return (v_float64x2)vfsgnjx_vv_f64m1(x.val, x.val, 2);
}

inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{
    vfloat32m1_t ret = vfsub_vv_f32m1(a.val, b.val, 4);
    return (v_float32x4)vfsgnjx_vv_f32m1(ret, ret, 4);
}

inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{
    vfloat64m1_t ret = vfsub_vv_f64m1(a.val, b.val, 2);
    return (v_float64x2)vfsgnjx_vv_f64m1(ret, ret, 2);
}
#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(bit, num) \
inline v_uint##bit##x##num v_absdiff(v_uint##bit##x##num a, v_uint##bit##x##num b){ \
    vuint##bit##m1_t vmax = vmaxu_vv_u##bit##m1(a.val, b.val, num); \
    vuint##bit##m1_t vmin = vminu_vv_u##bit##m1(a.val, b.val, num); \
    return v_uint##bit##x##num(vsub_vv_u##bit##m1(vmax, vmin, num));\
}

OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(8, 16)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(16, 8)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(32, 4)
inline v_int8x16 v_absdiffs(v_int8x16 a, v_int8x16 b)
{
    vint8m1_t vmax = vmax_vv_i8m1(a.val, b.val, 16);
    vint8m1_t vmin = vmin_vv_i8m1(a.val, b.val, 16);
    return v_int8x16(vssub_vv_i8m1(vmax, vmin, 16));
}
inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b)
{
    vint16m1_t vmax = vmax_vv_i16m1(a.val, b.val, 8);
    vint16m1_t vmin = vmin_vv_i16m1(a.val, b.val, 8);
    return v_int16x8(vssub_vv_i16m1(vmax, vmin, 8));
}
#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF(_Tpvec, _Tpv, num) \
inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){ \
    vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\
    vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\
    return v_uint##_Tpvec((vuint##_Tpv##_t)vsub_vv_i##_Tpv(max, min, num)); \
}

OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF(16x8, 16m1, 8)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF(32x4, 32m1, 4)
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, v_int16x8& c, v_int16x8& d)
{
    vint16m2_t res = vundefined_i16m2();
    res = vwmul_vv_i16m2(a.val, b.val, 16);
    c.val = vget_i16m2_i16m1(res, 0);
    d.val = vget_i16m2_i16m1(res, 1);
}

inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, v_uint16x8& c, v_uint16x8& d)
{
    vuint16m2_t res = vundefined_u16m2();
    res = vwmulu_vv_u16m2(a.val, b.val, 16);
    c.val = vget_u16m2_u16m1(res, 0);
    d.val = vget_u16m2_u16m1(res, 1);
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d)
{
    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    c.val = vget_i32m2_i32m1(res, 0);
    d.val = vget_i32m2_i32m1(res, 1);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, v_uint32x4& c, v_uint32x4& d)
{
    vuint32m2_t res = vundefined_u32m2();
    res = vwmulu_vv_u32m2(a.val, b.val, 8);
    c.val = vget_u32m2_u32m1(res, 0);
    d.val = vget_u32m2_u32m1(res, 1);
}

inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b, v_int64x2& c, v_int64x2& d)
{
    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    c.val = vget_i64m2_i64m1(res, 0);
    d.val = vget_i64m2_i64m1(res, 1);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d)
{
    vuint64m2_t res = vundefined_u64m2();
    res = vwmulu_vv_u64m2(a.val, b.val, 4);
    c.val = vget_u64m2_u64m1(res, 0);
    d.val = vget_u64m2_u64m1(res, 1);
}
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
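
// Usage sketch (illustrative): v_mul_expand keeps full precision by widening the lanes,
// while v_mul_wrap truncates modulo the lane width, e.g. for two v_uint8x16 values:
//   v_uint16x8 lo, hi;
//   v_mul_expand(a, b, lo, hi);   // 16-bit products of the low and high 8 lanes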
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0), vget_i32m2_i32m1(res, 1), 4));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0),vget_i32m2_i32m1(res, 1), 4), c.val, 4));
}

inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4);
    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2));
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2), c.val, 2));
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4));
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4));
}

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4));
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
}

inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
}

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
}
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{
    vint32m2_t v1 = vundefined_i32m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4));
}
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    vint32m2_t v1 = vundefined_i32m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4), c.val, 4));
}

inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{
    vint64m2_t v1 = vundefined_i64m2();
    v1 = vwmul_vv_i64m2(a.val, b.val, 4);
    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2));
}
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    vint64m2_t v1 = vundefined_i64m2();
    v1 = vwmul_vv_i64m2(a.val, b.val, 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), c.val, 2));
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4));
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4));
}

inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4));
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
}

inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
}

inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
}
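
// Usage sketch (illustrative): v_dotprod multiplies adjacent lane pairs and adds them
// into widened lanes, e.g. for v_int16x8 a, b:
//   v_int32x4 s = v_dotprod(a, b);        // s[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1]
//   v_int32x4 t = v_dotprod(a, b, acc);   // same, plus accumulator acc
// The *_fast variants may combine products in a different lane order; only the overall
// sum across all lanes is expected to match the plain versions.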
#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(_Tpvec, _Tpvec2, len, scalartype, func, intrin, num) \
inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
{ \
    v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \
    val = intrin(val, a.val, val, num); \
    return vmv_x_s_##len##m1_##len(val, num); \
}

#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num) \
inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
{ \
    v##_Tpvec##m1_t val = (v##_Tpvec##m1_t)vmv_v_x_i8m1(0, num); \
    val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num); \
    return val[0]; \
}

OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int32, int64, i64, int, sum, vwredsum_vs_i32m1_i64m1, 4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint8, uint16, u16, unsigned, sum, vwredsumu_vs_u8m1_u16m1, 16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint16, uint32, u32, unsigned, sum, vwredsumu_vs_u16m1_u32m1, 8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu_vs_u32m1_u64m1, 4)

inline float v_reduce_sum(const v_float32x4& a)
{
    vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4);
    val = vfredsum_vs_f32m1_f32m1(val, a.val, val, 4);
    return vfmv_f_s_f32m1_f32(val, 4);
}
inline double v_reduce_sum(const v_float64x2& a)
{
    vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2);
    val = vfredsum_vs_f64m1_f64m1(val, a.val, val, 2);
    return vfmv_f_s_f64m1_f64(val, 2);
}
inline uint64 v_reduce_sum(const v_uint64x2& a)
{ return vext_x_v_u64m1_u64((vuint64m1_t)a.val, 0, 2) + vext_x_v_u64m1_u64((vuint64m1_t)a.val, 1, 2); }
inline int64 v_reduce_sum(const v_int64x2& a)
{ return vext_x_v_i64m1_i64((vint64m1_t)a.val, 0, 2) + vext_x_v_i64m1_i64((vint64m1_t)a.val, 1, 2); }

#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min)

inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4);
    a0 = vfredsum_vs_f32m1_f32m1(a0, a.val, a0, 4);
    b0 = vfredsum_vs_f32m1_f32m1(b0, b.val, b0, 4);
    c0 = vfredsum_vs_f32m1_f32m1(c0, c.val, c0, 4);
    d0 = vfredsum_vs_f32m1_f32m1(d0, d.val, d0, 4);
    return v_float32x4(a0[0], b0[0], c0[0], d0[0]);
}

inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
    vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4);
    vbool32_t mask = vmflt_vf_f32m1_b32(x, 0, 4);
    vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4);
    a0 = vfredsum_vs_f32m1_f32m1(a0, val, a0, 4);
    return a0[0];
}

#define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \
inline unsigned v_reduce_sad(const _Tpvec& a, const _Tpvec& b){ \
    _Tpvec2 x = v_absdiff(a, b); \
    return v_reduce_sum(x); \
}
#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
}

OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int8x16, i8m1, 8, 16, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int16x8, i16m1, 16, 8, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int32x4, i32m1, 32, 4, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int64x2, i64m1, 64, 2, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint8x16, u8m1, 8, 16, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint16x8, u16m1, 16, 8, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)
inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 v_not_nan(const v_float32x4& a)
{
    vbool32_t mask = vmford_vv_f32m1_b32(a.val, a.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}

inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 v_not_nan(const v_float64x2& a)
{
    vbool64_t mask = vmford_vv_f64m1_b64(a.val, a.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
#define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \
inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
                           const v_##_Tp##32x4& a2, const v_##_Tp##32x4& a3, \
                           v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \
                           v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \
{ \
    v##_Tp##32m4_t val = vundefined_##_T##m4(); \
    val = vset_##_T##m4(val, 0, a0.val); \
    val = vset_##_T##m4(val, 1, a1.val); \
    val = vset_##_T##m4(val, 2, a2.val); \
    val = vset_##_T##m4(val, 3, a3.val); \
    val = vrgather_vv_##_T##m4(val, (vuint32m4_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); \
    b0.val = vget_##_T##m4_##_T##m1(val, 0); \
    b1.val = vget_##_T##m4_##_T##m1(val, 1); \
    b2.val = vget_##_T##m4_##_T##m1(val, 2); \
    b3.val = vget_##_T##m4_##_T##m1(val, 3); \
}

OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }

#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec((v##intric##_vx_##_T##m1(vadd_vx_##_T##m1(a.val, 1<<(n-1), num), n, num))); }

#define OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(suffix, _T, num, intrin) \
OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(v_##suffix##x##num, suffix, _T, num) \
OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(v_##suffix##x##num, suffix, _T, num, intrin)

OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint8, u8, 16, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint16, u16, 8, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint32, u32, 4, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint64, u64, 2, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int8, i8, 16, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int16, i16, 8, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int32, i32, 4, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int64, i64, 2, sra)
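
// Usage sketch (illustrative): run-time shift amounts use the operators, compile-time
// amounts use the templates, e.g.
//   v_uint16x8 y = x >> 3;        // logical shift right
//   v_uint16x8 z = v_rshr<3>(x);  // rounding shift: (x + 4) >> 3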
#define VUP4(n) {0, 1, 2, 3}
#define VUP8(n) {0, 1, 2, 3, 4, 5, 6, 7}
#define VUP16(n) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
#define VUP2(n) {0, 1}

#define OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(_Tpvec, suffix, _T, num, num2, vmv, len) \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    suffix##m1_t tmp = vmv##_##_T##m1(0, num);\
    tmp = vslideup_vx_##_T##m1_m(vmset_m_##len(num), tmp, a.val, n, num);\
    return _Tpvec(tmp);\
} \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    return _Tpvec(vslidedown_vx_##_T##m1(a.val, n, num));\
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    suffix##m2_t tmp = vundefined_##_T##m2(); \
    tmp = vset_##_T##m2(tmp, 0, a.val); \
    tmp = vset_##_T##m2(tmp, 1, b.val); \
    tmp = vslidedown_vx_##_T##m2(tmp, n, num2);\
    return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 0));\
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    suffix##m2_t tmp = vundefined_##_T##m2(); \
    tmp = vset_##_T##m2(tmp, 0, b.val); \
    tmp = vset_##_T##m2(tmp, 1, a.val); \
    tmp = vslideup_vx_##_T##m2(tmp, n, num2);\
    return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 1));\
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ \
    CV_UNUSED(b); return a; \
}

OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint8x16, vuint8, u8, 16, 32, vmv_v_x, b8)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int8x16, vint8, i8, 16, 32, vmv_v_x, b8)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint16x8, vuint16, u16, 8, 16, vmv_v_x, b16)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int16x8, vint16, i16, 8, 16, vmv_v_x, b16)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint32x4, vuint32, u32, 4, 8, vmv_v_x, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int32x4, vint32, i32, 4, 8, vmv_v_x, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint64x2, vuint64, u64, 2, 4, vmv_v_x, b64)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64)
#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
    vuint64m1_t tmp = {*(unaligned_uint64*)ptr0, *(unaligned_uint64*)ptr1};\
    return _Tpvec(_Tp2##_t(tmp)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vle_v_##len(ptr, hnum)); }\
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vle_v_##len(ptr, num)); } \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec((_Tp2##_t)vle_v_##len((const _Tp *)ptr, num)); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vse_v_##len(ptr, a.val, hnum);}\
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    _Tp2##_t a0 = vslidedown_vx_##len(a.val, hnum, num); \
    vse_v_##len(ptr, a0, hnum);}\
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vse_v_##len(ptr, a.val, num); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vse_v_##len(ptr, a.val, num); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vse_v_##len(ptr, a.val, num); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode) \
{ vse_v_##len(ptr, a.val, num); }

OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2)
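
// Usage sketch (illustrative): the load/store group mirrors the other OpenCV SIMD
// backends, e.g.
//   float buf[4] = {1.f, 2.f, 3.f, 4.f};
//   v_float32x4 v = v_load(buf);
//   v_store(buf, v + v);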
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
#if 1
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
        tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]
    };
    return v_int8x16(vle_v_i8m1(elems, 16));
#else
    int32xm4_t index32 = vlev_int32xm4(idx, 16);
    vint16m2_t index16 = vnsra_vx_i16m2_int32xm4(index32, 0, 16);
    vint8m1_t index = vnsra_vx_i8m1_i16m2(index16, 0, 16);
    return v_int8x16(vlxbv_i8m1(tab, index, 16));
#endif
}

inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1], tab[idx[2]], tab[idx[2] + 1], tab[idx[3]], tab[idx[3] + 1],
        tab[idx[4]], tab[idx[4] + 1], tab[idx[5]], tab[idx[5] + 1], tab[idx[6]], tab[idx[6] + 1], tab[idx[7]], tab[idx[7] + 1]
    };
    return v_int8x16(vle_v_i8m1(elems, 16));
}

inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[0] + 2], tab[idx[0] + 3], tab[idx[1]], tab[idx[1] + 1], tab[idx[1] + 2], tab[idx[1] + 3],
        tab[idx[2]], tab[idx[2] + 1], tab[idx[2] + 2], tab[idx[2] + 3], tab[idx[3]], tab[idx[3] + 1], tab[idx[3] + 2], tab[idx[3] + 3]
    };
    return v_int8x16(vle_v_i8m1(elems, 16));
}

inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =
    {
        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]
    };
    return v_int16x8(vle_v_i16m1(elems, 8));
}

inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1], tab[idx[2]], tab[idx[2] + 1], tab[idx[3]], tab[idx[3] + 1]
    };
    return v_int16x8(vle_v_i16m1(elems, 8));
}

inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[0] + 2], tab[idx[0] + 3], tab[idx[1]], tab[idx[1] + 1], tab[idx[1] + 2], tab[idx[1] + 3]
    };
    return v_int16x8(vle_v_i16m1(elems, 8));
}

inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
    int CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]
    };
    return v_int32x4(vle_v_i32m1(elems, 4));
}

inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    int CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1]
    };
    return v_int32x4(vle_v_i32m1(elems, 4));
}

inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
    return v_int32x4(vle_v_i32m1(tab + idx[0], 4));
}

inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
    vint64m1_t res = {tab[idx[0]], tab[idx[1]]};
    return v_int64x2(res);
}

inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
    return v_int64x2(vle_v_i64m1(tab + idx[0], 2));
}

inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
{
    vuint64m1_t res = {tab[idx[0]], tab[idx[1]]};
    return v_uint64x2(res);
}

inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx)
{
    return v_uint64x2(vle_v_u64m1(tab + idx[0], 2));
}
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]
    };
    return v_float32x4(vle_v_f32m1(elems, 4));
}

inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1]
    };
    return v_float32x4(vle_v_f32m1(elems, 4));
}

inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
{
    return v_float32x4(vle_v_f32m1(tab + idx[0], 4));
}

inline v_float64x2 v_lut(const double* tab, const int* idx)
{
    vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]};
    return v_float64x2(res);
}

inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
{
    return v_float64x2(vle_v_f64m1(tab + idx[0], 2));
}

inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]]
    };
    return v_int32x4(vle_v_i32m1(elems, 4));
}

inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    unsigned CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]]
    };
    return v_uint32x4(vle_v_u32m1(elems, 4));
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]]
    };
    return v_float32x4(vle_v_f32m1(elems, 4));
}

inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]};
    return v_float64x2(res);
}

inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    vint32m1_t index_x = vmul_vx_i32m1(idxvec.val, 4, 4);
    vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4);
    x.val = vlxe_v_f32m1(tab, index_x, 4);
    y.val = vlxe_v_f32m1(tab, index_y, 4);
}

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    x = v_float64x2(tab[idx[0]], tab[idx[1]]);
    y = v_float64x2(tab[idx[0] + 1], tab[idx[1] + 1]);
}
#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type) \
inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
{ \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_##_T2##m2(tmp, 1, b.val); \
    return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
} \
template<int n> inline \
v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
{ \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_##_T2##m2(tmp, 1, b.val); \
    return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
} \
inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
{ \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
    asm("" ::: "memory"); \
    vse_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
} \
template<int n> inline \
void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
{ \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
    vse_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
}

OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_vx, vnclip_vx, signed char)
OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_vx, vnclip_vx, signed short)
OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_vx, vnsra_vx, int)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_vx, vnclipu_vx, unsigned char)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_vx, vnclipu_vx, unsigned short)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_vx, vnsrl_vx, unsigned int)
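
// Usage sketch (illustrative): v_pack narrows two wider vectors into one, saturating to
// the destination range, and v_rshr_pack<n> rounds before narrowing, e.g. for two
// v_uint16x8 values a and b:
//   v_uint8x16 p = v_pack(a, b);
//   v_uint8x16 q = v_rshr_pack<2>(a, b);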
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    vuint16m2_t tmp = vundefined_u16m2();
    tmp = vset_u16m2(tmp, 0, a.val);
    tmp = vset_u16m2(tmp, 1, b.val);
    return v_uint8x16(vnsrl_vx_u8m1(tmp, 0, 16));
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    vuint32m4_t vabcd = vundefined_u32m4();
    vuint16m2_t v16 = vundefined_u16m2();
    vabcd = vset_u32m4(vabcd, 0, a.val);
    vabcd = vset_u32m4(vabcd, 1, b.val);
    vabcd = vset_u32m4(vabcd, 2, c.val);
    vabcd = vset_u32m4(vabcd, 3, d.val);
    v16 = vnsrl_vx_u16m2(vabcd, 0, 16);
    return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    vuint64m8_t v64 = vundefined_u64m8();
    vuint32m4_t v32 = vundefined_u32m4();
    vuint16m2_t v16 = vundefined_u16m2();
    v64 = vset_u64m8(v64, 0, a.val);
    v64 = vset_u64m8(v64, 1, b.val);
    v64 = vset_u64m8(v64, 2, c.val);
    v64 = vset_u64m8(v64, 3, d.val);
    v64 = vset_u64m8(v64, 4, e.val);
    v64 = vset_u64m8(v64, 5, f.val);
    v64 = vset_u64m8(v64, 6, g.val);
    v64 = vset_u64m8(v64, 7, h.val);
    v32 = vnsrl_vx_u32m4(v64, 0, 16);
    v16 = vnsrl_vx_u16m2(v32, 0, 16);
    return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));
}
#define OPENCV_HAL_IMPL_RISCVV_PACK_U(tp1, num1, tp2, num2, _Tp) \
inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
    tmp = vset_##i##tp2##m2(tmp, 1, b.val); \
    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
    return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1)); \
} \
inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
    return vse_v_u##tp1##m1(ptr, vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1), num2); \
} \
template<int n> inline \
v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
    tmp = vset_##i##tp2##m2(tmp, 1, b.val); \
    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
    return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, n, num1)); \
} \
template<int n> inline \
void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
    vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\
    vuint##tp1##m1_t val = vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val_, n, num1); \
    return vse_v_u##tp1##m1(ptr, val, num2);\
}

OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char)
OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"

#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpwvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }

#pragma GCC diagnostic pop
static const signed char popCountTable[256] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

inline vuint8m1_t vcnt_u8(vuint8m1_t val)
{
    // popcount(x) = popcount(x >> 1) + (x & 1), using the 128-entry half of the table
    vuint8m1_t v0 = val & 1;
    return vlxe_v_u8m1((unsigned char*)popCountTable, val >> 1, 16) + v0;
}
inline v_uint8x16 v_popcount(const v_int8x16& a)
{
    return v_uint8x16(vcnt_u8((vuint8m1_t)a.val));
}

inline v_uint16x8 v_popcount(const v_uint16x8& a)
{
    vuint8m2_t tmp = vundefined_u8m2();
    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
    vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
    vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);
    return v_uint16x8(vget_u16m2_u16m1(res, 0));
}

inline v_uint16x8 v_popcount(const v_int16x8& a)
{
    vuint8m2_t tmp = vundefined_u8m2();
    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
    vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
    vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);
    return v_uint16x8(vget_u16m2_u16m1(res, 0));
}

inline v_uint32x4 v_popcount(const v_uint32x4& a)
{
    vuint8m2_t tmp = vundefined_u8m2();
    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
    vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,
                                     0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};
    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
    vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16);
    vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8);
    return v_uint32x4(vget_u32m2_u32m1(res, 0));
}

inline v_uint32x4 v_popcount(const v_int32x4& a)
{
    vuint8m2_t tmp = vundefined_u8m2();
    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
    vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,
                                     0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};
    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
    vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16);
    vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8);
    return v_uint32x4(vget_u32m2_u32m1(res, 0));
}

inline v_uint64x2 v_popcount(const v_uint64x2& a)
{
    vuint8m2_t tmp = vundefined_u8m2();
    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
    vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,
                                     0x0F0E0D0C0B0A0908, 0x0000000000000000};
    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
    vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
    vuint8m1_t res1 = zero;
    vuint8m1_t res2 = zero;
    res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8);
    res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8);
    return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8));
}

inline v_uint64x2 v_popcount(const v_int64x2& a)
{
    vuint8m2_t tmp = vundefined_u8m2();
    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
    vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,
                                     0x0F0E0D0C0B0A0908, 0x0000000000000000};
    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
    vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
    vuint8m1_t res1 = zero;
    vuint8m1_t res2 = zero;
    res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8);
    res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8);
    return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8));
}
#define SMASK 1, 2, 4, 8, 16, 32, 64, 128

inline int v_signmask(const v_uint8x16& a)
{
    vuint8m1_t t0 = vsrl_vx_u8m1(a.val, 7, 16);
    vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK};
    vuint16m2_t t1 = vwmulu_vv_u16m2(t0, m1, 16);
    vuint32m1_t res = vmv_v_x_u32m1(0, 4);
    vuint32m2_t t2 = vwmulu_vx_u32m2(vget_u16m2_u16m1(t1, 1), 256, 8);
    res = vredsum_vs_u32m2_u32m1(res, t2, res, 8);
    res = vwredsumu_vs_u16m1_u32m1(res, vget_u16m2_u16m1(t1, 0), res, 8);
    return vmv_x_s_u32m1_u32(res, 8);
}
inline int v_signmask(const v_int8x16& a)
{
    vuint8m1_t t0 = vsrl_vx_u8m1((vuint8m1_t)a.val, 7, 16);
    vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK};
    vint16m2_t t1 = (vint16m2_t)vwmulu_vv_u16m2(t0, m1, 16);
    vint32m1_t res = vmv_v_x_i32m1(0, 4);
    vint32m2_t t2 = vwmul_vx_i32m2(vget_i16m2_i16m1(t1, 1), 256, 8);
    res = vredsum_vs_i32m2_i32m1(res, t2, res, 8);
    res = vwredsum_vs_i16m1_i32m1(res, vget_i16m2_i16m1(t1, 0), res, 8);
    return vmv_x_s_i32m1_i32(res, 8);
}
inline int v_signmask(const v_uint16x8& a)
{
    vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
    vint16m1_t m1 = (vint16m1_t){SMASK};
    vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8);
    vint16m1_t res = vmv_v_x_i16m1(0, 8);
    res = vredsum_vs_i16m1_i16m1(res, t1, res, 8);
    return vmv_x_s_i16m1_i16(res, 8);
}
inline int v_signmask(const v_int16x8& a)
{
    vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
    vint16m1_t m1 = (vint16m1_t){SMASK};
    vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8);
    vint16m1_t res = vmv_v_x_i16m1(0, 8);
    res = vredsum_vs_i16m1_i16m1(res, t1, res, 8);
    return vmv_x_s_i16m1_i16(res, 8);
}
inline int v_signmask(const v_int32x4& a)
{
    vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
    vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
    vint32m1_t res = vmv_v_x_i32m1(0, 4);
    vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
    res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
    return vmv_x_s_i32m1_i32(res, 4);
}
inline int v_signmask(const v_uint32x4& a)
{
    vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1(a.val, 31, 4);
    vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
    vint32m1_t res = vmv_v_x_i32m1(0, 4);
    vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
    res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
    return vmv_x_s_i32m1_i32(res, 4);
}
inline int v_signmask(const v_uint64x2& a)
{
    vuint64m1_t v0 = vsrl_vx_u64m1(a.val, 63, 2);
    int res = (int)vext_x_v_u64m1_u64(v0, 0, 2) + ((int)vext_x_v_u64m1_u64(v0, 1, 2) << 1);
    return res;
}
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
inline int v_signmask(const v_float32x4& a)
{
    vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
    vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
    vint32m1_t res = vmv_v_x_i32m1(0, 4);
    vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
    res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
    return vmv_x_s_i32m1_i32(res, 4);
}
inline int v_scan_forward(const v_int8x16& a) {
    int val = v_signmask(a);
    if(val==0) return 0;
    else return trailingZeros32(val); }
inline int v_scan_forward(const v_uint8x16& a) {
    int val = v_signmask(a);
    if(val==0) return 0;
    else return trailingZeros32(val); }
inline int v_scan_forward(const v_int16x8& a) {
    int val = v_signmask(a);
    if(val==0) return 0;
    else return trailingZeros32(val); }
inline int v_scan_forward(const v_uint16x8& a) {
    int val = v_signmask(a);
    if(val==0) return 0;
    else return trailingZeros32(val); }
inline int v_scan_forward(const v_int32x4& a) {
    int val = v_signmask(a);
    if(val==0) return 0;
    else return trailingZeros32(val); }
inline int v_scan_forward(const v_uint32x4& a) {
    int val = v_signmask(a);
    if(val==0) return 0;
    else return trailingZeros32(val); }
inline int v_scan_forward(const v_float32x4& a) {
    int val = v_signmask(a);
    if(val==0) return 0;
    else return trailingZeros32(val); }
inline int v_scan_forward(const v_int64x2& a) {
    int val = v_signmask(a);
    if(val==0) return 0;
    else return trailingZeros32(val); }
inline int v_scan_forward(const v_uint64x2& a) {
    int val = v_signmask(a);
    if(val==0) return 0;
    else return trailingZeros32(val); }
#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
    suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \
    vuint64m1_t v1 = vuint64m1_t(v0); \
    return (v1[0] | v1[1]) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
    suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \
    vuint64m1_t v1 = vuint64m1_t(v0); \
    return (v1[0] | v1[1]) != 0; \
}

OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16)
OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8)
OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4)
OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2)
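
// Usage sketch (illustrative): v_check_all/v_check_any inspect the sign (mask) bit of
// every lane, typically right after a comparison, e.g.
//   if (v_check_any(a > b)) { /* at least one lane satisfied the predicate */ }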
#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vmerge_vvm_##suffix(_Tpvec2(mask.val), b.val, a.val, num)); \
}

OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4)

inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4((vfloat32m1_t)vmerge_vvm_u32m1((vbool32_t)mask.val, (vuint32m1_t)b.val, (vuint32m1_t)a.val, 4));
}
inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2((vfloat64m1_t)vmerge_vvm_u64m1((vbool64_t)mask.val, (vuint64m1_t)b.val, (vuint64m1_t)a.val, 2));
}
#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2) \
inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \
{ \
    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \
    b0.val = vget_##_Tp2##m2_##_Tp2##m1(b, 0); \
    b1.val = vget_##_Tp2##m2_##_Tp2##m1(b, 1); \
} \
inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num2), num2); \
    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
} \
inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \
    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 1)); \
} \
inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    _T2##_t val = vle##_v_##_Tp1(ptr, num2); \
    _T1##_t b = vw##add##_vv_##_Tp2##m2(val, vmv_v_x_##_Tp1(0, num2), num2); \
    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
}

OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1)
OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1)
OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1)
OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1)
OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1)
OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1)
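// Usage sketch (illustrative only): the widening helpers above let narrow data be
// processed in wider lanes without overflow. The pointer name src is hypothetical.
//
//     v_uint8x16 pix = v_load(src);    // 16 x u8
//     v_uint16x8 lo, hi;
//     v_expand(pix, lo, hi);           // widen to 2 x (8 x u16)
//     v_uint16x8 sum16 = lo + hi;      // lane-wise sums, no 8-bit overflow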
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    vuint16m2_t b = vundefined_u16m2();
    vuint32m2_t c = vundefined_u32m2();
    vuint8m1_t val = vle_v_u8m1(ptr, 4);
    b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4);
    c = vwaddu_vv_u32m2(vget_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4);
    return v_uint32x4(vget_u32m2_u32m1(c, 0));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    vint16m2_t b = vundefined_i16m2();
    vint32m2_t c = vundefined_i32m2();
    vint8m1_t val = vle_v_i8m1(ptr, 4);
    b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4);
    c = vwadd_vv_i32m2(vget_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4);
    return v_int32x4(vget_i32m2_i32m1(c, 0));
}
#define VITL_16 (vuint64m2_t){0x1303120211011000, 0x1707160615051404, 0x1B0B1A0A19091808, 0x1F0F1E0E1D0D1C0C}
#define VITL_8  (vuint64m2_t){0x0009000100080000, 0x000B0003000A0002, 0x000D0005000C0004, 0x000F0007000E0006}
#define VITL_4  (vuint64m2_t){0x0000000400000000, 0x0000000500000001, 0x0000000600000002, 0x0000000700000003}
#define VITL_2  (vuint64m2_t){0, 2, 1, 3}
#define LOW_4   0x0000000100000000, 0x0000000500000004
#define LOW_8   0x0003000200010000, 0x000B000A00090008
#define LOW_16  0x0706050403020100, 0x1716151413121110
#define HIGH_4  0x0000000300000002, 0x0000000700000006
#define HIGH_8  0x0007000600050004, 0x000F000E000D000C
#define HIGH_16 0x0F0E0D0C0B0A0908, 0x1F1E1D1C1B1A1918
#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    v##_Tp##m2_t tmp = vundefined_##_T##m2(); \
    tmp = vset_##_T##m2(tmp, 0, a0.val); \
    tmp = vset_##_T##m2(tmp, 1, a1.val); \
    vuint64m2_t mask = VITL_##num; \
    tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2); \
    b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \
    b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    v##_Tp##m1_t b0 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
    return v_##_Tpvec(b0); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \
    v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \
    v##_Tp##m1_t b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
    return v_##_Tpvec(b1); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
    v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \
    v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \
    d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
}

OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1)
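// Usage sketch (illustrative only): v_zip interleaves two registers lane by lane,
// while v_combine_low/v_combine_high concatenate their halves. Names planeA and
// planeB are hypothetical.
//
//     v_uint8x16 a = v_load(planeA), b = v_load(planeB);
//     v_uint8x16 ab0, ab1;
//     v_zip(a, b, ab0, ab1);   // ab0 = a0,b0,a1,b1,...   ab1 = a8,b8,a9,b9,...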
inline v_uint8x16 v_reverse(const v_uint8x16& a)
{
    vuint64m1_t mask = (vuint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
    return v_uint8x16(vrgather_vv_u8m1(a.val, (vuint8m1_t)mask, 16));
}
inline v_int8x16 v_reverse(const v_int8x16& a)
{
    vint64m1_t mask = (vint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
    return v_int8x16(vrgather_vv_i8m1(a.val, (vuint8m1_t)mask, 16));
}
inline v_uint16x8 v_reverse(const v_uint16x8& a)
{
    vuint64m1_t mask = (vuint64m1_t){0x0004000500060007, 0x0000000100020003};
    return v_uint16x8(vrgather_vv_u16m1(a.val, (vuint16m1_t)mask, 8));
}
inline v_int16x8 v_reverse(const v_int16x8& a)
{
    vint64m1_t mask = (vint64m1_t){0x0004000500060007, 0x0000000100020003};
    return v_int16x8(vrgather_vv_i16m1(a.val, (vuint16m1_t)mask, 8));
}
inline v_uint32x4 v_reverse(const v_uint32x4& a)
{
    return v_uint32x4(vrgather_vv_u32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4));
}
inline v_int32x4 v_reverse(const v_int32x4& a)
{
    return v_int32x4(vrgather_vv_i32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4));
}
inline v_float32x4 v_reverse(const v_float32x4& a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
#define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \
template <int n> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
{ return v_rotate_right<n>(a, b); }
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint8x16, u8, 0)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int8x16, s8, 0)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint16x8, u16, 1)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int16x8, s16, 1)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint32x4, u32, 2)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int32x4, s32, 2)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint64x2, u64, 3)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int64x2, s64, 3)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3)
#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix) \
template<int i> inline _Tp v_extract_n(_Tpvec v) { return v.val[i]; }

OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64)
#define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \
template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); }

OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint8x16, u8, 16)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int8x16, i8, 16)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint16x8, u16, 8)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int16x8, i16, 8)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint32x4, u32, 4)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4)
inline v_int32x4 v_round(const v_float32x4& a)
{
    __builtin_riscv_fsrm(0);
    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}
inline v_int32x4 v_floor(const v_float32x4& a)
{
    __builtin_riscv_fsrm(2);
    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}
inline v_int32x4 v_ceil(const v_float32x4& a)
{
    __builtin_riscv_fsrm(3);
    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}
inline v_int32x4 v_trunc(const v_float32x4& a)
{
    __builtin_riscv_fsrm(1);
    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}
inline v_int32x4 v_round(const v_float64x2& a)
{
    __builtin_riscv_fsrm(0);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2));
    vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}
inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
    __builtin_riscv_fsrm(0);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    _val = vset_f64m2(_val, 1, b.val);
    vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}
inline v_int32x4 v_floor(const v_float64x2& a)
{
    __builtin_riscv_fsrm(2);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}
inline v_int32x4 v_ceil(const v_float64x2& a)
{
    __builtin_riscv_fsrm(3);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}
inline v_int32x4 v_trunc(const v_float64x2& a)
{
    __builtin_riscv_fsrm(1);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}
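// Usage sketch (illustrative only): all four conversions above return v_int32x4
// and differ only in the rounding mode written to frm via fsrm
// (0 = nearest-even, 1 = toward zero, 2 = down, 3 = up). The name src is hypothetical.
//
//     v_float32x4 x  = v_load(src);
//     v_int32x4   rn = v_round(x);   // nearest, ties to even
//     v_int32x4   fl = v_floor(x);
//     v_int32x4   ce = v_ceil(x);
//     v_int32x4   tr = v_trunc(x);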
#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
{ \
    v##_Tpvec##m1x2_t ret = intrin##2e_v_##_T##m1x2(ptr, num); \
    a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \
    b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
{ \
    v##_Tpvec##m1x3_t ret = intrin##3e_v_##_T##m1x3(ptr, num); \
    a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \
    b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \
    c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
{ \
    v##_Tpvec##m1x4_t ret = intrin##4e_v_##_T##m1x4(ptr, num); \
    a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \
    b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \
    c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \
    d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \
}

#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) \
{ \
    v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \
    ret = vset_##_T##m1x2(ret, 0, a.val); \
    ret = vset_##_T##m1x2(ret, 1, b.val); \
    intrin##2e_v_##_T##m1x2(ptr, ret, num); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) \
{ \
    v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \
    ret = vset_##_T##m1x3(ret, 0, a.val); \
    ret = vset_##_T##m1x3(ret, 1, b.val); \
    ret = vset_##_T##m1x3(ret, 2, c.val); \
    intrin##3e_v_##_T##m1x3(ptr, ret, num); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) \
{ \
    v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \
    ret = vset_##_T##m1x4(ret, 0, a.val); \
    ret = vset_##_T##m1x4(ret, 1, b.val); \
    ret = vset_##_T##m1x4(ret, 2, c.val); \
    ret = vset_##_T##m1x4(ret, 3, d.val); \
    intrin##4e_v_##_T##m1x4(ptr, ret, num); \
}

#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T) \
    OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T) \
    OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T)

OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32)
#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
{ \
    v##_Tpvec##m1x2_t ret = vlseg2e_v_##_T##m1x2(ptr, num); \
    a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \
    b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
{ \
    v##_Tpvec##m1x3_t ret = vlseg3e_v_##_T##m1x3(ptr, num); \
    a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \
    b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \
    c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
{ \
    v##_Tpvec##m1x4_t ret = vlseg4e_v_##_T##m1x4(ptr, num); \
    a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \
    b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \
    c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \
    d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) \
{ \
    v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \
    ret = vset_##_T##m1x2(ret, 0, a.val); \
    ret = vset_##_T##m1x2(ret, 1, b.val); \
    vsseg2e_v_##_T##m1x2(ptr, ret, num); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) \
{ \
    v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \
    ret = vset_##_T##m1x3(ret, 0, a.val); \
    ret = vset_##_T##m1x3(ret, 1, b.val); \
    ret = vset_##_T##m1x3(ret, 2, c.val); \
    vsseg3e_v_##_T##m1x3(ptr, ret, num); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) \
{ \
    v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \
    ret = vset_##_T##m1x4(ret, 0, a.val); \
    ret = vset_##_T##m1x4(ret, 1, b.val); \
    ret = vset_##_T##m1x4(ret, 2, c.val); \
    ret = vset_##_T##m1x4(ret, 3, d.val); \
    vsseg4e_v_##_T##m1x4(ptr, ret, num); \
}

OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64)
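// Usage sketch (illustrative only): deinterleaving one packed BGR row into
// per-channel registers and writing it back. The name bgr_row is hypothetical.
//
//     v_uint8x16 b, g, r;
//     v_load_deinterleave(bgr_row, b, g, r);   // reads 48 bytes: B,G,R,B,G,R,...
//     // ... process the channels ...
//     v_store_interleave(bgr_row, b, g, r);    // writes them back interleaved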
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
    return v_float32x4(aval);
}
inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    _val = vset_f64m2(_val, 1, b.val);
    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4);
    return v_float32x4(aval);
}
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
    return v_float64x2(vget_f64m2_f64m1(_val, 0));
}
inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
    return v_float64x2(vget_f64m2_f64m1(_val, 1));
}
inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
    return v_float64x2(vget_f64m2_f64m1(_val, 0));
}
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
    return v_float64x2(vget_f64m2_f64m1(_val, 1));
}
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    vuint64m1_t m0 = {0x0705060403010200, 0x0F0D0E0C0B090A08};
    return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
{
    return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec)));
}
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    vuint64m1_t m0 = {0x0703060205010400, 0x0F0B0E0A0D090C08};
    return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
{
    return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec)));
}
inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    vuint64m1_t m0 = {0x0706030205040100, 0x0F0E0B0A0D0C0908};
    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vec.val, (vuint8m1_t)m0, 16));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    vuint64m1_t m0 = {0x0B0A030209080100, 0x0F0E07060D0C0504};
    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    vuint64m1_t m0 = {0x0B0A090803020100, 0x0F0E0D0C07060504};
    return v_int32x4((vint32m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    vuint64m1_t m0 = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
    return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    vuint64m1_t m0 = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A};
    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{
    vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
    vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), 2);
    return v_float64x2(res);
}
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    vfloat16m1_t v = vle_v_f16m1((__fp16*)ptr, 4);
    vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4);
    return v_float32x4(vget_f32m2_f32m1(v32, 0));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    vfloat32m2_t v32 = vundefined_f32m2();
    v32 = vset_f32m2(v32, 0, v.val);
    vfloat16m1_t hv = vfncvt_f_f_v_f16m1(v32, 4);
    vse_v_f16m1((__fp16*)ptr, hv, 4);
}
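// Usage sketch (illustrative only): converting an FP16 buffer to FP32, scaling it,
// and packing it back, four lanes at a time. Pointer names are hypothetical.
//
//     v_float32x4 v = v_load_expand(src_f16);   // 4 x float16_t -> 4 x float
//     v = v * v_setall_f32(2.f);
//     v_pack_store(dst_f16, v);                 // 4 x float -> 4 x float16_t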
inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END