#ifndef OPENCV_HAL_INTRIN_RISCVV_HPP
#define OPENCV_HAL_INTRIN_RISCVV_HPP

#include "opencv2/core/utility.hpp"

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128_64F 1

    typedef uchar lane_type;

    explicit v_uint8x16(vuint8m1_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = (vuint8m1_t)vle_v_u8m1((unsigned char*)v, 16);
        return vmv_x_s_u8m1_u8(val, 16);

    typedef schar lane_type;

    explicit v_int8x16(vint8m1_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = (vint8m1_t)vle_v_i8m1((schar*)v, 16);
        return vmv_x_s_i8m1_i8(val, 16);

    typedef ushort lane_type;

    explicit v_uint16x8(vuint16m1_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = (vuint16m1_t)vle_v_u16m1((unsigned short*)v, 8);
        return vmv_x_s_u16m1_u16(val, 8);

    typedef short lane_type;

    explicit v_int16x8(vint16m1_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = (vint16m1_t)vle_v_i16m1((signed short*)v, 8);
        return vmv_x_s_i16m1_i16(val, 8);

    typedef unsigned lane_type;

    explicit v_uint32x4(vuint32m1_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
        unsigned v[] = {v0, v1, v2, v3};
        val = (vuint32m1_t)vle_v_u32m1((unsigned int*)v, 4);
    unsigned get0() const
        return vmv_x_s_u32m1_u32(val, 4);

    typedef int lane_type;

    explicit v_int32x4(vint32m1_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
        int v[] = {v0, v1, v2, v3};
        val = (vint32m1_t)vle_v_i32m1((signed int*)v, 4);
        return vmv_x_s_i32m1_i32(val, 4);

    typedef float lane_type;

    explicit v_float32x4(vfloat32m1_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
        float v[] = {v0, v1, v2, v3};
        val = (vfloat32m1_t)vle_v_f32m1((float*)v, 4);
        return vfmv_f_s_f32m1_f32(val, 4);

    typedef uint64 lane_type;

    explicit v_uint64x2(vuint64m1_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
        uint64 v[] = {v0, v1};
        val = (vuint64m1_t)vle_v_u64m1((unsigned long*)v, 2);
        return vmv_x_s_u64m1_u64(val, 2);

    typedef int64 lane_type;

    explicit v_int64x2(vint64m1_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
        int64 v[] = {v0, v1};
        val = (vint64m1_t)vle_v_i64m1((long*)v, 2);
        return vmv_x_s_i64m1_i64(val, 2);

    typedef double lane_type;

    explicit v_float64x2(vfloat64m1_t v) : val(v) {}
    v_float64x2(double v0, double v1)
        double v[] = {v0, v1};
        val = (vfloat64m1_t)vle_v_f64m1((double*)v, 2);
        return vfmv_f_s_f64m1_f64(val, 2);
221#define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \
222inline _Tp##m1_t vreinterpretq_##suffix##_##suffix(_Tp##m1_t v) { return v; } \
223inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \
224inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \
225inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \
226inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8((vint16m1_t)(v.val)); } \
227inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \
228inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \
229inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \
230inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2((vint64m1_t)(v.val)); } \
231inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4((vfloat32m1_t)(v.val)); }\
232inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2((vfloat64m1_t)(v.val)); }
235OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8)
236OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, s8)
237OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16)
238OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, s16)
239OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32)
240OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, s32)
241OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64)
242OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, s64)
243OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64)
244OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32)
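// Note (illustrative, not part of the original header): the v_reinterpret_as_* helpers
// generated above only relabel the 128-bit register type; no lane values are converted.
// A minimal usage sketch, assuming the universal-intrinsics API built in this file:
//
//     v_int32x4  a = v_setall_s32(-1);          // 0xFFFFFFFF in each of 4 lanes
//     v_uint8x16 b = v_reinterpret_as_u8(a);    // the same 16 bytes, viewed as u8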
245#define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
246inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num((v##_Tp##m1_t){0}); } \
247inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }
249OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2)
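// Illustrative usage of the generated initializers (a sketch, not part of the header):
//
//     v_uint8x16 z = v_setzero_u8();     // all 16 lanes = 0
//     v_int16x8  c = v_setall_s16(-3);   // all 8 lanes = -3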
264#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
265inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
267 return _Tpvec(intrin(a.val, b.val)); \
269inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
271 a.val = intrin(a.val, b.val); \
275#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
276inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
278 return _Tpvec(intrin(a.val, b.val, num)); \
280inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
282 a.val = intrin(a.val, b.val, num); \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vsadd_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vssub_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vsadd_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vssub_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
    return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
    a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
322 return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
326 a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
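// Added note (not in the original source): for 8- and 16-bit lanes the operators above
// map to saturating RVV intrinsics (vsaddu/vssubu, vsadd/vssub), while 32/64-bit unsigned
// lanes use modular vadd/vsub. For example, assuming the API generated in this file:
//
//     v_uint8x16 r = v_setall_u8(200) + v_setall_u8(100);   // every lane saturates to 255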
331#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
332inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
334 return _Tpvec(intrin(a.val, b.val)); \
337#define OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(_Tpvec, func, intrin, num) \
338inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
340 return _Tpvec(intrin(a.val, b.val, num)); \
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
366 return v_float32x4(vfrdiv_vf_f32m1(vfsqrt_v_f32m1(x.val, 4), 1, 4));
371 v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
377 return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
382 return v_float32x4(vfmacc_vv_f32m1(c.val, a.val, b.val, 4));
387 return v_int32x4(vmacc_vv_i32m1(c.val, a.val, b.val, 4));
392 return v_fma(a, b, c);
397 return v_fma(a, b, c);
404 vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);
405 res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);
406 res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);
407 res = vfmacc_vf_f32m1(res, v.val[3], m3.val, 4);
415 vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);
416 res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);
417 res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);
418 res = vfadd_vv_f32m1(res, a.val, 4);
429 return v_float64x2(vfrdiv_vf_f64m1(vfsqrt_v_f64m1(x.val, 2), 1, 2));
434 v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
440 return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
445 return v_float64x2(vfmacc_vv_f64m1(c.val, a.val, b.val, 2));
450 return v_fma(a, b, c);
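// Illustrative sketch (not part of the header): v_fma()/v_muladd() compute a*b + c per
// lane, lowered to vfmacc/vmacc above. For instance:
//
//     v_float32x4 r = v_fma(v_setall_f32(2.f), v_setall_f32(3.f), v_setall_f32(1.f));  // 7.f per lane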
453#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
454 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
455 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
456 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
457 inline _Tpvec operator ~ (const _Tpvec & a) \
459 return _Tpvec(vnot_v_##suffix(a.val, num)); \
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint8x16, u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint16x8, u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint32x4, u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint64x2, u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int8x16, i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int16x8, i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2)
471#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
472inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
474 return v_float32x4(vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4))); \
476inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
478 a.val = vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4)); \
482OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
483OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
484OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)
488 return v_float32x4((vfloat32m1_t)(vnot_v_i32m1((vint32m1_t)(a.val), 4)));
491#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
492inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
494 return v_float64x2(vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2))); \
496inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
498 a.val = vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2)); \
502OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
503OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
504OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)
508 return v_float64x2((vfloat64m1_t)(vnot_v_i64m1((vint64m1_t)(a.val), 2)));
512 return v_int16x8(vmulh_vv_i16m1(a.val, b.val, 8));
516 return v_uint16x8(vmulhu_vv_u16m1(a.val, b.val, 8));
529 vbool32_t mask=vmslt_vx_i32m1_b32(x.val, 0, 4);
530 return v_uint32x4((vuint32m1_t)vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4));
535 vbool16_t mask=vmslt_vx_i16m1_b16(x.val, 0, 8);
536 return v_uint16x8((vuint16m1_t)vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8));
541 vbool8_t mask=vmslt_vx_i8m1_b8(x.val, 0, 16);
542 return v_uint8x16((vuint8m1_t)vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16));
    return (v_float32x4)vfsgnjx_vv_f32m1(x.val, x.val, 4);

    return (v_float64x2)vfsgnjx_vv_f64m1(x.val, x.val, 2);
557 vfloat32m1_t ret = vfsub_vv_f32m1(a.val, b.val, 4);
563 vfloat64m1_t ret = vfsub_vv_f64m1(a.val, b.val, 2);
567#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(bit, num) \
568inline v_uint##bit##x##num v_absdiff(v_uint##bit##x##num a, v_uint##bit##x##num b){ \
569 vuint##bit##m1_t vmax = vmaxu_vv_u##bit##m1(a.val, b.val, num); \
570 vuint##bit##m1_t vmin = vminu_vv_u##bit##m1(a.val, b.val, num); \
571 return v_uint##bit##x##num(vsub_vv_u##bit##m1(vmax, vmin, num));\
574OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(8, 16)
575OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(16, 8)
576OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(32, 4)
580 vint8m1_t vmax = vmax_vv_i8m1(a.val, b.val, 16);
581 vint8m1_t vmin = vmin_vv_i8m1(a.val, b.val, 16);
582 return v_int8x16(vssub_vv_i8m1(vmax, vmin, 16));
585 vint16m1_t vmax = vmax_vv_i16m1(a.val, b.val, 8);
586 vint16m1_t vmin = vmin_vv_i16m1(a.val, b.val, 8);
587 return v_int16x8(vssub_vv_i16m1(vmax, vmin, 8));
590#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF(_Tpvec, _Tpv, num) \
591inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){ \
592 vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\
593 vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\
594 return v_uint##_Tpvec((vuint##_Tpv##_t)vsub_vv_i##_Tpv(max, min, num)); \
597OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16)
598OPENCV_HAL_IMPL_RISCVV_ABSDIFF(16x8, 16m1, 8)
599OPENCV_HAL_IMPL_RISCVV_ABSDIFF(32x4, 32m1, 4)
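// Added note (not in the original source): v_absdiff returns |a - b| in the matching
// unsigned type, while v_absdiffs above keeps the signed type and saturates. A hedged
// example, assuming the API in this file:
//
//     v_uint8x16 d = v_absdiff(v_setall_s8(-100), v_setall_s8(100));   // 200 in each u8 lane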
605 vint16m2_t res = vundefined_i16m2();
606 res = vwmul_vv_i16m2(a.val, b.val, 16);
607 c.val = vget_i16m2_i16m1(res, 0);
608 d.val = vget_i16m2_i16m1(res, 1);
614 vuint16m2_t res = vundefined_u16m2();
615 res = vwmulu_vv_u16m2(a.val, b.val, 16);
616 c.val = vget_u16m2_u16m1(res, 0);
617 d.val = vget_u16m2_u16m1(res, 1);
623 vint32m2_t res = vundefined_i32m2();
624 res = vwmul_vv_i32m2(a.val, b.val, 8);
625 c.val = vget_i32m2_i32m1(res, 0);
626 d.val = vget_i32m2_i32m1(res, 1);
632 vuint32m2_t res = vundefined_u32m2();
633 res = vwmulu_vv_u32m2(a.val, b.val, 8);
634 c.val = vget_u32m2_u32m1(res, 0);
635 d.val = vget_u32m2_u32m1(res, 1);
641 vint64m2_t res = vundefined_i64m2();
642 res = vwmul_vv_i64m2(a.val, b.val, 4);
643 c.val = vget_i64m2_i64m1(res, 0);
644 d.val = vget_i64m2_i64m1(res, 1);
650 vuint64m2_t res = vundefined_u64m2();
651 res = vwmulu_vv_u64m2(a.val, b.val, 4);
652 c.val = vget_u64m2_u64m1(res, 0);
653 d.val = vget_u64m2_u64m1(res, 1);
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
672 vint32m2_t res = vundefined_i32m2();
673 res = vwmul_vv_i32m2(a.val, b.val, 8);
674 res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
675 return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0), vget_i32m2_i32m1(res, 1), 4));
679 vint32m2_t res = vundefined_i32m2();
680 res = vwmul_vv_i32m2(a.val, b.val, 8);
681 res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
682 return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0),vget_i32m2_i32m1(res, 1), 4), c.val, 4));
688 vint64m2_t res = vundefined_i64m2();
689 res = vwmul_vv_i64m2(a.val, b.val, 4);
690 res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4);
691 return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2));
695 vint64m2_t res = vundefined_i64m2();
696 res = vwmul_vv_i64m2(a.val, b.val, 4);
697 res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4);
698 return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2), c.val, 2));
704 vuint16m2_t v1 = vundefined_u16m2();
705 vuint32m2_t v2 = vundefined_u32m2();
706 v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
707 v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
708 v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
709 return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4));
715 vuint16m2_t v1 = vundefined_u16m2();
716 vuint32m2_t v2 = vundefined_u32m2();
717 v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
718 v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
719 v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
720 return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4));
725 vint16m2_t v1 = vundefined_i16m2();
726 vint32m2_t v2 = vundefined_i32m2();
727 v1 = vwmul_vv_i16m2(a.val, b.val, 16);
728 v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
729 v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
730 return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4));
736 vint16m2_t v1 = vundefined_i16m2();
737 vint32m2_t v2 = vundefined_i32m2();
738 v1 = vwmul_vv_i16m2(a.val, b.val, 16);
739 v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
740 v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
741 return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
746 vuint32m2_t v1 = vundefined_u32m2();
747 vuint64m2_t v2 = vundefined_u64m2();
748 v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
749 v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
750 v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
751 return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
757 vuint32m2_t v1 = vundefined_u32m2();
758 vuint64m2_t v2 = vundefined_u64m2();
759 v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
760 v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
761 v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
762 return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
767 vint32m2_t v1 = vundefined_i32m2();
768 vint64m2_t v2 = vundefined_i64m2();
769 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
770 v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
771 v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
772 return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
778 vint32m2_t v1 = vundefined_i32m2();
779 vint64m2_t v2 = vundefined_i64m2();
780 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
781 v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
782 v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
783 return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
790 vint32m2_t v1 = vundefined_i32m2();
791 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
792 return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4));
797 vint32m2_t v1 = vundefined_i32m2();
798 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
799 return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4), c.val, 4));
805 vint64m2_t v1 = vundefined_i64m2();
806 v1 = vwmul_vv_i64m2(a.val, b.val, 4);
807 return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2));
811 vint64m2_t v1 = vundefined_i64m2();
812 v1 = vwmul_vv_i64m2(a.val, b.val, 8);
813 return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 4), c.val, 4));
819 vuint16m2_t v1 = vundefined_u16m2();
820 vuint32m2_t v2 = vundefined_u32m2();
821 v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
822 v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
823 return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4));
828 vuint16m2_t v1 = vundefined_u16m2();
829 vuint32m2_t v2 = vundefined_u32m2();
830 v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
831 v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
832 return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4));
837 vint16m2_t v1 = vundefined_i16m2();
838 vint32m2_t v2 = vundefined_i32m2();
839 v1 = vwmul_vv_i16m2(a.val, b.val, 16);
840 v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
841 return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4));
845 vint16m2_t v1 = vundefined_i16m2();
846 vint32m2_t v2 = vundefined_i32m2();
847 v1 = vwmul_vv_i16m2(a.val, b.val, 16);
848 v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
849 return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
855 vuint32m2_t v1 = vundefined_u32m2();
856 vuint64m2_t v2 = vundefined_u64m2();
857 v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
858 v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
859 return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
863 vuint32m2_t v1 = vundefined_u32m2();
864 vuint64m2_t v2 = vundefined_u64m2();
865 v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
866 v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
867 return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
872 vint32m2_t v1 = vundefined_i32m2();
873 vint64m2_t v2 = vundefined_i64m2();
874 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
875 v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
876 return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
880 vint32m2_t v1 = vundefined_i32m2();
881 vint64m2_t v2 = vundefined_i64m2();
882 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
883 v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
884 return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
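// Added note (not in the original source): v_dotprod multiplies adjacent lane pairs and
// adds them, so for v_int16x8 inputs each output int32 lane is a[2i]*b[2i] + a[2i+1]*b[2i+1];
// the *_expand variants widen twice (e.g. u8 -> u32), and the *_fast variants above skip the
// vrgather reordering, so their partial sums may land in a different lane order.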
888#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(_Tpvec, _Tpvec2, len, scalartype, func, intrin, num) \
889inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
891 v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \
892 val = intrin(val, a.val, val, num); \
893 return vmv_x_s_##len##m1_##len(val, num); \
897#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num) \
898inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
900 v##_Tpvec##m1_t val = (v##_Tpvec##m1_t)vmv_v_x_i8m1(0, num); \
901 val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num); \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int32, int64, i64, int, sum, vwredsum_vs_i32m1_i64m1, 4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint8, uint16, u16, unsigned, sum, vwredsumu_vs_u8m1_u16m1, 16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint16, uint32, u32, unsigned, sum, vwredsumu_vs_u16m1_u32m1, 8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu_vs_u32m1_u64m1, 4)
912 vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4); \
913 val = vfredsum_vs_f32m1_f32m1(val, a.val, val, 4); \
914 return vfmv_f_s_f32m1_f32(val, 4); \
918 vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2); \
919 val = vfredsum_vs_f64m1_f64m1(val, a.val, val, 2); \
920 return vfmv_f_s_f64m1_f64(val, 2); \
{ return vext_x_v_u64m1_u64((vuint64m1_t)a.val, 0, 2)+vext_x_v_u64m1_u64((vuint64m1_t)a.val, 1, 2); }
{ return vext_x_v_i64m1_i64((vint64m1_t)a.val, 0, 2)+vext_x_v_i64m1_i64((vint64m1_t)a.val, 1, 2); }
928#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func) \
929OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16) \
930OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8) \
931OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4) \
932OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2) \
933OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16) \
934OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8) \
935OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4) \
936OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min)
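// Illustrative sketch (not part of the header): the reductions above fold a vector into
// one scalar, e.g.
//
//     int s = v_reduce_sum(v_int16x8(1, 2, 3, 4, 5, 6, 7, 8));   // s == 36
//     int m = v_reduce_max(v_setall_s32(-5));                    // m == -5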
943 vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
944 vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4);
945 vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4);
946 vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4);
947 a0 = vfredsum_vs_f32m1_f32m1(a0, a.val, a0, 4);
948 b0 = vfredsum_vs_f32m1_f32m1(b0, b.val, b0, 4);
949 c0 = vfredsum_vs_f32m1_f32m1(c0, c.val, c0, 4);
950 d0 = vfredsum_vs_f32m1_f32m1(d0, d.val, d0, 4);
956 vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
957 vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4);
958 vbool32_t mask=vmflt_vf_f32m1_b32(x, 0, 4);
959 vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4);
960 a0 = vfredsum_vs_f32m1_f32m1(a0, val, a0, 4);
964#define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \
965inline unsigned v_reduce_sad(const _Tpvec& a, const _Tpvec&b){ \
966 _Tpvec2 x = v_absdiff(a, b); \
967 return v_reduce_sum(x); \
977#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
978inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
980 vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \
981 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
983inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
985 vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \
986 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
988inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
990 vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \
991 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
993inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
995 vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \
996 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
998inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
1000 vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \
1001 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1003inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
1005 vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \
1006 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int8x16, i8m1, 8, 16, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int16x8, i16m1, 16, 8, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int32x4, i32m1, 32, 4, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int64x2, i64m1, 64, 2, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint8x16, u8m1, 8, 16, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint16x8, u16m1, 16, 8, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)
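// Added note (not in the original source): comparison operators return a mask vector of
// the same type, with all bits set (-1) in lanes where the predicate holds and 0 elsewhere, e.g.
//
//     v_uint8x16 m = (v_setall_u8(3) == v_setall_u8(3));   // 0xFF in every lane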
1021 vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
1022 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1027 vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
1028 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1033 vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
1034 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1039 vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
1040 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1045 vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
1046 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1051 vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
1052 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1057 vbool32_t mask = vmford_vv_f32m1_b32(a.val, a.val, 4);
1058 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1065 vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
1066 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1071 vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
1072 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1077 vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
1078 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1083 vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
1084 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1089 vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
1090 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1095 vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
1096 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1101 vbool64_t mask = vmford_vv_f64m1_b64(a.val, a.val, 2);
1102 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1105#define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \
1106inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
1107 const v_##_Tp##32x4& a2, const v_##_Tp##32x4& a3, \
1108 v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \
1109 v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \
1111 v##_Tp##32m4_t val = vundefined_##_T##m4(); \
1112 val = vset_##_T##m4(val, 0, a0.val); \
1113 val = vset_##_T##m4(val, 1, a1.val); \
1114 val = vset_##_T##m4(val, 2, a2.val); \
1115 val = vset_##_T##m4(val, 3, a3.val); \
1116 val = vrgather_vv_##_T##m4(val, (vuint32m4_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); \
1117 b0.val = vget_##_T##m4_##_T##m1(val, 0); \
1118 b1.val = vget_##_T##m4_##_T##m1(val, 1); \
1119 b2.val = vget_##_T##m4_##_T##m1(val, 2); \
1120 b3.val = vget_##_T##m4_##_T##m1(val, 3); \
1122OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)
1127#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
1128inline _Tpvec operator << (const _Tpvec& a, int n) \
1129{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
1130template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1131{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }
1133#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
1134inline _Tpvec operator >> (const _Tpvec& a, int n) \
1135{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
1136template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1137{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
1138template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
1139{ return _Tpvec((v##intric##_vx_##_T##m1(vadd_vx_##_T##m1(a.val, 1<<(n-1), num), n, num))); }
1142#define OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(suffix, _T, num, intrin) \
1143OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(v_##suffix##x##num, suffix, _T, num) \
1144OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(v_##suffix##x##num, suffix, _T, num, intrin)
1146OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint8, u8, 16, srl)
1147OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint16, u16, 8, srl)
1148OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint32, u32, 4, srl)
1149OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint64, u64, 2, srl)
1150OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int8, i8, 16, sra)
1151OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int16, i16, 8, sra)
1152OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int32, i32, 4, sra)
1153OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int64, i64, 2, sra)
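// Illustrative sketch (not part of the header): v_shl/v_shr and the << / >> operators do
// plain logical/arithmetic shifts, while v_rshr<n> rounds by adding 1 << (n-1) first:
//
//     v_uint16x8 a = v_setall_u16(7);
//     v_uint16x8 b = v_shr<1>(a);    // 3 in each lane
//     v_uint16x8 c = v_rshr<1>(a);   // 4 in each lane (rounded)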
1156#define VUP4(n) {0, 1, 2, 3}
1157#define VUP8(n) {0, 1, 2, 3, 4, 5, 6, 7}
1158#define VUP16(n) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
1159#define VUP2(n) {0, 1}
1161#define OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(_Tpvec, suffix, _T, num, num2, vmv, len) \
1162template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1164 suffix##m1_t tmp = vmv##_##_T##m1(0, num);\
1165 tmp = vslideup_vx_##_T##m1_m(vmset_m_##len(num), tmp, a.val, n, num);\
1166 return _Tpvec(tmp);\
1168template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1170 return _Tpvec(vslidedown_vx_##_T##m1(a.val, n, num));\
1172template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
1174template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1176 suffix##m2_t tmp = vundefined_##_T##m2(); \
1177 tmp = vset_##_T##m2(tmp, 0, a.val); \
1178 tmp = vset_##_T##m2(tmp, 1, b.val); \
1179 tmp = vslidedown_vx_##_T##m2(tmp, n, num2);\
1180 return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 0));\
1182template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1184 suffix##m2_t tmp = vundefined_##_T##m2(); \
1185 tmp = vset_##_T##m2(tmp, 0, b.val); \
1186 tmp = vset_##_T##m2(tmp, 1, a.val); \
1187 tmp = vslideup_vx_##_T##m2(tmp, n, num2);\
1188 return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 1));\
1190template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
1192 CV_UNUSED(b); return a; \
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint8x16, vuint8, u8, 16, 32, vmv_v_x, b8)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int8x16, vint8, i8, 16, 32, vmv_v_x, b8)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint16x8, vuint16, u16, 8, 16, vmv_v_x, b16)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int16x8, vint16, i16, 8, 16, vmv_v_x, b16)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint32x4, vuint32, u32, 4, 8, vmv_v_x, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int32x4, vint32, i32, 4, 8, vmv_v_x, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint64x2, vuint64, u64, 2, 4, vmv_v_x, b64)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64)
1206#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num) \
1207inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1209 typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
1210 vuint64m1_t tmp = {*(unaligned_uint64*)ptr0, *(unaligned_uint64*)ptr1};\
1211 return _Tpvec(_Tp2##_t(tmp)); } \
1212inline _Tpvec v_load_low(const _Tp* ptr) \
1213{ return _Tpvec(vle_v_##len(ptr, hnum)); }\
1214inline _Tpvec v_load_aligned(const _Tp* ptr) \
1215{ return _Tpvec(vle_v_##len(ptr, num)); } \
1216inline _Tpvec v_load(const _Tp* ptr) \
1217{ return _Tpvec((_Tp2##_t)vle_v_##len((const _Tp *)ptr, num)); } \
1218inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1219{ vse_v_##len(ptr, a.val, hnum);}\
1220inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1222 _Tp2##_t a0 = vslidedown_vx_##len(a.val, hnum, num); \
1223 vse_v_##len(ptr, a0, hnum);}\
1224inline void v_store(_Tp* ptr, const _Tpvec& a) \
1225{ vse_v_##len(ptr, a.val, num); } \
1226inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1227{ vse_v_##len(ptr, a.val, num); } \
1228inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1229{ vse_v_##len(ptr, a.val, num); } \
1230inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode ) \
1231{ vse_v_##len(ptr, a.val, num); }
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2)
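// Illustrative usage (a sketch, not part of the header): loads/stores take a lane-typed
// pointer; v_load reads a full 128-bit vector, v_load_low/v_load_halves read 64-bit halves:
//
//     float buf[4] = {1.f, 2.f, 3.f, 4.f};
//     v_float32x4 v = v_load(buf);
//     v_store(buf, v + v);           // buf becomes {2, 4, 6, 8}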
inline v_int8x16 v_lut(const schar* tab, const int* idx)
    schar CV_DECL_ALIGNED(32) elems[16] =
    return v_int8x16(vle_v_i8m1(elems, 16));
    int32xm4_t index32 = vlev_int32xm4(idx, 16);
    vint16m2_t index16 = vnsra_vx_i16m2_int32xm4(index32, 0, 16);
    vint8m1_t index = vnsra_vx_i8m1_i16m2(index16, 0, 16);
    return v_int8x16(vlxbv_i8m1(tab, index, 16));
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){
    schar CV_DECL_ALIGNED(32) elems[16] =
    return v_int8x16(vle_v_i8m1(elems, 16));
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
    schar CV_DECL_ALIGNED(32) elems[16] =
    return v_int8x16(vle_v_i8m1(elems, 16));
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
inline v_int16x8 v_lut(const short* tab, const int* idx)
    short CV_DECL_ALIGNED(32) elems[8] =
    return v_int16x8(vle_v_i16m1(elems, 8));
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
    short CV_DECL_ALIGNED(32) elems[8] =
    return v_int16x8(vle_v_i16m1(elems, 8));
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
    short CV_DECL_ALIGNED(32) elems[8] =
    return v_int16x8(vle_v_i16m1(elems, 8));
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const int* idx)
    int CV_DECL_ALIGNED(32) elems[4] =
    return v_int32x4(vle_v_i32m1(elems, 4));
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
    int CV_DECL_ALIGNED(32) elems[4] =
    return v_int32x4(vle_v_i32m1(elems, 4));
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
    return v_int32x4(vle_v_i32m1(tab+idx[0], 4));
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
    vint64m1_t res = {tab[idx[0]], tab[idx[1]]};
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
    return v_int64x2(vle_v_i64m1(tab+idx[0], 2));
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
    vuint64m1_t res = {tab[idx[0]], tab[idx[1]]};
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx)
    return v_uint64x2(vle_v_u64m1(tab+idx[0], 2));
inline v_float32x4 v_lut(const float* tab, const int* idx)
    float CV_DECL_ALIGNED(32) elems[4] =
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
    float CV_DECL_ALIGNED(32) elems[4] =
inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
inline v_float64x2 v_lut(const double* tab, const int* idx)
    vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]};
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
1465 int CV_DECL_ALIGNED(32) elems[4] =
1472 return v_int32x4(vle_v_i32m1(elems, 4));
1477 unsigned CV_DECL_ALIGNED(32) elems[4] =
1489 float CV_DECL_ALIGNED(32) elems[4] =
1500 vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]};
1505 vint32m1_t index_x = vmul_vx_i32m1(idxvec.val, 4, 4);
1506 vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4);
1508 x.val = vlxe_v_f32m1(tab, index_x, 4);
1509 y.val = vlxe_v_f32m1(tab, index_y, 4);
1514 int CV_DECL_ALIGNED(32) idx[4];
1521#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type) \
1522inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
1524 v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1525 tmp = vset_##_T2##m2(tmp, 0, a.val); \
1526 tmp = vset_##_T2##m2(tmp, 1, b.val); \
1527 return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
1529template<int n> inline \
1530v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
1532 v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1533 tmp = vset_##_T2##m2(tmp, 0, a.val); \
1534 tmp = vset_##_T2##m2(tmp, 1, b.val); \
1535 return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
1537inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
1539 v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1540 tmp = vset_##_T2##m2(tmp, 0, a.val); \
1541 tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
1542 asm("" ::: "memory"); \
1543 vse_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
1545template<int n> inline \
1546void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
1548 v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1549 tmp = vset_##_T2##m2(tmp, 0, a.val); \
1550 tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
1551 vse_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_vx, vnclip_vx, signed char)
OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_vx, vnclip_vx, signed short)
OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_vx, vnsra_vx, int)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_vx, vnclipu_vx, unsigned char)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_vx, vnclipu_vx, unsigned short)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_vx, vnsrl_vx, unsigned int)
1563 vuint16m2_t tmp = vundefined_u16m2(); \
1564 tmp = vset_u16m2(tmp, 0, a.val); \
1565 tmp = vset_u16m2(tmp, 1, b.val); \
    return v_uint8x16(vnsrl_vx_u8m1(tmp, 0, 16));
1572 vuint32m4_t vabcd = vundefined_u32m4(); \
1573 vuint16m2_t v16 = vundefined_u16m2(); \
1574 vabcd = vset_u32m4(vabcd, 0, a.val); \
1575 vabcd = vset_u32m4(vabcd, 1, b.val); \
1576 vabcd = vset_u32m4(vabcd, 2, c.val); \
1577 vabcd = vset_u32m4(vabcd, 3, d.val); \
1578 v16 = vnsrl_vx_u16m2(vabcd, 0, 16);
1579 return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));
1586 vuint64m8_t v64 = vundefined_u64m8(); \
1587 vuint32m4_t v32 = vundefined_u32m4(); \
1588 vuint16m2_t v16 = vundefined_u16m2(); \
1589 v64 = vset_u64m8(v64, 0, a.val); \
1590 v64 = vset_u64m8(v64, 1, b.val); \
1591 v64 = vset_u64m8(v64, 2, c.val); \
1592 v64 = vset_u64m8(v64, 3, d.val); \
1593 v64 = vset_u64m8(v64, 4, e.val); \
1594 v64 = vset_u64m8(v64, 5, f.val); \
1595 v64 = vset_u64m8(v64, 6, g.val); \
1596 v64 = vset_u64m8(v64, 7, h.val); \
1597 v32 = vnsrl_vx_u32m4(v64, 0, 16);
1598 v16 = vnsrl_vx_u16m2(v32, 0, 16);
1599 return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));
1611#define OPENCV_HAL_IMPL_RISCVV_PACK_U(tp1, num1, tp2, num2, _Tp) \
1612inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
1614 vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
1615 tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
1616 tmp = vset_##i##tp2##m2(tmp, 1, b.val); \
1617 vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
1618 return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1)); \
1620inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
1622 vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
1623 tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
1624 vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
1625 return vse_v_u##tp1##m1(ptr, vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1), num2); \
1627template<int n> inline \
1628v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
1630 vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
1631 tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
1632 tmp = vset_##i##tp2##m2(tmp, 1, b.val); \
1633 vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
1634 return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, n, num1)); \
1636template<int n> inline \
1637void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
1639 vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
1640 tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
1641 vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\
1642 vuint##tp1##m1_t val = vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val_, n, num1); \
1643 return vse_v_u##tp1##m1(ptr, val, num2);\
OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char)
OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
1649#pragma GCC diagnostic push
1650#pragma GCC diagnostic ignored "-Wuninitialized"
1654#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, _Tpwvec) \
1655 inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
1658 v_mul_expand(a, b, c, d); \
1659 return v_pack(c, d); \
1661 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
1662 { a = a * b; return a; }
1670#pragma GCC diagnostic pop
1672static const signed char popCountTable[256] =
1674 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1675 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1676 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1677 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1678 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1679 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1680 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1681 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1682 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1683 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1684 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1685 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1686 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1687 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1688 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1689 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
1692inline vuint8m1_t vcnt_u8(vuint8m1_t val){
1693 vuint8m1_t v0 = val & 1;
    return vlxe_v_u8m1((unsigned char*)popCountTable, val >> 1, 16)+v0;
1706 return v_uint8x16(vcnt_u8((vuint8m1_t)a.val));
1712 vuint8m2_t tmp = vundefined_u8m2();
1713 tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1714 vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
1715 tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \
1716 vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);
1723 vuint8m2_t tmp = vundefined_u8m2();
1724 tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1725 vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
1726 tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \
1727 vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);
1734 vuint8m2_t tmp = vundefined_u8m2();
1735 tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1736 vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,
1737 0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};
1738 tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \
1739 vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16);
1740 vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8);
1747 vuint8m2_t tmp = vundefined_u8m2();
1748 tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1749 vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,
1750 0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};
1751 tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \
1752 vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16);
1753 vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8);
1760 vuint8m2_t tmp = vundefined_u8m2();
1761 tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1762 vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,
1763 0x0F0E0D0C0B0A0908, 0x0000000000000000};
1764 tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \
1765 vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
1766 vuint8m1_t res1 = zero;
1767 vuint8m1_t res2 = zero;
1768 res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8);
1769 res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8);
    return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8));
1777 vuint8m2_t tmp = vundefined_u8m2();
1778 tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1779 vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,
1780 0x0F0E0D0C0B0A0908, 0x0000000000000000};
1781 tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \
1782 vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
1783 vuint8m1_t res1 = zero;
1784 vuint8m1_t res2 = zero;
1785 res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8);
1786 res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8);
    return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8));
1791#define SMASK 1, 2, 4, 8, 16, 32, 64, 128
1794 vuint8m1_t t0 = vsrl_vx_u8m1(a.val, 7, 16);
1795 vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK};
1796 vuint16m2_t t1 = vwmulu_vv_u16m2(t0, m1, 16);
1797 vuint32m1_t res = vmv_v_x_u32m1(0, 4);
1798 vuint32m2_t t2 = vwmulu_vx_u32m2(vget_u16m2_u16m1(t1, 1), 256, 8);
1799 res = vredsum_vs_u32m2_u32m1(res, t2, res, 8);
1800 res = vwredsumu_vs_u16m1_u32m1(res, vget_u16m2_u16m1(t1, 0), res, 8);
1801 return vmv_x_s_u32m1_u32(res, 8);
1805 vuint8m1_t t0 = vsrl_vx_u8m1((vuint8m1_t)a.val, 7, 16);
1806 vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK};
1807 vint16m2_t t1 = (vint16m2_t)vwmulu_vv_u16m2(t0, m1, 16);
1808 vint32m1_t res = vmv_v_x_i32m1(0, 4);
1809 vint32m2_t t2 = vwmul_vx_i32m2(vget_i16m2_i16m1(t1, 1), 256, 8);
1810 res = vredsum_vs_i32m2_i32m1(res, t2, res, 8);
1811 res = vwredsum_vs_i16m1_i32m1(res, vget_i16m2_i16m1(t1, 0), res, 8);
1812 return vmv_x_s_i32m1_i32(res, 8);
1817 vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
1818 vint16m1_t m1 = (vint16m1_t){SMASK};
1819 vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8);
1820 vint16m1_t res = vmv_v_x_i16m1(0, 8);
1821 res = vredsum_vs_i16m1_i16m1(res, t1, res, 8);
1822 return vmv_x_s_i16m1_i16(res, 8);
1826 vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
1827 vint16m1_t m1 = (vint16m1_t){SMASK};
1828 vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8);
1829 vint16m1_t res = vmv_v_x_i16m1(0, 8);
1830 res = vredsum_vs_i16m1_i16m1(res, t1, res, 8);
1831 return vmv_x_s_i16m1_i16(res, 8);
1835 vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
1836 vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
1837 vint32m1_t res = vmv_v_x_i32m1(0, 4);
1838 vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
1839 res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
1840 return vmv_x_s_i32m1_i32(res, 4);
1844 vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1(a.val, 31, 4);
1845 vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
1846 vint32m1_t res = vmv_v_x_i32m1(0, 4);
1847 vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
1848 res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
1849 return vmv_x_s_i32m1_i32(res, 4);
1853 vuint64m1_t v0 = vsrl_vx_u64m1(a.val, 63, 2);
1854 int res = (int)vext_x_v_u64m1_u64(v0, 0, 2) + ((int)vext_x_v_u64m1_u64(v0, 1, 2) << 1);
{ return v_signmask(v_reinterpret_as_u64(a)); }
{ return v_signmask(v_reinterpret_as_u64(a)); }
1863 vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
1864 vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
1865 vint32m1_t res = vmv_v_x_i32m1(0, 4);
1866 vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
1867 res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
1868 return vmv_x_s_i32m1_i32(res, 4);
1874else return trailingZeros32(val); }
1878else return trailingZeros32(val); }
1882else return trailingZeros32(val); }
1886else return trailingZeros32(val); }
1890else return trailingZeros32(val); }
1894else return trailingZeros32(val); }
1898else return trailingZeros32(val); }
1902else return trailingZeros32(val); }
1906else return trailingZeros32(val); }
1908#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num) \
1909inline bool v_check_all(const v_##_Tpvec& a) \
1911 suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \
1912 vuint64m1_t v1 = vuint64m1_t(v0); \
1913 return (v1[0] | v1[1]) == 0; \
1915inline bool v_check_any(const v_##_Tpvec& a) \
1917 suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \
1918 vuint64m1_t v1 = vuint64m1_t(v0); \
1919 return (v1[0] | v1[1]) != 0; \
1922OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16)
1923OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8)
1924OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4)
1925OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2)
1953#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num) \
1954inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1956 return _Tpvec(vmerge_vvm_##suffix(_Tpvec2(mask.val), b.val, a.val, num)); \
OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4)
1967 return v_float32x4((vfloat32m1_t)vmerge_vvm_u32m1((vbool32_t)mask.val, (vuint32m1_t)b.val, (vuint32m1_t)a.val, 4));
1971 return v_float64x2((vfloat64m1_t)vmerge_vvm_u64m1((vbool64_t)mask.val, (vuint64m1_t)b.val, (vuint64m1_t)a.val, 2));
1974#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2) \
1975inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \
1977 _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \
1978 b0.val = vget_##_Tp2##m2_##_Tp2##m1(b, 0); \
1979 b1.val = vget_##_Tp2##m2_##_Tp2##m1(b, 1); \
1981inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \
1983 _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num2), num2); \
1984 return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
1986inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \
1988 _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \
1989 return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 1)); \
1991inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \
1993 _T2##_t val = vle##_v_##_Tp1(ptr, num2); \
1994 _T1##_t b = vw##add##_vv_##_Tp2##m2(val, vmv_v_x_##_Tp1(0, num2), num2); \
1995 return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
1998OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1)
1999OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1)
2000OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1)
2001OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1)
2002OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1)
2003OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1)
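// Usage sketch: v_expand widens a vector into two vectors of the next lane width,
// v_expand_low/v_expand_high return one half, and v_load_expand widens straight from
// memory. The helper name widen_sum16 below is hypothetical.
//
//     #include "opencv2/core/hal/intrin.hpp"
//     using namespace cv;
//     // Widen 16 bytes to 16-bit lanes and add the two halves (cannot overflow).
//     inline v_uint16x8 widen_sum16(const uchar* p)
//     {
//         v_uint16x8 lo, hi;
//         v_expand(v_load(p), lo, hi);   // lo = lanes 0..7, hi = lanes 8..15
//         return lo + hi;
//     }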
2007 vuint16m2_t b = vundefined_u16m2();
2008 vuint32m2_t c = vundefined_u32m2();
2009 vuint8m1_t val = vle_v_u8m1(ptr, 4);
2010 b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4);
2011 c = vwaddu_vv_u32m2(vget_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4);
2017 vint16m2_t b = vundefined_i16m2();
2018 vint32m2_t c = vundefined_i32m2();
2019 vint8m1_t val = vle_v_i8m1(ptr, 4);
2020 b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4);
2021 c = vwadd_vv_i32m2(vget_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4);
2022 return v_int32x4(vget_i32m2_i32m1(c, 0));
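// Usage sketch: v_load_expand_q reads only four narrow elements and widens them two steps,
// directly to 32-bit lanes. The helper name bytes_to_s32 below is hypothetical.
//
//     #include "opencv2/core/hal/intrin.hpp"
//     using namespace cv;
//     inline v_int32x4 bytes_to_s32(const schar* p)
//     {
//         return v_load_expand_q(p);     // 4 x schar -> 4 x int
//     }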
2024#define VITL_16 (vuint64m2_t){0x1303120211011000, 0x1707160615051404, 0x1B0B1A0A19091808, 0x1F0F1E0E1D0D1C0C}
2025#define VITL_8 (vuint64m2_t){0x0009000100080000, 0x000B0003000A0002, 0x000D0005000C0004, 0x000F0007000E0006}
2026#define VITL_4 (vuint64m2_t){0x0000000400000000, 0x0000000500000001, 0x0000000600000002, 0x0000000700000003}
2027#define VITL_2 (vuint64m2_t){0, 2, 1, 3}
2028#define LOW_4 0x0000000100000000, 0x0000000500000004
2029#define LOW_8 0x0003000200010000, 0x000B000A00090008
2030#define LOW_16 0x0706050403020100, 0x1716151413121110
2031#define HIGH_4 0x0000000300000002, 0x0000000700000006
2032#define HIGH_8 0x0007000600050004, 0x000F000E000D000C
2033#define HIGH_16 0x0F0E0D0C0B0A0908, 0x1F1E1D1C1B1A1918
2034#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) \
2035inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
2037 v##_Tp##m2_t tmp = vundefined_##_T##m2();\
2038 tmp = vset_##_T##m2(tmp, 0, a0.val); \
2039 tmp = vset_##_T##m2(tmp, 1, a1.val); \
2040 vuint64m2_t mask = VITL_##num; \
2041 tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2); \
2042 b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \
2043 b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \
2045inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2047 v##_Tp##m1_t b0 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
2048 return v_##_Tpvec(b0);\
2050inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2052 v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \
2053 v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \
2054 v##_Tp##m1_t b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
2055 return v_##_Tpvec(b1);\
2057inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
2059 c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
2060 v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \
2061 v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \
2062 d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
2065OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8)
2066OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8)
2067OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4)
2068OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4)
2069OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2)
2070OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2)
2071OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2)
2072OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1)
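// Usage sketch: v_zip interleaves two vectors lane by lane, while v_combine_low,
// v_combine_high and v_recombine join the lower or upper halves of two vectors. The helper
// name zip_rows below is hypothetical.
//
//     #include "opencv2/core/hal/intrin.hpp"
//     using namespace cv;
//     // Produce a0 b0 a1 b1 ... from two rows of 16-bit values.
//     inline void zip_rows(const ushort* a, const ushort* b, ushort* out)
//     {
//         v_uint16x8 lo, hi;
//         v_zip(v_load(a), v_load(b), lo, hi);
//         v_store(out, lo);
//         v_store(out + 8, hi);
//     }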
2076 vuint64m1_t mask = (vuint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
2077 return v_uint8x16(vrgather_vv_u8m1(a.val, (vuint8m1_t)mask, 16));
2081 vint64m1_t mask = (vint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
2082 return v_int8x16(vrgather_vv_i8m1(a.val, (vuint8m1_t)mask, 16));
2087 vuint64m1_t mask = (vuint64m1_t){0x0004000500060007, 0x0000000100020003};
2088 return v_uint16x8(vrgather_vv_u16m1(a.val, (vuint16m1_t)mask, 8));
2093 vint64m1_t mask = (vint64m1_t){0x0004000500060007, 0x0000000100020003};
2094 return v_int16x8(vrgather_vv_i16m1(a.val, (vuint16m1_t)mask, 8));
2098 return v_uint32x4(vrgather_vv_u32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4));
2103 return v_int32x4(vrgather_vv_i32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4));
2107{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
2124#define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \
2126inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2127{ return v_rotate_right<n>(a, b);}
2128OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint8x16, u8, 0)
2129OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int8x16, s8, 0)
2130OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint16x8, u16, 1)
2131OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int16x8, s16, 1)
2132OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint32x4, u32, 2)
2133OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int32x4, s32, 2)
2134OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint64x2, u64, 3)
2135OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int64x2, s64, 3)
2136OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2)
2137OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3)
2140#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix) \
2141template<int i> inline _Tp v_extract_n(_Tpvec v) { return v.val[i]; }
2143OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8)
2144OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8)
2145OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16)
2146OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16)
2147OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32)
2148OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32)
2149OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64)
2150OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64)
2151OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32)
2152OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64)
2154#define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \
2155template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); }
2157OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint8x16, u8, 16)
2158OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int8x16, i8, 16)
2159OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint16x8, u16, 8)
2160OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int16x8, i16, 8)
2161OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint32x4, u32, 4)
2162OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4)
2163OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2)
2164OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2)
2165OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4)
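// Usage sketch: v_extract<n>(a, b) shifts the register pair (a, b) by n lanes,
// v_extract_n<i> reads a single lane as a scalar, and v_broadcast_element<i> replicates one
// lane across the vector. The helper names below are hypothetical.
//
//     #include "opencv2/core/hal/intrin.hpp"
//     using namespace cv;
//     inline int last_lane(const v_int32x4& v)         { return v_extract_n<3>(v); }
//     inline v_int32x4 splat_first(const v_int32x4& v) { return v_broadcast_element<0>(v); }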
2168 __builtin_riscv_fsrm(0);
2169 vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
2170 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2171 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2172 __builtin_riscv_fsrm(0);
2177 __builtin_riscv_fsrm(2);
2178 vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
2179 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2180 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2181 __builtin_riscv_fsrm(0);
2187 __builtin_riscv_fsrm(3);
2188 vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
2189 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2190 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2191 __builtin_riscv_fsrm(0);
2197 __builtin_riscv_fsrm(1);
2198 vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
2199 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2200 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2201 __builtin_riscv_fsrm(0);
2207 __builtin_riscv_fsrm(0);
2208 vfloat64m2_t _val = vundefined_f64m2();
2209 _val = vset_f64m2(_val, 0, a.val);
2211 _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2));
2212 vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4);
2213 __builtin_riscv_fsrm(0);
2218 __builtin_riscv_fsrm(0);
2219 vfloat64m2_t _val = vundefined_f64m2();
2220 _val = vset_f64m2(_val, 0, a.val);
2221 _val = vset_f64m2(_val, 1, b.val);
2222 vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4);
2223 __builtin_riscv_fsrm(0);
2228 __builtin_riscv_fsrm(2);
2229 vfloat64m2_t _val = vundefined_f64m2();
2230 _val = vset_f64m2(_val, 0, a.val);
2231 vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
2233 vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
2234 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2235 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2236 __builtin_riscv_fsrm(0);
2242 __builtin_riscv_fsrm(3);
2243 vfloat64m2_t _val = vundefined_f64m2();
2244 _val = vset_f64m2(_val, 0, a.val);
2245 vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
2247 vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
2248 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2249 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2250 __builtin_riscv_fsrm(0);
2256 __builtin_riscv_fsrm(1);
2257 vfloat64m2_t _val = vundefined_f64m2();
2258 _val = vset_f64m2(_val, 0, a.val);
2259 vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
2261 vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
2262 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2263 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2264 __builtin_riscv_fsrm(0);
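// Usage sketch: the four float-to-int conversions set the RVV rounding mode via fsrm
// (0 = nearest-even for v_round, 2 = down for v_floor, 3 = up for v_ceil, 1 = toward zero
// for v_trunc) and restore mode 0 afterwards. The helper name round_all below is
// hypothetical.
//
//     #include "opencv2/core/hal/intrin.hpp"
//     using namespace cv;
//     inline void round_all(const v_float32x4& x, int* out)
//     {
//         v_store(out +  0, v_round(x));
//         v_store(out +  4, v_floor(x));
//         v_store(out +  8, v_ceil(x));
//         v_store(out + 12, v_trunc(x));
//     }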
2268#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \
2269inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
2271 v##_Tpvec##m1x2_t ret = intrin##2e_v_##_T##m1x2(ptr, num);\
2272 a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \
2273 b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \
2275inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
2277 v##_Tpvec##m1x3_t ret = intrin##3e_v_##_T##m1x3(ptr, num);\
2278 a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \
2279 b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \
2280 c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \
2282inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
2283 v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
2285 v##_Tpvec##m1x4_t ret = intrin##4e_v_##_T##m1x4(ptr, num);\
2286 a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \
2287 b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \
2288 c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \
2289 d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \
2292#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \
2293inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2294 hal::StoreMode =hal::STORE_UNALIGNED) \
2296 v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \
2297 ret = vset_##_T##m1x2(ret, 0, a.val); \
2298 ret = vset_##_T##m1x2(ret, 1, b.val); \
2299 intrin##2e_v_##_T##m1x2(ptr, ret, num); \
2301inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2302 const v_##_Tpvec##x##num& c, hal::StoreMode =hal::STORE_UNALIGNED) \
2304 v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \
2305 ret = vset_##_T##m1x3(ret, 0, a.val); \
2306 ret = vset_##_T##m1x3(ret, 1, b.val); \
2307 ret = vset_##_T##m1x3(ret, 2, c.val); \
2308 intrin##3e_v_##_T##m1x3(ptr, ret, num); \
2310inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2311 const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
2312 hal::StoreMode =hal::STORE_UNALIGNED ) \
2314 v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \
2315 ret = vset_##_T##m1x4(ret, 0, a.val); \
2316 ret = vset_##_T##m1x4(ret, 1, b.val); \
2317 ret = vset_##_T##m1x4(ret, 2, c.val); \
2318 ret = vset_##_T##m1x4(ret, 3, d.val); \
2319 intrin##4e_v_##_T##m1x4(ptr, ret, num); \
2322#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T) \
2323OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T) \
2324OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T)
2327OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8)
2328OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16)
2329OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32)
2331OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8)
2332OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16)
2333OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32)
2335#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T) \
2336inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
2338 v##_Tpvec##m1x2_t ret = vlseg2e_v_##_T##m1x2(ptr, num); \
2339 a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \
2340 b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \
2342inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
2344 v##_Tpvec##m1x3_t ret = vlseg3e_v_##_T##m1x3(ptr, num); \
2345 a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \
2346 b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \
2347 c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \
2349inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
2350 v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
2352 v##_Tpvec##m1x4_t ret = vlseg4e_v_##_T##m1x4(ptr, num); \
2353 a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \
2354 b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \
2355 c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \
2356 d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \
2358inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2359 hal::StoreMode =hal::STORE_UNALIGNED) \
2361 v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \
2362 ret = vset_##_T##m1x2(ret, 0, a.val); \
2363 ret = vset_##_T##m1x2(ret, 1, b.val); \
2364 vsseg2e_v_##_T##m1x2(ptr, ret, num); \
2366inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2367 const v_##_Tpvec##x##num& c, hal::StoreMode =hal::STORE_UNALIGNED) \
2369 v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \
2370 ret = vset_##_T##m1x3(ret, 0, a.val); \
2371 ret = vset_##_T##m1x3(ret, 1, b.val); \
2372 ret = vset_##_T##m1x3(ret, 2, c.val); \
2373 vsseg3e_v_##_T##m1x3(ptr, ret, num); \
2375inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2376 const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
2377 hal::StoreMode =hal::STORE_UNALIGNED ) \
2379 v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \
2380 ret = vset_##_T##m1x4(ret, 0, a.val); \
2381 ret = vset_##_T##m1x4(ret, 1, b.val); \
2382 ret = vset_##_T##m1x4(ret, 2, c.val); \
2383 ret = vset_##_T##m1x4(ret, 3, d.val); \
2384 vsseg4e_v_##_T##m1x4(ptr, ret, num); \
2386OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32)
2387OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64)
2389OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64)
2390OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64)
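// Usage sketch: the segment load/store wrappers above expose channel de-interleaving and
// re-interleaving of packed data. The helper name bgr_to_rgb16 below is hypothetical.
//
//     #include "opencv2/core/hal/intrin.hpp"
//     using namespace cv;
//     // Swap the B and R channels of 16 packed BGR pixels.
//     inline void bgr_to_rgb16(const uchar* src, uchar* dst)
//     {
//         v_uint8x16 b, g, r;
//         v_load_deinterleave(src, b, g, r);   // reads 48 bytes
//         v_store_interleave(dst, r, g, b);    // writes 48 bytes
//     }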
2400 vfloat64m2_t _val = vundefined_f64m2();
2401 _val = vset_f64m2(_val, 0, a.val);
2402 vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
2408 vfloat64m2_t _val = vundefined_f64m2();
2409 _val = vset_f64m2(_val, 0, a.val);
2410 _val = vset_f64m2(_val, 1, b.val);
2411 vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4);
2417 vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
2418 vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
2424 vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
2425 vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
2431 vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
2437 vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
2449 vuint64m1_t m0 = {0x0705060403010200, 0x0F0D0E0C0B090A08};
2450 return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
2454 return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec)));
2459 vuint64m1_t m0 = {0x0703060205010400, 0x0F0B0E0A0D090C08};
2460 return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
2464 return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec)));
2469 vuint64m1_t m0 = {0x0706030205040100, 0x0F0E0B0A0D0C0908};
2470 return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vec.val, (vuint8m1_t)m0, 16));
2472inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
2475 vuint64m1_t m0 = {0x0B0A030209080100, 0x0F0E07060D0C0504};
2476 return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
2478inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
2482 vuint64m1_t m0 = {0x0B0A090803020100, 0x0F0E0D0C07060504};
2483 return v_int32x4((vint32m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
2485inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2486inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2489 vuint64m1_t m0 = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
2490 return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
2492inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
2496 vuint64m1_t m0 = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A};
2497 return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
2499inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
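// Usage sketch: v_interleave_pairs and v_interleave_quads permute lanes within a single
// vector, while v_pack_triplets keeps the first three lanes of every group of four and
// packs them contiguously (the trailing lanes of the result are unspecified). The helper
// name rgba_to_rgb below is hypothetical.
//
//     #include "opencv2/core/hal/intrin.hpp"
//     using namespace cv;
//     // Compact 4 RGBA pixels (16 bytes) into 4 RGB triplets (12 meaningful bytes).
//     inline v_uint8x16 rgba_to_rgb(const v_uint8x16& rgba)
//     {
//         return v_pack_triplets(rgba);
//     }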
2513 vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
2514 vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), 2);
2524 vfloat16m1_t v = vle_v_f16m1((__fp16*)ptr, 4);
2525 vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4);
2529inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
2531 vfloat32m2_t v32 = vundefined_f32m2();
2532 v32 = vset_f32m2(v32, 0, v.val);
2533 vfloat16m1_t hv = vfncvt_f_f_v_f16m1(v32, 4);
2534 vse_v_f16m1((__fp16*)ptr, hv, 4);
2538inline void v_cleanup() {}
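// Usage sketch: on targets with half-precision support, v_load_expand widens four float16_t
// values to a v_float32x4 and v_pack_store narrows a v_float32x4 back to float16_t. The
// helper name roundtrip_fp16 below is hypothetical.
//
//     #include "opencv2/core/hal/intrin.hpp"
//     using namespace cv;
//     inline void roundtrip_fp16(const float16_t* src, float16_t* dst)
//     {
//         v_float32x4 x = v_load_expand(src);  // fp16 -> fp32
//         v_pack_store(dst, x);                // fp32 -> fp16
//     }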
2540CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END