#ifndef OPENCV_HAL_INTRIN_RVV_HPP
#define OPENCV_HAL_INTRIN_RVV_HPP

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128_64F 1
    for (int i = 0; i < 8; ++i)

    for (int i = 0; i < 8; ++i)

    for (int i = 0; i < 4; ++i)

    for (int i = 0; i < 4; ++i)

    unsigned val[2] = {0};

    for (int i = 0; i < 4; ++i)

    for (int i = 0; i < 4; ++i)
#define OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(_Tpvec, _Tp, suffix, width, n) \
inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr) \
{ \
    return _Tpvec(ptr); \
} \
inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v) \
{ \
    for (int i = 0; i < n; ++i) \
    { \
        ptr[i] = v.val[i]; \
    } \
}
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint8mf2_t, uint8_t, u8, 8, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint8mf2_t, int8_t, i8, 8, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint16mf2_t, uint16_t, u16, 16, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint16mf2_t, int16_t, i16, 16, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint32mf2_t, uint32_t, u32, 32, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint32mf2_t, int32_t, i32, 32, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat32mf2_t, float32_t, f32, 32, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint64mf2_t, uint64_t, u64, 64, 1)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint64mf2_t, int64_t, i64, 64, 1)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat64mf2_t, float64_t, f64, 64, 1)
#define OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(_Tpwvec, _Tpvec, _wTp, wcvt, suffix, width, n) \
inline _Tpwvec wcvt (_Tpvec v) \
{ \
    _wTp tmp[n]; \
    for (int i = 0; i < n; ++i) \
    { \
        tmp[i] = (_wTp)v.val[i]; \
    } \
    vsetvlmax_e##width##m1(); \
    return vle##width##_v_##suffix##m1(tmp); \
}
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint16m1_t, vuint8mf2_t, ushort, vwcvtu_x_x_v_u16m1, u16, 16, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint16m1_t, vint8mf2_t, short, vwcvt_x_x_v_i16m1, i16, 16, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint32m1_t, vuint16mf2_t, unsigned, vwcvtu_x_x_v_u32m1, u32, 32, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint32m1_t, vint16mf2_t, int, vwcvt_x_x_v_i32m1, i32, 32, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint64m1_t, vuint32mf2_t, uint64, vwcvtu_x_x_v_u64m1, u64, 64, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint64m1_t, vint32mf2_t, int64, vwcvt_x_x_v_i64m1, i64, 64, 2)
inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base)
{
    return vuint8mf4_t(base);
}
inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base)
{
    return vint8mf4_t(base);
}

inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src)
{
    ushort tmp[4];
    for (int i = 0; i < 4; ++i)
    {
        tmp[i] = (ushort)src.val[i];
    }
    return vle16_v_u16mf2(tmp);
}
inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src)
{
    short tmp[4];
    for (int i = 0; i < 4; ++i)
    {
        tmp[i] = (short)src.val[i];
    }
    return vle16_v_i16mf2(tmp);
}
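// The helpers above emulate RVV fractional-LMUL types (mf2/mf4) and their
// load/store/widening intrinsics with plain scalar loops, so that the rest of
// this header can use them even when the toolchain does not provide them natively.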
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        for (int i = 0; i < nlanes; ++i)
        {
            val[i] = v[i];
        }
    }
    operator vuint8m1_t() const
    {
        return vle8_v_u8m1(val);
    }
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        for (int i = 0; i < nlanes; ++i)
        {
            val[i] = v[i];
        }
    }
    operator vint8m1_t() const
    {
        return vle8_v_i8m1(val);
    }
    typedef ushort lane_type;

    vse16_v_u16m1(val, v);

    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        for (int i = 0; i < nlanes; ++i)
        {
            val[i] = v[i];
        }
    }
    operator vuint16m1_t() const
    {
        return vle16_v_u16m1(val);
    }
    typedef short lane_type;

    vse16_v_i16m1(val, v);

    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        for (int i = 0; i < nlanes; ++i)
        {
            val[i] = v[i];
        }
    }
    operator vint16m1_t() const
    {
        return vle16_v_i16m1(val);
    }
    typedef unsigned lane_type;

    vse32_v_u32m1(val, v);

    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        for (int i = 0; i < nlanes; ++i)
        {
            val[i] = v[i];
        }
    }
    operator vuint32m1_t() const
    {
        return vle32_v_u32m1(val);
    }
    unsigned get0() const
    typedef int lane_type;

    vse32_v_i32m1(val, v);

    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        for (int i = 0; i < nlanes; ++i)
        {
            val[i] = v[i];
        }
    }
    operator vint32m1_t() const
    {
        return vle32_v_i32m1(val);
    }
    typedef float lane_type;

    vse32_v_f32m1(val, v);

    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        for (int i = 0; i < nlanes; ++i)
        {
            val[i] = v[i];
        }
    }
    operator vfloat32m1_t() const
    {
        return vle32_v_f32m1(val);
    }
    typedef uint64 lane_type;

    vse64_v_u64m1(val, v);

    uint64 v[] = {v0, v1};
    for (int i = 0; i < nlanes; ++i)
    {
        val[i] = v[i];
    }
    operator vuint64m1_t() const
    {
        return vle64_v_u64m1(val);
    }
    typedef int64 lane_type;

    vse64_v_i64m1(val, v);

    int64 v[] = {v0, v1};
    for (int i = 0; i < nlanes; ++i)
    {
        val[i] = v[i];
    }
    operator vint64m1_t() const
    {
        return vle64_v_i64m1(val);
    }
    typedef double lane_type;

    vse64_v_f64m1(val, v);

    double v[] = {v0, v1};
    for (int i = 0; i < nlanes; ++i)
    {
        val[i] = v[i];
    }
    operator vfloat64m1_t() const
    {
        return vle64_v_f64m1(val);
    }
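// Each v_uintNxM / v_intNxM / v_floatNxM wrapper above keeps its lanes in a plain
// `val` array and converts to and from the corresponding native m1 register type
// through vle/vse, which is what lets the macros below pass the wrapper objects
// directly to the RVV intrinsics.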
#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, width, suffix1, suffix2) \
inline v_##_Tpvec v_setzero_##suffix1() \
{ \
    vsetvlmax_e##width##m1(); \
    return v_##_Tpvec(vzero_##suffix2##m1()); \
} \
inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
{ \
    vsetvlmax_e##width##m1(); \
    return v_##_Tpvec(vmv_v_x_##suffix2##m1(v)); \
}
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, 8, u8, u8)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, 8, s8, i8)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, 16, u16, u16)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, 16, s16, i16)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, 32, u32, u32)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, 32, s32, i32)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, 64, u64, u64)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, 64, s64, i64)
#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, width, suffix) \
inline v_##_Tpv v_setzero_##suffix() \
{ \
    vsetvlmax_e##width##m1(); \
    return v_##_Tpv(vzero_##suffix##m1()); \
} \
inline v_##_Tpv v_setall_##suffix(_Tp v) \
{ \
    vsetvlmax_e##width##m1(); \
    return v_##_Tpv(vfmv_v_f_##suffix##m1(v)); \
}
OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, 32, f32)
OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, 64, f64)
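// Illustrative usage (not part of the header): the initializers follow the
// universal-intrinsics naming, e.g.
//   v_uint8x16  z = v_setzero_u8();      // all lanes 0
//   v_float32x4 k = v_setall_f32(1.5f);  // all lanes 1.5f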
#define OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(_Tpvec, suffix) \
inline v_##_Tpvec v_reinterpret_as_##suffix(const v_##_Tpvec& v) { return v; }

OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint8x16, u8)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int8x16, s8)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint16x8, u16)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int16x8, s16)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint32x4, u32)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int32x4, s32)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float32x4, f32)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint64x2, u64)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int64x2, s64)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float64x2, f64)
#define OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(_Tpvec1, _Tpvec2, _nTpvec1, _nTpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
{ \
    vsetvlmax_e##width2##m1(); \
    return v_##_Tpvec1((_nTpvec1)vle##width2##_v_##nsuffix2##m1(v.val)); \
} \
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
{ \
    vsetvlmax_e##width1##m1(); \
    return v_##_Tpvec2((_nTpvec2)vle##width1##_v_##nsuffix1##m1(v.val)); \
}
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int8x16, vuint8m1_t, vint8m1_t, u8, s8, u8, i8, 8, 8)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int16x8, vuint16m1_t, vint16m1_t, u16, s16, u16, i16, 16, 16)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int32x4, vuint32m1_t, vint32m1_t, u32, s32, u32, i32, 32, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float32x4, vuint32m1_t, vfloat32m1_t, u32, f32, u32, f32, 32, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float32x4, vint32m1_t, vfloat32m1_t, s32, f32, i32, f32, 32, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int64x2, vuint64m1_t, vint64m1_t, u64, s64, u64, i64, 64, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint16x8, vuint8m1_t, vuint16m1_t, u8, u16, u8, u16, 8, 16)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint32x4, vuint8m1_t, vuint32m1_t, u8, u32, u8, u32, 8, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint64x2, vuint8m1_t, vuint64m1_t, u8, u64, u8, u64, 8, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint32x4, vuint16m1_t, vuint32m1_t, u16, u32, u16, u32, 16, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint64x2, vuint16m1_t, vuint64m1_t, u16, u64, u16, u64, 16, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, uint64x2, vuint32m1_t, vuint64m1_t, u32, u64, u32, u64, 32, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int16x8, vint8m1_t, vint16m1_t, s8, s16, i8, i16, 8, 16)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int32x4, vint8m1_t, vint32m1_t, s8, s32, i8, i32, 8, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int64x2, vint8m1_t, vint64m1_t, s8, s64, i8, i64, 8, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int32x4, vint16m1_t, vint32m1_t, s16, s32, i16, i32, 16, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int64x2, vint16m1_t, vint64m1_t, s16, s64, i16, i64, 16, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, int64x2, vint32m1_t, vint64m1_t, s32, s64, i32, i64, 32, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int16x8, vuint8m1_t, vint16m1_t, u8, s16, u8, i16, 8, 16)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int32x4, vuint8m1_t, vint32m1_t, u8, s32, u8, i32, 8, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int64x2, vuint8m1_t, vint64m1_t, u8, s64, u8, i64, 8, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int8x16, vuint16m1_t, vint8m1_t, u16, s8, u16, i8, 16, 8)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int32x4, vuint16m1_t, vint32m1_t, u16, s32, u16, i32, 16, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int64x2, vuint16m1_t, vint64m1_t, u16, s64, u16, i64, 16, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int8x16, vuint32m1_t, vint8m1_t, u32, s8, u32, i8, 32, 8)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int16x8, vuint32m1_t, vint16m1_t, u32, s16, u32, i16, 32, 16)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int64x2, vuint32m1_t, vint64m1_t, u32, s64, u32, i64, 32, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int8x16, vuint64m1_t, vint8m1_t, u64, s8, u64, i8, 64, 8)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int16x8, vuint64m1_t, vint16m1_t, u64, s16, u64, i16, 64, 16)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int32x4, vuint64m1_t, vint32m1_t, u64, s32, u64, i32, 64, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float32x4, vuint8m1_t, vfloat32m1_t, u8, f32, u8, f32, 8, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float32x4, vuint16m1_t, vfloat32m1_t, u16, f32, u16, f32, 16, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float32x4, vuint64m1_t, vfloat32m1_t, u64, f32, u64, f32, 64, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float32x4, vint8m1_t, vfloat32m1_t, s8, f32, i8, f32, 8, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float32x4, vint16m1_t, vfloat32m1_t, s16, f32, i16, f32, 16, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float32x4, vint64m1_t, vfloat32m1_t, s64, f32, i64, f32, 64, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float64x2, vuint64m1_t, vfloat64m1_t, u64, f64, u64, f64, 64, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float64x2, vint64m1_t, vfloat64m1_t, s64, f64, i64, f64, 64, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float64x2, vuint8m1_t, vfloat64m1_t, u8, f64, u8, f64, 8, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float64x2, vuint16m1_t, vfloat64m1_t, u16, f64, u16, f64, 16, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float64x2, vuint32m1_t, vfloat64m1_t, u32, f64, u32, f64, 32, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float64x2, vint8m1_t, vfloat64m1_t, s8, f64, i8, f64, 8, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float64x2, vint16m1_t, vfloat64m1_t, s16, f64, i16, f64, 16, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float64x2, vint32m1_t, vfloat64m1_t, s32, f64, i32, f64, 32, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(float32x4, float64x2, vfloat32m1_t, vfloat64m1_t, f32, f64, f32, f64, 32, 64)
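// Note: because the wrapper types keep their lanes in memory (`val`), every
// cross-type v_reinterpret_as_* above is a bit-preserving reload of that buffer
// at the destination element width.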
#define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, suffix, width, vmv) \
template <int s> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, s), b, _Tpvec::nlanes - s)); \
} \
template<int i> inline _Tp v_extract_n(_Tpvec v) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tp(vmv(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), v, i))); \
}
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8x16, uchar, u8, 8, vmv_x_s_u8m1_u8)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int8x16, schar, i8, 8, vmv_x_s_i8m1_i8)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint16x8, ushort, u16, 16, vmv_x_s_u16m1_u16)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int16x8, short, i16, 16, vmv_x_s_i16m1_i16)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32x4, uint, u32, 32, vmv_x_s_u32m1_u32)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32x4, int, i32, 32, vmv_x_s_i32m1_i32)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64x2, uint64, u64, 64, vmv_x_s_u64m1_u64)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64x2, int64, i64, 64, vmv_x_s_i64m1_i64)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32x4, float, f32, 32, vfmv_f_s_f32m1_f32)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64x2, double, f64, 64, vfmv_f_s_f64m1_f64)
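// Illustrative usage (not part of the header): v_extract_n<i> reads lane i as a
// scalar and v_extract<s> concatenates two vectors shifted by s lanes, e.g.
//   int x2 = v_extract_n<2>(v);        // third lane of a v_int32x4
//   v_int32x4 r = v_extract<1>(a, b);  // {a1, a2, a3, b0}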
#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, width, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ \
    return _Tpvec((_nTpvec)vle8_v_u8m1((uchar*)ptr)); \
} \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vle##width##_v_##suffix##m1(ptr)); \
} \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
    vsetvl_e##width##m1(hvl); \
    _Tpvec res = _Tpvec(vle##width##_v_##suffix##m1(ptr)); \
    vsetvlmax_e##width##m1(); \
    return res; \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ \
    vse8_v_u8m1((uchar*)ptr, vle8_v_u8m1((uchar*)a.val)); \
} \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    vse##width##_v_##suffix##m1(ptr, a); \
} \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    vse##width##_v_##suffix##m1(ptr, a); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode ) \
{ \
    vsetvlmax_e##width##m1(); \
    vse##width##_v_##suffix##m1(ptr, a); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ \
    _Tp CV_DECL_ALIGNED(32) tmp_ptr[_Tpvec::nlanes] = {0}; \
    vsetvlmax_e##width##m1(); \
    vse##width##_v_##suffix##m1(tmp_ptr, a); \
    for(int i = 0; i < _Tpvec::nlanes/2; ++i) \
    { \
        ptr[i] = tmp_ptr[i]; \
    } \
} \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    _Tp CV_DECL_ALIGNED(32) tmp_ptr[_Tpvec::nlanes] = {0}; \
    vsetvlmax_e##width##m1(); \
    vse##width##_v_##suffix##m1(tmp_ptr, a); \
    for(int i = 0; i < _Tpvec::nlanes/2; ++i) \
    { \
        ptr[i] = tmp_ptr[i+_Tpvec::nlanes/2]; \
    } \
}
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 8, u8)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 8, i8)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 16, u16)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 16, i16)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 32, u32)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 32, i32)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 64, u64)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 64, i64)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 32, f32)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 64, f64)
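// Note: the unaligned v_load/v_store above go through byte-wise vle8/vse8 on a
// casted pointer, the *_aligned variants use the natural element width, and
// v_store_low/v_store_high spill to a temporary buffer and copy half of the lanes.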
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr0[4], ptr0[5], ptr0[6], ptr0[7],
        ptr1[0], ptr1[1], ptr1[2], ptr1[3], ptr1[4], ptr1[5], ptr1[6], ptr1[7]
    };

    short CV_DECL_ALIGNED(32) elems[8] =
    {
        ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr1[0], ptr1[1], ptr1[2], ptr1[3]
    };

    int CV_DECL_ALIGNED(32) elems[4] =
    {
        ptr0[0], ptr0[1], ptr1[0], ptr1[1]
    };

    float CV_DECL_ALIGNED(32) elems[4] =
    {
        ptr0[0], ptr0[1], ptr1[0], ptr1[1]
    };

    int64 CV_DECL_ALIGNED(32) elems[2] =

    double CV_DECL_ALIGNED(32) elems[2] =
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =

inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =

inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =

inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =

inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =

inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =

inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
    int CV_DECL_ALIGNED(32) elems[4] =

inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    int CV_DECL_ALIGNED(32) elems[4] =

inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
    return v_int32x4(vle32_v_i32m1(tab + idx[0]));
}

inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
    int64_t CV_DECL_ALIGNED(32) elems[2] =

inline v_int64x2 v_lut_pairs(const int64* tab, const int* idx)
{
    return v_int64x2(vle64_v_i64m1(tab + idx[0]));
}

inline v_uint64x2 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
    float CV_DECL_ALIGNED(32) elems[4] =

inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
    float CV_DECL_ALIGNED(32) elems[4] =

inline v_float32x4 v_lut_quads(const float* tab, const int* idx)

    int CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[v_extract_n<0>(idxvec)],
        tab[v_extract_n<1>(idxvec)],
        tab[v_extract_n<2>(idxvec)],
        tab[v_extract_n<3>(idxvec)]
    };

    unsigned CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[v_extract_n<0>(idxvec)],
        tab[v_extract_n<1>(idxvec)],
        tab[v_extract_n<2>(idxvec)],
        tab[v_extract_n<3>(idxvec)]
    };

    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[v_extract_n<0>(idxvec)],
        tab[v_extract_n<1>(idxvec)],
        tab[v_extract_n<2>(idxvec)],
        tab[v_extract_n<3>(idxvec)]
    };

    int CV_DECL_ALIGNED(32) idx[4];
    x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
    y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
    double CV_DECL_ALIGNED(32) elems[2] =

inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)

    double CV_DECL_ALIGNED(32) elems[2] =
    {
        tab[v_extract_n<0>(idxvec)],
        tab[v_extract_n<1>(idxvec)]
    };

    int CV_DECL_ALIGNED(32) idx[4] = {0};

    ushort CV_DECL_ALIGNED(32) ptr[16] = {0};
    return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(ptr), 0));

    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(ptr), 0), 0));

    uint64 CV_DECL_ALIGNED(32) ptr[16] = {0};
    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(ptr), 0), 0), 0));
#define OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, width) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(intrin(a, b)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    a = _Tpvec(intrin(a, b)); \
    return a; \
}
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 64)
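// Note: + and - saturate for the 8/16-bit integer types (vsadd/vssub) but wrap
// for the 32/64-bit and floating-point types; v_add_wrap/v_sub_wrap further
// below provide the explicitly wrapping versions. Illustrative usage:
//   v_uint8x16 s = a + b;   // per-lane saturating add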
#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, width) \
OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, width) \
OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, width) \
OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, width) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vnot_v_##suffix##m1(a)); \
}
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 8)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 8)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 16)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, i16, 16)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 32)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 32)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 64)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 64)
#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    vsetvlmax_e32m1(); \
    return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    vsetvlmax_e32m1(); \
    a = v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b)))); \
    return a; \
}

OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(&, vand_vv_i32m1)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(|, vor_vv_i32m1)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(^, vxor_vv_i32m1)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a))));
}
#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    vsetvlmax_e64m1(); \
    return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    vsetvlmax_e64m1(); \
    a = v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b)))); \
    return a; \
}

OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(&, vand_vv_i64m1)
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(|, vor_vv_i64m1)
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(^, vxor_vv_i64m1)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a))));
}
#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, width) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
} \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n))); \
} \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
} \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n))); \
}

#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, width) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
} \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n))); \
} \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
} \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n))); \
}
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 8)
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 16)
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 32)
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 64)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 8)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 16)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 32)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 64)
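// Illustrative usage (not part of the header): both the operator and the
// template forms shift every lane by the same amount, e.g.
//   v_uint16x8 h = v_shr<1>(a);   // same as a >> 1, logical shift
//   v_int32x4  d = b >> 2;        // arithmetic shift for signed types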
#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, width) \
inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b), vzero_##suffix##m1(), 1)); \
}

#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, width) \
inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b), vzero_##suffix##m1(), 1)); \
}

#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, width)

#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, width)

#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, width)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64)
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32)
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64)
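// The comparison operators above return a vector of the same type with non-zero
// lanes where the predicate holds and zero lanes elsewhere, which is the form
// v_select() further below consumes.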
#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, width) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(intrin(a, b)); \
}
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 64)

OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 16)
#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, wwidth, red) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    vsetvlmax_e##wwidth##m1(); \
    _nwTpvec zero = vzero_##wsuffix##m1(); \
    _nwTpvec res = vzero_##wsuffix##m1(); \
    res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero); \
    return (scalartype)(_wTpvec(res).get0()); \
}
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8x16, v_uint16x8, vuint16m1_t, unsigned, u8, u16, 16, wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8x16, v_int16x8, vint16m1_t, int, i8, i16, 16, wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 32, wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 32, wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 64, wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 64, wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64x2, v_uint64x2, vuint64m1_t, uint64, u64, u64, 64, redsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64x2, v_int64x2, vint64m1_t, int64, i64, i64, 64, redsum)
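// Illustrative usage (not part of the header): v_reduce_sum accumulates all
// lanes into one scalar of the widened type, e.g.
//   unsigned s = v_reduce_sum(v_setall_u8(1));   // 16 lanes -> 16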
#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, width, red) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a)); \
    return scalartype(res.get0()); \
}
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, min, uchar, u8, 8, redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, min, schar, i8, 8, redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, min, ushort, u16, 16, redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 16, redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 32, redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 32, redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 32, fredmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, max, uchar, u8, 8, redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, max, schar, i8, 8, redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, max, ushort, u16, 16, redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 16, redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 32, redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 32, redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 32, fredmax)
    float CV_DECL_ALIGNED(32) elems[4] =

    return one / v_sqrt(x);

    return one / v_sqrt(x);

    v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a), b, b));

    return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a), b, b));

    v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a), b, b));

    return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a), b, b));

    return v_int32x4(vmacc_vv_i32m1(c, a, b));

    return v_fma(a, b, c);

    return v_fma(a, b, c);

    return v_fma(a, b, c);
#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, width) \
inline bool v_check_all(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a), shift)); \
    return (v.val[0] | v.val[1]) == 0; \
} \
inline bool v_check_any(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(a, shift)); \
    return (v.val[0] | v.val[1]) != 0; \
}
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 8)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 16)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 32)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 64)
#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \
inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_max(a, b) - v_min(a, b); \
}

OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs)
#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, width) \
inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _rTpvec(rshr((_nwTpvec)sub(v_max(a, b), v_min(a, b)), 0)); \
}

OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 8)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 16)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 32)
#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
inline _Tprvec v_abs(const _Tpvec& a) \
{ \
    return v_absdiff(a, v_setzero_##suffix()); \
}
#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \
inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_reduce_sum(v_absdiff(a, b)); \
}

OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8x16, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8x16, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16x8, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16x8, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32x4, unsigned)
#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, width) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(merge(ne(mask, 0), b, a)); \
}

OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 8)
OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 8)
OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 16)
OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 16)
OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 32)
OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 32)
OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 32)
OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 64)
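// Illustrative usage (not part of the header): v_select picks lanes from its
// second argument where the mask lane is non-zero, e.g.
//   v_float32x4 r = v_select(a > b, a, b);   // per-lane maximum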
#define OPENCV_HAL_IMPL_RVV_ROTATE_OP(_Tpvec, suffix, width) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, n)); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vslideup_vx_##suffix##m1(vzero_##suffix##m1(), a, n)); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, n), b, _Tpvec::nlanes - n)); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), b, _Tpvec::nlanes - n), a, n)); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }

OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint8x16, u8, 8)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int8x16, i8, 8)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint16x8, u16, 16)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int16x8, i16, 16)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint32x4, u32, 32)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int32x4, i32, 32)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_float32x4, f32, 32)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint64x2, u64, 64)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int64x2, i64, 64)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_float64x2, f64, 64)
    double arr[4] = {a.val[0], a.val[1], 0, 0};
    vfloat64m2_t tmp = vle64_v_f64m2(arr);

    double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
    vfloat64m2_t tmp = vle64_v_f64m2(arr);

    double CV_DECL_ALIGNED(32) ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a));
    double CV_DECL_ALIGNED(32) elems[2] =

    double CV_DECL_ALIGNED(32) ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a));
    double CV_DECL_ALIGNED(32) elems[2] =

    double CV_DECL_ALIGNED(32) ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a));
    double CV_DECL_ALIGNED(32) elems[2] =

    double CV_DECL_ALIGNED(32) ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a));
    double CV_DECL_ALIGNED(32) elems[2] =
#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) \
{ \
    return v_setall_##suffix(v_extract_n<i>(v)); \
}

OPENCV_HAL_IMPL_RVV_BROADCAST(v_int8x16, s8)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint16x8, u16)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int16x8, s16)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32x4, u32)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32x4, s32)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint64x2, u64)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int64x2, s64)
#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, _Tp, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
    _Tp CV_DECL_ALIGNED(32) elems0[4] = \
    { \
        v_extract_n<0>(a0), \
        v_extract_n<0>(a1), \
        v_extract_n<0>(a2), \
        v_extract_n<0>(a3) \
    }; \
    b0 = v_load(elems0); \
    _Tp CV_DECL_ALIGNED(32) elems1[4] = \
    { \
        v_extract_n<1>(a0), \
        v_extract_n<1>(a1), \
        v_extract_n<1>(a2), \
        v_extract_n<1>(a3) \
    }; \
    b1 = v_load(elems1); \
    _Tp CV_DECL_ALIGNED(32) elems2[4] = \
    { \
        v_extract_n<2>(a0), \
        v_extract_n<2>(a1), \
        v_extract_n<2>(a2), \
        v_extract_n<2>(a3) \
    }; \
    b2 = v_load(elems2); \
    _Tp CV_DECL_ALIGNED(32) elems3[4] = \
    { \
        v_extract_n<3>(a0), \
        v_extract_n<3>(a1), \
        v_extract_n<3>(a2), \
        v_extract_n<3>(a3) \
    }; \
    b3 = v_load(elems3); \
}

OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(int32x4, int, i32)
OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32)
#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, width, suffix) \
inline _Tpvec v_reverse(const _Tpvec& a) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptra[_Tpvec::nlanes] = {0}; \
    v_store(ptra, a); \
    for (int i = 0; i < _Tpvec::nlanes; i++) \
    { \
        ptr[i] = ptra[_Tpvec::nlanes-i-1]; \
    } \
    return v_load(ptr); \
}

OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8x16, uchar, 8, u8)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int8x16, schar, 8, i8)
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16x8, ushort, 16, u16)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, 16, i16)
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, 32, u32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, 32, i32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, 32, f32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64x2, uint64, 64, u64)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int64x2, int64, 64, i64)
OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, 64, f64)
#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
    _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
    v_store_low(lptr, a); \
    v_store_high(hptr, a); \
    b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr))); \
    b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr))); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
    v_store_low(lptr, a); \
    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr))); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
    v_store_high(hptr, a); \
    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr))); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr))); \
}

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    return v_uint32x4(vwcvtu_x_x_v_u32m1(vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(ptr))));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    return v_int32x4(vwcvt_x_x_v_i32m1(vwcvt_x_x_v_i16mf2(vle8_v_i8mf4(ptr))));
}
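// v_expand/v_load_expand widen each lane to the next element size (u8->u16,
// s16->s32, ...) through the mf2 widening converts defined earlier;
// v_load_expand_q widens by two steps (8-bit -> 32-bit) in one call.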
#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, shr) \
inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    vsetvlmax_e##width##m2(); \
    return _Tpvec(shr(vle##width##_v_##suffix##m2(arr), 0)); \
} \
inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
    vsetvlmax_e##width##m2(); \
    v_store(ptr, _Tpvec(shr(vle##width##_v_##suffix##m2(arr), 0))); \
} \
template<int n> inline \
_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    vsetvlmax_e##width##m2(); \
    return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr), n)); \
} \
template<int n> inline \
void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
    vsetvlmax_e##width##m2(); \
    v_store(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr), n))); \
}
OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 16, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1)
OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 16, i16, vnclip_wx_i8m1, vnclip_wx_i8m1)
OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 32, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1)
OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 32, i32, vnclip_wx_i16m1, vnclip_wx_i16m1)
OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 64, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1)
OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 64, i64, vnclip_wx_i32m1, vnsra_wx_i32m1)
#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, cast) \
inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    vsetvlmax_e##width##m2(); \
    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), 0)); \
} \
inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
    vsetvlmax_e##width##m2(); \
    v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), 0))); \
} \
template<int n> inline \
_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    vsetvlmax_e##width##m2(); \
    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), n)); \
} \
template<int n> inline \
void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
    vsetvlmax_e##width##m2(); \
    v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), n))); \
}
OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 16, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2)
OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 32, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2)
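// The pack family narrows two wide vectors into one: v_pack saturates to the
// narrower type, the v_rshr_pack* variants apply a rounding right shift by n
// first, and the *_u variants clamp signed inputs to an unsigned result (vmax
// with 0 before the narrowing clip).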
#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, width, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptra0[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptra1[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrb0[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrb1[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptra0, a0); \
    v_store(ptra1, a1); \
    int i; \
    for( i = 0; i < v_##_Tpvec::nlanes/2; i++ ) \
    { \
        ptrb0[i*2] = ptra0[i]; \
        ptrb0[i*2+1] = ptra1[i]; \
    } \
    for( ; i < v_##_Tpvec::nlanes; i++ ) \
    { \
        ptrb1[i*2-v_##_Tpvec::nlanes] = ptra0[i]; \
        ptrb1[i*2-v_##_Tpvec::nlanes+1] = ptra1[i]; \
    } \
    b0 = v_load(ptrb0); \
    b1 = v_load(ptrb1); \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \
    v_store_low(ptra, a); \
    v_store_low(ptrb, b); \
    return v_load_halves(ptra, ptrb); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \
    v_store_high(ptra, a); \
    v_store_high(ptrb, b); \
    return v_load_halves(ptra, ptrb); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c = v_combine_low(a, b); \
    d = v_combine_high(a, b); \
}
OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, 8, u8)
OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, 8, i8)
OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, 16, u16)
OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, 16, i16)
OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, 32, u32)
OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, 32, i32)
OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, 32, f32)
OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, 64, f64)
#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
    int i, i2; \
    for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
    { \
        ptra[i] = ptr[i2]; \
        ptrb[i] = ptr[i2+1]; \
    } \
    a = v_load(ptra); \
    b = v_load(ptrb); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
    int i, i3; \
    for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
    { \
        ptra[i] = ptr[i3]; \
        ptrb[i] = ptr[i3+1]; \
        ptrc[i] = ptr[i3+2]; \
    } \
    a = v_load(ptra); \
    b = v_load(ptrb); \
    c = v_load(ptrc); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
    int i, i4; \
    for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
    { \
        ptra[i] = ptr[i4]; \
        ptrb[i] = ptr[i4+1]; \
        ptrc[i] = ptr[i4+2]; \
        ptrd[i] = ptr[i4+3]; \
    } \
    a = v_load(ptra); \
    b = v_load(ptrb); \
    c = v_load(ptrc); \
    d = v_load(ptrd); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                hal::StoreMode =hal::STORE_UNALIGNED) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptra, a); \
    v_store(ptrb, b); \
    int i, i2; \
    for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
    { \
        ptr[i2] = ptra[i]; \
        ptr[i2+1] = ptrb[i]; \
    } \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, hal::StoreMode =hal::STORE_UNALIGNED) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptra, a); \
    v_store(ptrb, b); \
    v_store(ptrc, c); \
    int i, i3; \
    for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
    { \
        ptr[i3] = ptra[i]; \
        ptr[i3+1] = ptrb[i]; \
        ptr[i3+2] = ptrc[i]; \
    } \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
                                hal::StoreMode =hal::STORE_UNALIGNED ) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptra, a); \
    v_store(ptrb, b); \
    v_store(ptrc, c); \
    v_store(ptrd, d); \
    int i, i4; \
    for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
    { \
        ptr[i4] = ptra[i]; \
        ptr[i4+1] = ptrb[i]; \
        ptr[i4+2] = ptrc[i]; \
        ptr[i4+3] = ptrd[i]; \
    } \
} \
inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptrvec, vec); \
    for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) \
    { \
        ptr[4*i  ] = ptrvec[4*i  ]; \
        ptr[4*i+1] = ptrvec[4*i+2]; \
        ptr[4*i+2] = ptrvec[4*i+1]; \
        ptr[4*i+3] = ptrvec[4*i+3]; \
    } \
    return v_load(ptr); \
} \
inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptrvec, vec); \
    for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) \
    { \
        ptr[8*i  ] = ptrvec[4*i  ]; \
        ptr[8*i+1] = ptrvec[4*i+4]; \
        ptr[8*i+2] = ptrvec[4*i+1]; \
        ptr[8*i+3] = ptrvec[4*i+5]; \
        ptr[8*i+4] = ptrvec[4*i+2]; \
        ptr[8*i+5] = ptrvec[4*i+6]; \
        ptr[8*i+6] = ptrvec[4*i+3]; \
        ptr[8*i+7] = ptrvec[4*i+7]; \
    } \
    return v_load(ptr); \
}
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar, u8, 8)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar, i8, 8)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort, u16, 16)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short, i16, 16)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned, u32, 32)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int, i32, 32)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float, f32, 32)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64, u64, 64)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64, i64, 64)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, double, f64, 64)
static const unsigned char popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
2336#define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \
2337inline _rTpvec v_popcount(const _Tpvec& a) \
2339 uchar CV_DECL_ALIGNED(32) ptra[16] = {0}; \
2340 v_store(ptra, v_reinterpret_as_u8(a)); \
2341 _rTp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
2342 v_store(ptr, v_setzero_##suffix()); \
2343 for (int i = 0; i < _Tpvec::nlanes*(int)sizeof(_Tp); i++) \
2344 ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]; \
2345 return v_load(ptr); \
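// v_signmask packs the most significant (sign) bit of every lane into the corresponding bit of
// an int: the unsigned variants shift each lane right by (width-1) and OR the bits together.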
2359#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, width, shift) \
2360inline int v_signmask(const _Tpvec& a) \
{ \
    int mask = 0; \
2363    vsetvlmax_e##width##m1(); \
2364    _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift)); \
2365    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
2366        mask |= (int)(tmp.val[i]) << i; \
    return mask; \
}
2370OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 8, 7)
2371OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 16, 15)
2372OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 32, 31)
2373OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 64, 63)
inline int v_signmask(const v_int8x16& a)
2376{ return v_signmask(v_reinterpret_as_u8(a)); }
inline int v_signmask(const v_int16x8& a)
2378{ return v_signmask(v_reinterpret_as_u16(a)); }
inline int v_signmask(const v_int32x4& a)
2380{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
2382{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_int64x2& a)
2384{ return v_signmask(v_reinterpret_as_u64(a)); }
inline int v_signmask(const v_float64x2& a)
2387{ return v_signmask(v_reinterpret_as_u64(a)); }
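// v_scan_forward returns the index of the first lane whose sign bit is set (value < 0),
// or 0 when no lane is negative.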
2393#define OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(_Tpvec, _Tp, suffix) \
2394inline int v_scan_forward(const _Tpvec& a) \
{ \
2396    _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
2397    v_store(ptr, v_reinterpret_as_##suffix(a)); \
2398    for (int i = 0; i < _Tpvec::nlanes; i++) \
2399        if(int(ptr[i]) < 0) \
            return i; \
    return 0; \
}
2404OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint8x16, uchar, u8)
2405OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int8x16, schar, s8)
2406OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint16x8, ushort, u16)
2407OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int16x8, short, s16)
2408OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint32x4, unsigned, u32)
2409OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int32x4, int, s32)
2410OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float32x4, float, f32)
2411OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint64x2, uint64, u64)
2412OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int64x2, int64, s64)
2414OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64)
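// v_pack_triplets keeps the first three lanes of every group of four and packs them
// contiguously; the unused tail lanes of the result stay zero-initialized.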
2419#define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, _Tp) \
2420inline _Tpvec v_pack_triplets(const _Tpvec& vec) \
2422 _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
2423 _Tp CV_DECL_ALIGNED(32) ptrvec[_Tpvec::nlanes] = {0}; \
2424 v_store(ptrvec, vec); \
2425 for (int i = 0; i < _Tpvec::nlanes/4; i++) \
2427 ptr[3*i ] = ptrvec[4*i ]; \
2428        ptr[3*i+1] = ptrvec[4*i+1]; \
2429 ptr[3*i+2] = ptrvec[4*i+2]; \
2431 return v_load(ptr); \
2434OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8x16, uchar)
2435OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8x16, schar)
2436OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16x8, ushort)
2437OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16x8, short)
2438OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32x4, unsigned)
2439OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32x4, int)
2440OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32x4, float)
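// float16 support: v_load_expand widens four float16_t values into a v_float32x4 and
// v_pack_store narrows a v_float32x4 back to float16_t. The second pair of definitions is
// presumably the scalar fallback used when native half-float conversions are unavailable.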
2448 return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr)));
2451inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
2453    vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v));
2460    for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
2464inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
2469    for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
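// Rounding/truncating conversions from v_float64x2 load the input lanes (zero-padded to four)
// into a temporary vfloat64m2_t and use a narrowing convert to produce a full v_int32x4.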
2500 return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a));
2505 double arr[4] = {a.val[0], a.val[1], 0, 0};
2507 vfloat64m2_t tmp = vle64_v_f64m2(arr);
2508 return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
2513 double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
2515 vfloat64m2_t tmp = vle64_v_f64m2(arr);
2516 return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
2521 double arr[4] = {a.val[0]-0.5f, a.val[1]-0.5f, 0, 0};
2523 vfloat64m2_t tmp = vle64_v_f64m2(arr);
2524 return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
2529 double arr[4] = {a.val[0]+0.5f, a.val[1]+0.5f, 0, 0};
2531 vfloat64m2_t tmp = vle64_v_f64m2(arr);
2532 return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
2537 double arr[4] = {a.val[0], a.val[1], 0, 0};
2539 vfloat64m2_t tmp = vle64_v_f64m2(arr);
2540 return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp));
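// Dot products: each variant multiplies with a widening vwmul/vqmacc intrinsic, spills the wide
// products to a small aligned buffer, then reloads and sums the partial vectors to form the result.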
2550 int CV_DECL_ALIGNED(32) ptr[8] = {0};
2553 vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
2559 int CV_DECL_ALIGNED(32) ptr[8] = {0};
2562 vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
2570 int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
2573 vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
2579 int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
2582 vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
2590 unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
2593 vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
2595 return t1 + t2 + t3 + t4;
2600 unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
2603 vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
2605 return t1 + t2 + t3 + t4 + c;
2610 int CV_DECL_ALIGNED(32) ptr[16] = {0};
2613 vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
2615 return t1 + t2 + t3 + t4;
2620 int CV_DECL_ALIGNED(32) ptr[16] = {0};
2623 vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
2625 return t1 + t2 + t3 + t4 + c;
2631 uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2634 vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
2636 return t1 + t2 + t3 + t4;
2640 uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2643 vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
2645 return t1 + t2 + t3 + t4 + c;
2650 int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2653 vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
2655 return t1 + t2 + t3 + t4;
2660 int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2663 vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
2665 return t1 + t2 + t3 + t4 + c;
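// The *_fast variants compute the same dot products but may accumulate lanes in a different
// order, so they can skip the pairwise regrouping done by the exact versions above.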
2682 int CV_DECL_ALIGNED(32) ptr[8] = {0};
2684 vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
2691 int CV_DECL_ALIGNED(32) ptr[8] = {0};
2693 vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
2702 int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
2704 vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
2711 int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
2713 vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
2723 unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
2725 vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
2730 return t1 + t2 + t3 + t4;
2734 unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
2736 vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
2741 return t1 + t2 + t3 + t4 + c;
2745 int CV_DECL_ALIGNED(32) ptr[16] = {0};
2747 vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
2752 return t1 + t2 + t3 + t4;
2756 int CV_DECL_ALIGNED(32) ptr[16] = {0};
2758 vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
2763 return t1 + t2 + t3 + t4 + c;
2769 uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2771 vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
2776 return t1 + t2 + t3 + t4;
2780 uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2782 vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
2787 return t1 + t2 + t3 + t4 + c;
2791 int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2793 vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
2798 return t1 + t2 + t3 + t4;
2802 int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2804 vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
2809 return t1 + t2 + t3 + t4 + c;
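// v_matmul: result = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, built from vfmul/vfmacc with each
// lane of v broadcast as a scalar; v_matmuladd uses three products plus an additive vector.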
2826 vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v));
2827 res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1);
2828 res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2);
2829 res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3);
2838 vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v));
2839 res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1);
2840 res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2);
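// v_mul_expand: full-width multiply; the widened products are written to a temporary buffer and
// then split into the low-half (c) and high-half (d) result vectors.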
2844#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width) \
2845inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
2847 _Tpw CV_DECL_ALIGNED(32) ptr[_Tpwvec::nlanes*2] = {0}; \
2848 vsetvlmax_e##width##m2(); \
2849 vse##width##_v_##suffix##m2(ptr, wmul(a, b)); \
2850 vsetvlmax_e##width##m1(); \
2851 c = _Tpwvec(vle##width##_v_##suffix##m1(ptr)); \
2852 d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes)); \
2865 return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b), 16));
2870 return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b), 16));
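// v_mul_hi above keeps only the upper half of each widened product. The saturating operator*
// below multiplies via v_mul_expand and then packs the wide result back with saturation.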
2876#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _wTpvec) \
2877inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _wTpvec c, d; \
2880    v_mul_expand(a, b, c, d); \
2881    return v_pack(c, d); \
} \
2883inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ \
    a = a * b; \
    return a; \
}
2895inline void v_cleanup() {}
2897CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END