OpenCV 4.5.3 (Japanese machine translation)
intrin_rvv071.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
4
5 // Copyright (C) 2015, PingTouGe Semiconductor Co., Ltd., all rights reserved.
6
7 #ifndef OPENCV_HAL_INTRIN_RISCVV_HPP
8 #define OPENCV_HAL_INTRIN_RISCVV_HPP
9
10 #include <float.h>
11 #include <algorithm>
12 #include "opencv2/core/utility.hpp"
13
14 namespace cv
15{
16
18
19CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
20
21 #define CV_SIMD128 1
22 #define CV_SIMD128_64F 1
24 struct v_uint8x16
25{
26 typedef uchar lane_type;
27 enum { nlanes = 16 };
28
29 v_uint8x16() {}
30 explicit v_uint8x16(vuint8m1_t v) : val(v) {}
31 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
32 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
33 {
34 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
35 val = (vuint8m1_t)vle_v_u8m1((unsigned char*)v, 16);
36 }
37 uchar get0() const
38 {
39 return vmv_x_s_u8m1_u8(val, 16);
40 }
41
42 vuint8m1_t val;
43};
44
45 struct v_int8x16
46{
47 typedef schar lane_type;
48 enum { nlanes = 16 };
49
50 v_int8x16() {}
51 explicit v_int8x16(vint8m1_t v) : val(v) {}
52 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
53 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
54 {
55 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
56 val = (vint8m1_t)vle_v_i8m1((schar*)v, 16);
57 }
58 schar get0() const
59 {
60 return vmv_x_s_i8m1_i8(val, 16);
61 }
62
63 vint8m1_t val;
64};
65
66 struct v_uint16x8
67{
68 typedef ushort lane_type;
69 enum { nlanes = 8 };
70
71 v_uint16x8() {}
72 explicit v_uint16x8(vuint16m1_t v) : val(v) {}
73 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
74 {
75 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
76 val = (vuint16m1_t)vle_v_u16m1((unsigned short*)v, 8);
77 }
78 ushort get0() const
79 {
80 return vmv_x_s_u16m1_u16(val, 8);
81 }
82
83 vuint16m1_t val;
84};
85
86 struct v_int16x8
87{
88 typedef short lane_type;
89 enum { nlanes = 8 };
90
91 v_int16x8() {}
92 explicit v_int16x8(vint16m1_t v) : val(v) {}
93 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
94 {
95 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
96 val = (vint16m1_t)vle_v_i16m1((signed short*)v, 8);
97 }
98 short get0() const
99 {
100 return vmv_x_s_i16m1_i16(val, 8);
101 }
102
103 vint16m1_t val;
104};
105
106 struct v_uint32x4
107{
108 typedef unsigned lane_type;
109 enum { nlanes = 4 };
110
111 v_uint32x4() {}
112 explicit v_uint32x4(vuint32m1_t v) : val(v) {}
113 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
114 {
115 unsigned v[] = {v0, v1, v2, v3};
116 val = (vuint32m1_t)vle_v_u32m1((unsigned int*)v, 4);
117 }
118 unsigned get0() const
119 {
120 return vmv_x_s_u32m1_u32(val, 4);
121 }
122
123 vuint32m1_t val;
124};
125
126 struct v_int32x4
127{
128 typedef int lane_type;
129 enum { nlanes = 4 };
130
131 v_int32x4() {}
132 explicit v_int32x4(vint32m1_t v) : val(v) {}
133 v_int32x4(int v0, int v1, int v2, int v3)
134 {
135 int v[] = {v0, v1, v2, v3};
136 val = (vint32m1_t)vle_v_i32m1((signed int*)v, 4);
137 }
138 int get0() const
139 {
140 return vmv_x_s_i32m1_i32(val, 4);
141 }
142 vint32m1_t val;
143};
144
145 struct v_float32x4
146{
147 typedef float lane_type;
148 enum { nlanes = 4 };
149
150 v_float32x4() {}
151 explicit v_float32x4(vfloat32m1_t v) : val(v) {}
152 v_float32x4(float v0, float v1, float v2, float v3)
153 {
154 float v[] = {v0, v1, v2, v3};
155 val = (vfloat32m1_t)vle_v_f32m1((float*)v, 4);
156 }
157 float get0() const
158 {
159 return vfmv_f_s_f32m1_f32(val, 4);
160 }
161 vfloat32m1_t val;
162};
163
164 struct v_uint64x2
165{
166 typedef uint64 lane_type;
167 enum { nlanes = 2 };
168
169 v_uint64x2() {}
170 explicit v_uint64x2(vuint64m1_t v) : val(v) {}
171 v_uint64x2(uint64 v0, uint64 v1)
172 {
173 uint64 v[] = {v0, v1};
174 val = (vuint64m1_t)vle_v_u64m1((unsigned long*)v, 2);
175 }
176 uint64 get0() const
177 {
178 return vmv_x_s_u64m1_u64(val, 2);
179 }
180 vuint64m1_t val;
181};
182
183 struct v_int64x2
184{
185 typedef int64 lane_type;
186 enum { nlanes = 2 };
187
188 v_int64x2() {}
189 explicit v_int64x2(vint64m1_t v) : val(v) {}
190 v_int64x2(int64 v0, int64 v1)
191 {
192 int64 v[] = {v0, v1};
193 val = (vint64m1_t)vle_v_i64m1((long*)v, 2);
194 }
195 int64 get0() const
196 {
197 return vmv_x_s_i64m1_i64(val, 2);
198 }
199 vint64m1_t val;
200};
201
202 struct v_float64x2
203{
204 typedef double lane_type;
205 enum { nlanes = 2 };
206
207 v_float64x2() {}
208 explicit v_float64x2(vfloat64m1_t v) : val(v) {}
209 v_float64x2(double v0, double v1)
210 {
211 double v[] = {v0, v1};
212 val = (vfloat64m1_t)vle_v_f64m1((double*)v, 2);
213 }
214 double get0() const
215 {
216 return vfmv_f_s_f64m1_f64(val, 2);
217 }
218 vfloat64m1_t val;
219};
220
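The wrapper structs above are the 128-bit register types of OpenCV's universal intrinsics. A minimal usage sketch (illustrative only, not part of this header; it assumes the public entry point "opencv2/core/hal/intrin.hpp", which selects this backend on RISC-V V builds):

#include "opencv2/core/hal/intrin.hpp"

void sketch_types()
{
    using namespace cv;
    v_float32x4 f(1.f, 2.f, 3.f, 4.f); // four float lanes in one vector register
    v_int32x4   i(1, 2, 3, 4);         // four 32-bit integer lanes
    float first = f.get0();            // get0() reads lane 0 -> 1.f
}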
221 #define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \
222 inline _Tp##m1_t vreinterpretq_##suffix##_##suffix(_Tp##m1_t v) { return v; } \
223 inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \
224 inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \
225 inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \
226 inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8((vint16m1_t)(v.val)); } \
227 inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \
228 inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \
229 inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \
230 inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2((vint64m1_t)(v.val)); } \
231 inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4((vfloat32m1_t)(v.val)); }\
232 inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2((vfloat64m1_t)(v.val)); }
233
234
235OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8)
236OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, s8)
237OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16)
238OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, s16)
239OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32)
240OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, s32)
241OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64)
242OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, s64)
243OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64)
244OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32)
245 #define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
246 inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num((v##_Tp##m1_t){0}); } \
247 inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }
248
249OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
250OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16)
251OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
252OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
253OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
254OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4)
255OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2)
256OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2)
257inline v_float32x4 v_setzero_f32() { return v_float32x4((vfloat32m1_t){0}); }
258 inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, 4)); }
259
260 inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); }
261 inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, 2)); }
262
263
264 #define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
265 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
266 { \
267 return _Tpvec(intrin(a.val, b.val)); \
268 } \
269 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
270 { \
271 a.val = intrin(a.val, b.val); \
272 return a; \
273 }
274
275 #define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
276 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
277 { \
278 return _Tpvec(intrin(a.val, b.val, num)); \
279 } \
280 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
281 { \
282 a.val = intrin(a.val, b.val, num); \
283 return a; \
284 }
285
286OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
287OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
288OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
289OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
290OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
291OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
292OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
293OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
294OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vsadd_vv_i32m1, 4)
295OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vssub_vv_i32m1, 4)
296OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
297OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
298OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
299OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
300OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vsadd_vv_i64m1, 2)
301OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vssub_vv_i64m1, 2)
302OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
303OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
304OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
305OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
306OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
307inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
308{
309 return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
310}
311 inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
312{
313 a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
314 return a;
315}
316
317OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
318OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
319OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
320inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
321{
322 return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
323}
324 inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
325{
326 a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
327 return a;
328}
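A short sketch of the operators defined above (same assumed setup as the first sketch): the 8- and 16-bit integer + and - map to saturating instructions, so lanes clamp instead of wrapping.

void sketch_arith()
{
    using namespace cv;
    v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
    v_uint8x16 s = a + b;                          // every lane saturates to 255
    v_float32x4 x = v_setall_f32(1.f), y = v_setall_f32(4.f);
    v_float32x4 q = x / y;                         // 0.25f in every lane
}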
329 // TODO: exp, log, sin, cos
330
331 #define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
332 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
333 { \
334 return _Tpvec(intrin(a.val, b.val)); \
335 }
336
337 #define OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(_Tpvec, func, intrin, num) \
338 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
339 { \
340 return _Tpvec(intrin(a.val, b.val, num)); \
341 }
342OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
343OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
344OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
345OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
346OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
347OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
348OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
349OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
350OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
351OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
352OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
353OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
354OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
355OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
356OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
357OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
358
359inline v_float32x4 v_sqrt(const v_float32x4& x)
360{
361 return v_float32x4(vfsqrt_v_f32m1(x.val, 4));
362}
363
364 inline v_float32x4 v_invsqrt(const v_float32x4& x)
365{
366 return v_float32x4(vfrdiv_vf_f32m1(vfsqrt_v_f32m1(x.val, 4), 1, 4));
367}
368
369 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
370{
371 v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
372 return v_sqrt(x);
373}
374
375 inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
376{
377 return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
378}
379
380 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
381{
382 return v_float32x4(vfmacc_vv_f32m1(c.val, a.val, b.val, 4));
383}
384
385 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
386{
387 return v_int32x4(vmacc_vv_i32m1(c.val, a.val, b.val, 4));
388}
389
390 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
391{
392 return v_fma(a, b, c);
393}
394
395 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
396{
397 return v_fma(a, b, c);
398}
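v_fma and v_muladd above compute a*b + c per lane (v_muladd simply forwards to v_fma in this backend). A sketch, same assumed setup as the first sketch:

void sketch_fma()
{
    using namespace cv;
    v_float32x4 a = v_setall_f32(2.f), b = v_setall_f32(3.f), c = v_setall_f32(1.f);
    v_float32x4 r  = v_fma(a, b, c);                                              // 2*3 + 1 = 7 in every lane
    v_int32x4   ri = v_muladd(v_setall_s32(2), v_setall_s32(3), v_setall_s32(1)); // 7 in every lane
}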
399
400 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
401 const v_float32x4& m1, const v_float32x4& m2,
402 const v_float32x4& m3)
403{
404 vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);//vmuli_f32(m0.val, v.val, 0);
405 res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
406 res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
407 res = vfmacc_vf_f32m1(res, v.val[3], m3.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
408 return v_float32x4(res);
409}
410
411 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
412 const v_float32x4& m1, const v_float32x4& m2,
413 const v_float32x4& a)
414{
415 vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);//vmuli_f32(m0.val, v.val, 0);
416 res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
417 res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
418 res = vfadd_vv_f32m1(res, a.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
419 return v_float32x4(res);
420}
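v_matmul treats m0..m3 as the columns of a 4x4 matrix and v as the coefficient vector, returning v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3; v_matmuladd replaces the last column term with a plain additive vector. A sketch (same assumed setup):

void sketch_matmul()
{
    using namespace cv;
    v_float32x4 m0(1.f, 0.f, 0.f, 0.f), m1(0.f, 1.f, 0.f, 0.f);
    v_float32x4 m2(0.f, 0.f, 1.f, 0.f), m3(0.f, 0.f, 0.f, 1.f);
    v_float32x4 v(4.f, 3.f, 2.f, 1.f);
    v_float32x4 r = v_matmul(v, m0, m1, m2, m3);   // identity columns -> r == v
}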
421
422 inline v_float64x2 v_sqrt(const v_float64x2& x)
423{
424 return v_float64x2(vfsqrt_v_f64m1(x.val, 2));
425}
426
427 inline v_float64x2 v_invsqrt(const v_float64x2& x)
428{
429 return v_float64x2(vfrdiv_vf_f64m1(vfsqrt_v_f64m1(x.val, 2), 1, 2));
430}
431
432 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
433{
434 v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
435 return v_sqrt(x);
436}
437
438 inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
439{
440 return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
441}
442
443 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
444{
445 return v_float64x2(vfmacc_vv_f64m1(c.val, a.val, b.val, 2));
446}
447
448 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
449{
450 return v_fma(a, b, c);
451}
452
453 #define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
454 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
455 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
456 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
457 inline _Tpvec operator ~ (const _Tpvec & a) \
458 { \
459 return _Tpvec(vnot_v_##suffix(a.val, num)); \
460 }
461
462OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint8x16, u8m1, 16)
463OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint16x8, u16m1, 8)
464OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint32x4, u32m1, 4)
465OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint64x2, u64m1, 2)
466OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int8x16, i8m1, 16)
467OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int16x8, i16m1, 8)
468OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4)
469OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2)
470
471 #define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
472 inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
473 { \
474 return v_float32x4(vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4))); \
475 } \
476 inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
477 { \
478 a.val = vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4)); \
479 return a; \
480 }
481
482OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
483OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
484OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)
485
486 inline v_float32x4 operator ~ (const v_float32x4& a)
487{
488 return v_float32x4((vfloat32m1_t)(vnot_v_i32m1((vint32m1_t)(a.val), 4)));
489}
490
491 #define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
492 inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
493 { \
494 return v_float64x2(vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2))); \
495 } \
496 inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
497 { \
498 a.val = vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2)); \
499 return a; \
500 }
501
502OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
503OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
504OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)
505
506 inline v_float64x2 operator ~ (const v_float64x2& a)
507{
508 return v_float64x2((vfloat64m1_t)(vnot_v_i64m1((vint64m1_t)(a.val), 2)));
509}
510 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
511{
512 return v_int16x8(vmulh_vv_i16m1(a.val, b.val, 8));
513}
514 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
515{
516 return v_uint16x8(vmulhu_vv_u16m1(a.val, b.val, 8));
517}
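v_mul_hi returns the upper half of the widened 32-bit product of each 16-bit lane. A sketch (same assumed setup):

void sketch_mul_hi()
{
    using namespace cv;
    v_int16x8 a = v_setall_s16(300), b = v_setall_s16(400);
    v_int16x8 h = v_mul_hi(a, b);   // (300*400) >> 16 = 120000 >> 16 = 1 in every lane
}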
518
519 //#define OPENCV_HAL_IMPL_RISCVV_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
520 //inline _Tpuvec v_abs(const _Tpsvec& a) { \
521 // E##xm1_t mask=vmflt_vf_e32xm1_f32m1(x.val, 0.0, 4);
522
523 //OPENCV_HAL_IMPL_RISCVV_ABS(v_uint8x16, v_int8x16, u8, s8)
524 //OPENCV_HAL_IMPL_RISCVV_ABS(v_uint16x8, v_int16x8, u16, s16)
525 //OPENCV_HAL_IMPL_RISCVV_ABS(v_uint32x4, v_int32x4, u32, s32)
526
527 inline v_uint32x4 v_abs(v_int32x4 x)
528{
529 vbool32_t mask=vmslt_vx_i32m1_b32(x.val, 0, 4);
530 return v_uint32x4((vuint32m1_t)vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4));
531}
532
533 inline v_uint16x8 v_abs(v_int16x8 x)
534{
535 vbool16_t mask=vmslt_vx_i16m1_b16(x.val, 0, 8);
536 return v_uint16x8((vuint16m1_t)vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8));
537}
538
539 inline v_uint8x16 v_abs(v_int8x16 x)
540{
541 vbool8_t mask=vmslt_vx_i8m1_b8(x.val, 0, 16);
542 return v_uint8x16((vuint8m1_t)vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16));
543}
544
545 inline v_float32x4 v_abs(v_float32x4 x)
546{
547 return (v_float32x4)vfsgnjx_vv_f32m1(x.val, x.val, 4);
548}
549
550 inline v_float64x2 v_abs(v_float64x2 x)
551{
552 return (v_float64x2)vfsgnjx_vv_f64m1(x.val, x.val, 2);
553}
554
555 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
556{
557 vfloat32m1_t ret = vfsub_vv_f32m1(a.val, b.val, 4);
558 return (v_float32x4)vfsgnjx_vv_f32m1(ret, ret, 4);
559}
560
561 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
562{
563 vfloat64m1_t ret = vfsub_vv_f64m1(a.val, b.val, 2);
564 return (v_float64x2)vfsgnjx_vv_f64m1(ret, ret, 2);
565}
566
567 #define OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(bit, num) \
568 inline v_uint##bit##x##num v_absdiff(v_uint##bit##x##num a, v_uint##bit##x##num b){ \
569 vuint##bit##m1_t vmax = vmaxu_vv_u##bit##m1(a.val, b.val, num); \
570 vuint##bit##m1_t vmin = vminu_vv_u##bit##m1(a.val, b.val, num); \
571 return v_uint##bit##x##num(vsub_vv_u##bit##m1(vmax, vmin, num));\
572 }
573
574OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(8, 16)
575OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(16, 8)
576OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(32, 4)
577
579 inline v_int8x16 v_absdiffs(v_int8x16 a, v_int8x16 b){
580 vint8m1_t vmax = vmax_vv_i8m1(a.val, b.val, 16);
581 vint8m1_t vmin = vmin_vv_i8m1(a.val, b.val, 16);
582 return v_int8x16(vssub_vv_i8m1(vmax, vmin, 16));
583}
584 inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b){
585 vint16m1_t vmax = vmax_vv_i16m1(a.val, b.val, 8);
586 vint16m1_t vmin = vmin_vv_i16m1(a.val, b.val, 8);
587 return v_int16x8(vssub_vv_i16m1(vmax, vmin, 8));
588}
589
590 #define OPENCV_HAL_IMPL_RISCVV_ABSDIFF(_Tpvec, _Tpv, num) \
591 inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){ \
592 vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\
593 vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\
594 return v_uint##_Tpvec((vuint##_Tpv##_t)vsub_vv_i##_Tpv(max, min, num)); \
595 }
596
597OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16)
598OPENCV_HAL_IMPL_RISCVV_ABSDIFF(16x8, 16m1, 8)
599OPENCV_HAL_IMPL_RISCVV_ABSDIFF(32x4, 32m1, 4)
600
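v_absdiff computes |a - b| as max minus min, so unsigned lanes never overflow and signed inputs yield an unsigned result. A sketch (same assumed setup):

void sketch_absdiff()
{
    using namespace cv;
    v_uint8x16 a = v_setall_u8(10), b = v_setall_u8(250);
    v_uint8x16 d = v_absdiff(a, b);              // 240 in every lane
    v_int8x16  x = v_setall_s8(-100), y = v_setall_s8(100);
    v_uint8x16 u = v_absdiff(x, y);              // 200, returned as unsigned lanes
}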
601 // Multiply and expand
602inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
603 v_int16x8& c, v_int16x8& d)
604{
605 vint16m2_t res = vundefined_i16m2();
606 res = vwmul_vv_i16m2(a.val, b.val, 16);
607 c.val = vget_i16m2_i16m1(res, 0);
608 d.val = vget_i16m2_i16m1(res, 1);
609}
610
611 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
612 v_uint16x8& c, v_uint16x8& d)
613{
614 vuint16m2_t res = vundefined_u16m2();
615 res = vwmulu_vv_u16m2(a.val, b.val, 16);
616 c.val = vget_u16m2_u16m1(res, 0);
617 d.val = vget_u16m2_u16m1(res, 1);
618}
619
620 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
621 v_int32x4& c, v_int32x4& d)
622{
623 vint32m2_t res = vundefined_i32m2();
624 res = vwmul_vv_i32m2(a.val, b.val, 8);
625 c.val = vget_i32m2_i32m1(res, 0);
626 d.val = vget_i32m2_i32m1(res, 1);
627}
628
629 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
630 v_uint32x4& c, v_uint32x4& d)
631{
632 vuint32m2_t res = vundefined_u32m2();
633 res = vwmulu_vv_u32m2(a.val, b.val, 8);
634 c.val = vget_u32m2_u32m1(res, 0);
635 d.val = vget_u32m2_u32m1(res, 1);
636}
637
638 inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b,
639 v_int64x2& c, v_int64x2& d)
640{
641 vint64m2_t res = vundefined_i64m2();
642 res = vwmul_vv_i64m2(a.val, b.val, 4);
643 c.val = vget_i64m2_i64m1(res, 0);
644 d.val = vget_i64m2_i64m1(res, 1);
645}
646
647 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
648 v_uint64x2& c, v_uint64x2& d)
649{
650 vuint64m2_t res = vundefined_u64m2();
651 res = vwmulu_vv_u64m2(a.val, b.val, 4);
652 c.val = vget_u64m2_u64m1(res, 0);
653 d.val = vget_u64m2_u64m1(res, 1);
654}
655
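v_mul_expand produces the full-width products: the first output receives the widened products of the low lanes, the second those of the high lanes. A sketch (same assumed setup):

void sketch_mul_expand()
{
    using namespace cv;
    v_uint16x8 a = v_setall_u16(1000), b = v_setall_u16(1000);
    v_uint32x4 lo, hi;
    v_mul_expand(a, b, lo, hi);   // 1000000 in every 32-bit lane; lo = lanes 0..3, hi = lanes 4..7
}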
656OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
657OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
658OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
659OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
660OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
661OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
662OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
663OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
664OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
665OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
666OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
667OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
669 // 16 >> 32
670inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
671{
672 vint32m2_t res = vundefined_i32m2();
673 res = vwmul_vv_i32m2(a.val, b.val, 8);
674 res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
675 return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0), vget_i32m2_i32m1(res, 1), 4));
676}
677 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
678{
679 vint32m2_t res = vundefined_i32m2();
680 res = vwmul_vv_i32m2(a.val, b.val, 8);
681 res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
682 return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0),vget_i32m2_i32m1(res, 1), 4), c.val, 4));
683}
684
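v_dotprod multiplies corresponding lanes and sums adjacent pairs into the doubled element width; the three-argument form adds the accumulator c on top. A sketch (same assumed setup):

void sketch_dotprod()
{
    using namespace cv;
    v_int16x8 a(1, 2, 3, 4, 5, 6, 7, 8);
    v_int16x8 b = v_setall_s16(1);
    v_int32x4 d   = v_dotprod(a, b);      // {1+2, 3+4, 5+6, 7+8} = {3, 7, 11, 15}
    v_int32x4 acc = v_dotprod(a, b, d);   // same pairs plus d     = {6, 14, 22, 30}
}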
685 // 32 >> 64
686 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
687{
688 vint64m2_t res = vundefined_i64m2();
689 res = vwmul_vv_i64m2(a.val, b.val, 4);
690 res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4);
691 return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2));
692}
693 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
694{
695 vint64m2_t res = vundefined_i64m2();
696 res = vwmul_vv_i64m2(a.val, b.val, 4);
697 res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4);
698 return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2), c.val, 2));
699}
700
701 // 8 >> 32
702 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
703{
704 vuint16m2_t v1 = vundefined_u16m2();
705 vuint32m2_t v2 = vundefined_u32m2();
706 v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
707 v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
708 v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
709 return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4));
710}
711
712 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
713 const v_uint32x4& c)
714{
715 vuint16m2_t v1 = vundefined_u16m2();
716 vuint32m2_t v2 = vundefined_u32m2();
717 v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
718 v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
719 v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
720 return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4));
721}
722
723 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
724{
725 vint16m2_t v1 = vundefined_i16m2();
726 vint32m2_t v2 = vundefined_i32m2();
727 v1 = vwmul_vv_i16m2(a.val, b.val, 16);
728 v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
729 v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
730 return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4));
731}
732
733 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
734 const v_int32x4& c)
735{
736 vint16m2_t v1 = vundefined_i16m2();
737 vint32m2_t v2 = vundefined_i32m2();
738 v1 = vwmul_vv_i16m2(a.val, b.val, 16);
739 v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
740 v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
741 return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
742}
743
744 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
745{
746 vuint32m2_t v1 = vundefined_u32m2();
747 vuint64m2_t v2 = vundefined_u64m2();
748 v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
749 v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
750 v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
751 return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
752}
753
754 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
755 const v_uint64x2& c)
756{
757 vuint32m2_t v1 = vundefined_u32m2();
758 vuint64m2_t v2 = vundefined_u64m2();
759 v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
760 v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
761 v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
762 return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
763}
764
765 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
766{
767 vint32m2_t v1 = vundefined_i32m2();
768 vint64m2_t v2 = vundefined_i64m2();
769 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
770 v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
771 v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
772 return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
773}
774
775 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
776 const v_int64x2& c)
777{
778 vint32m2_t v1 = vundefined_i32m2();
779 vint64m2_t v2 = vundefined_i64m2();
780 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
781 v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
782 v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
783 return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
784}
785
787 // 16 >> 32
788 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
789{
790 vint32m2_t v1 = vundefined_i32m2();
791 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
792 return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4));
793}
794
795 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
796{
797 vint32m2_t v1 = vundefined_i32m2();
798 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
799 return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4), c.val, 4));
800}
801
802 // 32 >> 64
803 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
804{
805 vint64m2_t v1 = vundefined_i64m2();
806 v1 = vwmul_vv_i64m2(a.val, b.val, 4);
807 return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2));
808}
809 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
810{
811 vint64m2_t v1 = vundefined_i64m2();
812 v1 = vwmul_vv_i64m2(a.val, b.val, 4);
813 return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), c.val, 2));
814}
815
816 // 8 >> 32
817 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
818{
819 vuint16m2_t v1 = vundefined_u16m2();
820 vuint32m2_t v2 = vundefined_u32m2();
821 v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
822 v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
823 return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4));
824}
825
826 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
827{
828 vuint16m2_t v1 = vundefined_u16m2();
829 vuint32m2_t v2 = vundefined_u32m2();
830 v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
831 v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
832 return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4));
833}
834
835 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
836{
837 vint16m2_t v1 = vundefined_i16m2();
838 vint32m2_t v2 = vundefined_i32m2();
839 v1 = vwmul_vv_i16m2(a.val, b.val, 16);
840 v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
841 return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4));
842}
843 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
844{
845 vint16m2_t v1 = vundefined_i16m2();
846 vint32m2_t v2 = vundefined_i32m2();
847 v1 = vwmul_vv_i16m2(a.val, b.val, 16);
848 v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
849 return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
850}
851
852 // 16 >> 64
853 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
854{
855 vuint32m2_t v1 = vundefined_u32m2();
856 vuint64m2_t v2 = vundefined_u64m2();
857 v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
858 v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
859 return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
860}
861 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
862{
863 vuint32m2_t v1 = vundefined_u32m2();
864 vuint64m2_t v2 = vundefined_u64m2();
865 v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
866 v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
867 return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
868}
869
870 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
871{
872 vint32m2_t v1 = vundefined_i32m2();
873 vint64m2_t v2 = vundefined_i64m2();
874 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
875 v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
876 return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
877}
878 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
879{
880 vint32m2_t v1 = vundefined_i32m2();
881 vint64m2_t v2 = vundefined_i64m2();
882 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
883 v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
884 return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
885}
886
887
888 #define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(_Tpvec, _Tpvec2, len, scalartype, func, intrin, num) \
889 inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
890 {\
891 v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \
892 val = intrin(val, a.val, val, num); \
893 return vmv_x_s_##len##m1_##len(val, num); \
894 }
895
896
897 #define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num) \
898 inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
899 {\
900 v##_Tpvec##m1_t val = (v##_Tpvec##m1_t)vmv_v_x_i8m1(0, num); \
901 val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num); \
902 return val[0]; \
903 }
904OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16)
905OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
906OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int32, int64, i64, int, sum, vwredsum_vs_i32m1_i64m1, 4)
907OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint8, uint16, u16, unsigned, sum, vwredsumu_vs_u8m1_u16m1, 16)
908OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint16, uint32, u32, unsigned, sum, vwredsumu_vs_u16m1_u32m1, 8)
909OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu_vs_u32m1_u64m1, 4)
910inline float v_reduce_sum(const v_float32x4& a) \
911{\
912 vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4); \
913 val = vfredsum_vs_f32m1_f32m1(val, a.val, val, 4); \
914 return vfmv_f_s_f32m1_f32(val, 4); \
915}
916 inline double v_reduce_sum(const v_float64x2& a) \
917{\
918 vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2); \
919 val = vfredsum_vs_f64m1_f64m1(val, a.val, val, 2); \
920 return vfmv_f_s_f64m1_f64(val, 2); \
921}
922 inline uint64 v_reduce_sum(const v_uint64x2& a)
923{ return vext_x_v_u64m1_u64((vuint64m1_t)a.val, 0, 2)+vext_x_v_u64m1_u64((vuint64m1_t)a.val, 1, 2); }
924
925 inline int64 v_reduce_sum(const v_int64x2& a)
926{ return vext_x_v_i64m1_i64((vint64m1_t)a.val, 0, 2)+vext_x_v_i64m1_i64((vint64m1_t)a.val, 1, 2); }
927
928 #define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func) \
929 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16) \
930 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8) \
931 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4) \
932 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2) \
933 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16) \
934 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8) \
935 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4) \
936 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4)
937OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max)
938OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min)
939
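The reductions above collapse a whole register into one scalar, widening as needed for the sums. A sketch (same assumed setup):

void sketch_reduce()
{
    using namespace cv;
    v_int16x8 a(1, 2, 3, 4, 5, 6, 7, 8);
    int s = v_reduce_sum(a);   // 36
    int m = v_reduce_max(a);   // 8
    int n = v_reduce_min(a);   // 1
}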
940 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
941 const v_float32x4& c, const v_float32x4& d)
942{
943 vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
944 vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4);
945 vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4);
946 vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4);
947 a0 = vfredsum_vs_f32m1_f32m1(a0, a.val, a0, 4);
948 b0 = vfredsum_vs_f32m1_f32m1(b0, b.val, b0, 4);
949 c0 = vfredsum_vs_f32m1_f32m1(c0, c.val, c0, 4);
950 d0 = vfredsum_vs_f32m1_f32m1(d0, d.val, d0, 4);
951 return v_float32x4(a0[0], b0[0], c0[0], d0[0]);
952}
953
954 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
955{
956 vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
957 vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4);
958 vbool32_t mask=vmflt_vf_f32m1_b32(x, 0, 4);
959 vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4);
960 a0 = vfredsum_vs_f32m1_f32m1(a0, val, a0, 4);
961 return a0[0];
962}
963
964 #define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \
965 inline unsigned v_reduce_sad(const _Tpvec& a, const _Tpvec&b){ \
966 _Tpvec2 x = v_absdiff(a, b); \
967 return v_reduce_sum(x); \
968 }
969
970OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int8x16, v_uint8x16)
971OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint8x16, v_uint8x16)
972OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int16x8, v_uint16x8)
973OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint16x8, v_uint16x8)
974OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4)
975OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4)
976
977 #define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
978 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
979 { \
980 vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \
981 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
982 } \
983 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
984 { \
985 vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \
986 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
987 } \
988 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
989 { \
990 vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \
991 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
992 } \
993 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
994 { \
995 vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \
996 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
997 } \
998 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
999 { \
1000 vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \
1001 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1002 } \
1003 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
1004 { \
1005 vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \
1006 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1007 } \
1008
1009OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int8x16, i8m1, 8, 16, _vv_)
1010OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int16x8, i16m1, 16, 8, _vv_)
1011OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int32x4, i32m1, 32, 4, _vv_)
1012OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int64x2, i64m1, 64, 2, _vv_)
1013OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint8x16, u8m1, 8, 16, u_vv_)
1014OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint16x8, u16m1, 16, 8, u_vv_)
1015OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
1016OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)
1017
1018 //TODO: ==
1019 inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
1020{
1021 vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
1022 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1023 return v_float32x4((vfloat32m1_t)res);
1024}
1025 inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
1026{
1027 vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
1028 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1029 return v_float32x4((vfloat32m1_t)res);
1030}
1031 inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
1032{
1033 vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
1034 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1035 return v_float32x4((vfloat32m1_t)res);
1036}
1037 inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
1038{
1039 vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
1040 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1041 return v_float32x4((vfloat32m1_t)res);
1042}
1043 inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
1044{
1045 vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
1046 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1047 return v_float32x4((vfloat32m1_t)res);
1048}
1049 inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
1050{
1051 vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
1052 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1053 return v_float32x4((vfloat32m1_t)res);
1054}
1055 inline v_float32x4 v_not_nan(const v_float32x4& a)
1056{
1057 vbool32_t mask = vmford_vv_f32m1_b32(a.val, a.val, 4);
1058 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1059 return v_float32x4((vfloat32m1_t)res);
1060}
1061
1062 //TODO: ==
1063 inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
1064{
1065 vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
1066 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1067 return v_float64x2((vfloat64m1_t)res);
1068}
1069 inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
1070{
1071 vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
1072 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1073 return v_float64x2((vfloat64m1_t)res);
1074}
1075 inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)
1076{
1077 vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
1078 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1079 return v_float64x2((vfloat64m1_t)res);
1080}
1081 inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)
1082{
1083 vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
1084 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1085 return v_float64x2((vfloat64m1_t)res);
1086}
1087 inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
1088{
1089 vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
1090 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1091 return v_float64x2((vfloat64m1_t)res);
1092}
1093 inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
1094{
1095 vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
1096 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1097 return v_float64x2((vfloat64m1_t)res);
1098}
1099 inline v_float64x2 v_not_nan(const v_float64x2& a)
1100{
1101 vbool64_t mask = vmford_vv_f64m1_b64(a.val, a.val, 2);
1102 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1103 return v_float64x2((vfloat64m1_t)res);
1104}
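The float comparisons above return per-lane masks: every bit of a lane is set where the predicate holds, zero otherwise, and v_not_nan flags ordered (non-NaN) lanes; such masks are normally consumed by the selection/masking helpers of the universal intrinsics rather than read as numbers. A sketch (same assumed setup):

void sketch_compare()
{
    using namespace cv;
    v_float32x4 a(1.f, 5.f, 3.f, 7.f), b = v_setall_f32(4.f);
    v_float32x4 lt = a < b;          // all-ones bit pattern in lanes 0 and 2, zeros elsewhere
    v_float32x4 ok = v_not_nan(a);   // all-ones in every lane here, since no lane is NaN
}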
1105 #define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \
1106 inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
1107 const v_##_Tp##32x4& a2, const v_##_Tp##32x4& a3, \
1108 v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \
1109 v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \
1110 { \
1111 v##_Tp##32m4_t val = vundefined_##_T##m4(); \
1112 val = vset_##_T##m4(val, 0, a0.val); \
1113 val = vset_##_T##m4(val, 1, a1.val); \
1114 val = vset_##_T##m4(val, 2, a2.val); \
1115 val = vset_##_T##m4(val, 3, a3.val); \
1116 val = vrgather_vv_##_T##m4(val, (vuint32m4_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); \
1117 b0.val = vget_##_T##m4_##_T##m1(val, 0); \
1118 b1.val = vget_##_T##m4_##_T##m1(val, 1); \
1119 b2.val = vget_##_T##m4_##_T##m1(val, 2); \
1120 b3.val = vget_##_T##m4_##_T##m1(val, 3); \
1121 }
1122OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)
1123OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)
1124OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)
1125
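v_transpose4x4 transposes a 4x4 block held as four row registers into four column registers. A sketch (same assumed setup):

void sketch_transpose()
{
    using namespace cv;
    v_int32x4 r0(1, 2, 3, 4), r1(5, 6, 7, 8), r2(9, 10, 11, 12), r3(13, 14, 15, 16);
    v_int32x4 c0, c1, c2, c3;
    v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);   // c0 = {1,5,9,13}, c1 = {2,6,10,14}, ...
}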
1126
1127 #define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
1128 inline _Tpvec operator << (const _Tpvec& a, int n) \
1129 { return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
1130 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1131 { return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }
1132
1133 #define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
1134 inline _Tpvec operator >> (const _Tpvec& a, int n) \
1135 { return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
1136 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1137 { return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
1138 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
1139 { return _Tpvec((v##intric##_vx_##_T##m1(vadd_vx_##_T##m1(a.val, 1<<(n-1), num), n, num))); }
1140
1141 // trade efficiency for convenience
1142 #define OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(suffix, _T, num, intrin) \
1143 OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(v_##suffix##x##num, suffix, _T, num) \
1144 OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(v_##suffix##x##num, suffix, _T, num, intrin)
1145
1146OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint8, u8, 16, srl)
1147OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint16, u16, 8, srl)
1148OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint32, u32, 4, srl)
1149OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint64, u64, 2, srl)
1150OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int8, i8, 16, sra)
1151OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int16, i16, 8, sra)
1152OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int32, i32, 4, sra)
1153OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int64, i64, 2, sra)
1154
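v_shr is a plain logical/arithmetic shift, while v_rshr adds 1 << (n-1) before shifting, i.e. it rounds to nearest. A sketch (same assumed setup):

void sketch_shift()
{
    using namespace cv;
    v_int32x4 a = v_setall_s32(6);
    v_int32x4 t = v_shr<2>(a);    // 6 >> 2 = 1
    v_int32x4 r = v_rshr<2>(a);   // (6 + 2) >> 2 = 2
}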
1155 #if 0
1156 #define VUP4(n) {0, 1, 2, 3}
1157 #define VUP8(n) {0, 1, 2, 3, 4, 5, 6, 7}
1158 #define VUP16(n) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
1159 #define VUP2(n) {0, 1}
1160 #endif
1161 #define OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(_Tpvec, suffix, _T, num, num2, vmv, len) \
1162 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1163 { \
1164 suffix##m1_t tmp = vmv##_##_T##m1(0, num);\
1165 tmp = vslideup_vx_##_T##m1_m(vmset_m_##len(num), tmp, a.val, n, num);\
1166 return _Tpvec(tmp);\
1167 } \
1168 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1169 { \
1170 return _Tpvec(vslidedown_vx_##_T##m1(a.val, n, num));\
1171 } \
1172 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
1173 { return a; } \
1174 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1175 { \
1176 suffix##m2_t tmp = vundefined_##_T##m2(); \
1177 tmp = vset_##_T##m2(tmp, 0, a.val); \
1178 tmp = vset_##_T##m2(tmp, 1, b.val); \
1179 tmp = vslidedown_vx_##_T##m2(tmp, n, num2);\
1180 return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 0));\
1181 } \
1182 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1183 { \
1184 suffix##m2_t tmp = vundefined_##_T##m2(); \
1185 tmp = vset_##_T##m2(tmp, 0, b.val); \
1186 tmp = vset_##_T##m2(tmp, 1, a.val); \
1187 tmp = vslideup_vx_##_T##m2(tmp, n, num2);\
1188 return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 1));\
1189 } \
1190 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
1191 { \
1192 CV_UNUSED(b); return a; \
1193 }
1194
1195OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint8x16, vuint8, u8, 16, 32, vmv_v_x, b8)
1196OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int8x16, vint8, i8, 16, 32, vmv_v_x, b8)
1197OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint16x8, vuint16, u16, 8, 16, vmv_v_x, b16)
1198OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int16x8, vint16, i16, 8, 16, vmv_v_x, b16)
1199OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint32x4, vuint32, u32, 4, 8, vmv_v_x, b32)
1200OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int32x4, vint32, i32, 4, 8, vmv_v_x, b32)
1201OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint64x2, vuint64, u64, 2, 4, vmv_v_x, b64)
1202OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64)
1203OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32)
1204OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64)
1205
1206 #define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num) \
1207 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1208 { \
1209 typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
1210 vuint64m1_t tmp = {*(unaligned_uint64*)ptr0, *(unaligned_uint64*)ptr1};\
1211 return _Tpvec(_Tp2##_t(tmp)); } \
1212 inline _Tpvec v_load_low(const _Tp* ptr) \
1213 { return _Tpvec(vle_v_##len(ptr, hnum)); }\
1214 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1215 { return _Tpvec(vle_v_##len(ptr, num)); } \
1216 inline _Tpvec v_load(const _Tp* ptr) \
1217 { return _Tpvec((_Tp2##_t)vle_v_##len((const _Tp *)ptr, num)); } \
1218 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1219 { vse_v_##len(ptr, a.val, hnum);}\
1220 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1221 { \
1222 _Tp2##_t a0 = vslidedown_vx_##len(a.val, hnum, num); \
1223 vse_v_##len(ptr, a0, hnum);}\
1224 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1225 { vse_v_##len(ptr, a.val, num); } \
1226 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1227 { vse_v_##len(ptr, a.val, num); } \
1228 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1229 { vse_v_##len(ptr, a.val, num); } \
1230 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/ ) \
1231 { vse_v_##len(ptr, a.val, num); }
1232
1233OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16)
1234OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16)
1235OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8)
1236OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8)
1237OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4)
1238OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4)
1239OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2)
1240OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2)
1241OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4)
1242OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2)
1243
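The load/store helpers above move whole registers to and from memory; v_load has no alignment requirement, and v_load_low/v_store_high handle half registers. A sketch (same assumed setup):

void sketch_loadstore()
{
    using namespace cv;
    float buf[4] = {1.f, 2.f, 3.f, 4.f};
    v_float32x4 v = v_load(buf);           // load 4 floats
    v = v + v_setall_f32(1.f);
    float out[4];
    v_store(out, v);                       // out = {2, 3, 4, 5}
}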
1244
1246
1247inline v_int8x16 v_lut(const schar* tab, const int* idx)
1248{
1249 #if 1
1250 schar CV_DECL_ALIGNED(32) elems[16] =
1251 {
1252 tab[idx[ 0]],
1253 tab[idx[ 1]],
1254 tab[idx[ 2]],
1255 tab[idx[ 3]],
1256 tab[idx[ 4]],
1257 tab[idx[ 5]],
1258 tab[idx[ 6]],
1259 tab[idx[ 7]],
1260 tab[idx[ 8]],
1261 tab[idx[ 9]],
1262 tab[idx[10]],
1263 tab[idx[11]],
1264 tab[idx[12]],
1265 tab[idx[13]],
1266 tab[idx[14]],
1267 tab[idx[15]]
1268 };
1269 return v_int8x16(vle_v_i8m1(elems, 16));
1270 #else
1271 int32xm4_t index32 = vlev_int32xm4(idx, 16);
1272 vint16m2_t index16 = vnsra_vx_i16m2_int32xm4(index32, 0, 16);
1273 vint8m1_t index = vnsra_vx_i8m1_i16m2(index16, 0, 16);
1274 return v_int8x16(vlxbv_i8m1(tab, index, 16));
1275 #endif
1276}
1277
1278 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){
1279 schar CV_DECL_ALIGNED(32) elems[16] =
1280 {
1281 tab[idx[0]],
1282 tab[idx[0] + 1],
1283 tab[idx[1]],
1284 tab[idx[1] + 1],
1285 tab[idx[2]],
1286 tab[idx[2] + 1],
1287 tab[idx[3]],
1288 tab[idx[3] + 1],
1289 tab[idx[4]],
1290 tab[idx[4] + 1],
1291 tab[idx[5]],
1292 tab[idx[5] + 1],
1293 tab[idx[6]],
1294 tab[idx[6] + 1],
1295 tab[idx[7]],
1296 tab[idx[7] + 1]
1297 };
1298 return v_int8x16(vle_v_i8m1(elems, 16));
1299}
1300 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
1301{
1302 schar CV_DECL_ALIGNED(32) elems[16] =
1303 {
1304 tab[idx[0]],
1305 tab[idx[0] + 1],
1306 tab[idx[0] + 2],
1307 tab[idx[0] + 3],
1308 tab[idx[1]],
1309 tab[idx[1] + 1],
1310 tab[idx[1] + 2],
1311 tab[idx[1] + 3],
1312 tab[idx[2]],
1313 tab[idx[2] + 1],
1314 tab[idx[2] + 2],
1315 tab[idx[2] + 3],
1316 tab[idx[3]],
1317 tab[idx[3] + 1],
1318 tab[idx[3] + 2],
1319 tab[idx[3] + 3]
1320 };
1321 return v_int8x16(vle_v_i8m1(elems, 16));
1322}
1323
1324 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
1325 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
1326 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
1327
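The v_lut family gathers table elements through an index array: v_lut reads one element per index, v_lut_pairs two consecutive elements per index, v_lut_quads four. A sketch (same assumed setup):

void sketch_lut()
{
    using namespace cv;
    short tab[16] = {0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150};
    int idx8[8] = {0, 2, 4, 6, 8, 10, 12, 14};
    v_int16x8 g = v_lut(tab, idx8);         // {0, 20, 40, ..., 140}
    int idx4[4] = {0, 4, 8, 12};
    v_int16x8 p = v_lut_pairs(tab, idx4);   // {tab[0],tab[1], tab[4],tab[5], tab[8],tab[9], tab[12],tab[13]}
}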
1328 inline v_int16x8 v_lut(const short* tab, const int* idx)
1329{
1330 short CV_DECL_ALIGNED(32) elems[8] =
1331 {
1332 tab[idx[0]],
1333 tab[idx[1]],
1334 tab[idx[2]],
1335 tab[idx[3]],
1336 tab[idx[4]],
1337 tab[idx[5]],
1338 tab[idx[6]],
1339 tab[idx[7]]
1340 };
1341 return v_int16x8(vle_v_i16m1(elems, 8));
1342}
1343 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
1344{
1345 short CV_DECL_ALIGNED(32) elems[8] =
1346 {
1347 tab[idx[0]],
1348 tab[idx[0] + 1],
1349 tab[idx[1]],
1350 tab[idx[1] + 1],
1351 tab[idx[2]],
1352 tab[idx[2] + 1],
1353 tab[idx[3]],
1354 tab[idx[3] + 1]
1355 };
1356 return v_int16x8(vle_v_i16m1(elems, 8));
1357}
1358 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
1359{
1360 short CV_DECL_ALIGNED(32) elems[8] =
1361 {
1362 tab[idx[0]],
1363 tab[idx[0] + 1],
1364 tab[idx[0] + 2],
1365 tab[idx[0] + 3],
1366 tab[idx[1]],
1367 tab[idx[1] + 1],
1368 tab[idx[1] + 2],
1369 tab[idx[1] + 3]
1370 };
1371 return v_int16x8(vle_v_i16m1(elems, 8));
1372}
1373 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
1374 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
1375 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
1376
1377 inline v_int32x4 v_lut(const int* tab, const int* idx)
1378{
1379 int CV_DECL_ALIGNED(32) elems[4] =
1380 {
1381 tab[idx[0]],
1382 tab[idx[1]],
1383 tab[idx[2]],
1384 tab[idx[3]]
1385 };
1386 return v_int32x4(vle_v_i32m1(elems, 4));
1387}
1388 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
1389{
1390 int CV_DECL_ALIGNED(32) elems[4] =
1391 {
1392 tab[idx[0]],
1393 tab[idx[0] + 1],
1394 tab[idx[1]],
1395 tab[idx[1] + 1]
1396 };
1397 return v_int32x4(vle_v_i32m1(elems, 4));
1398}
1399 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
1400{
1401 return v_int32x4(vle_v_i32m1(tab+idx[0], 4));
1402}
1403 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
1404 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
1405 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
1406
1407 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1408{
1409 vint64m1_t res = {tab[idx[0]], tab[idx[1]]};
1410 return v_int64x2(res);
1411}
1412 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
1413{
1414 return v_int64x2(vle_v_i64m1(tab+idx[0], 2));
1415}
1416
1417 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
1418{
1419 vuint64m1_t res = {tab[idx[0]], tab[idx[1]]};
1420 return v_uint64x2(res);
1421}
1422 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx)
1423{
1424 return v_uint64x2(vle_v_u64m1(tab+idx[0], 2));
1425}
1426
1427 inline v_float32x4 v_lut(const float* tab, const int* idx)
1428{
1429 float CV_DECL_ALIGNED(32) elems[4] =
1430 {
1431 tab[idx[0]],
1432 tab[idx[1]],
1433 tab[idx[2]],
1434 tab[idx[3]]
1435 };
1436 return v_float32x4(vle_v_f32m1(elems, 4));
1437}
1438 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
1439{
1440 float CV_DECL_ALIGNED(32) elems[4] =
1441 {
1442 tab[idx[0]],
1443 tab[idx[0]+1],
1444 tab[idx[1]],
1445 tab[idx[1]+1]
1446 };
1447 return v_float32x4(vle_v_f32m1(elems, 4));
1448}
1449 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
1450{
1451 return v_float32x4(vle_v_f32m1(tab + idx[0], 4));
1452}
1453 inline v_float64x2 v_lut(const double* tab, const int* idx)
1454{
1455 vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]};
1456 return v_float64x2(res);
1457}
1458 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
1459{
1460 return v_float64x2(vle_v_f64m1(tab+idx[0], 2));
1461}
1462
1463 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1464{
1465 int CV_DECL_ALIGNED(32) elems[4] =
1466 {
1467 tab[idxvec.val[0]],
1468 tab[idxvec.val[1]],
1469 tab[idxvec.val[2]],
1470 tab[idxvec.val[3]]
1471 };
1472 return v_int32x4(vle_v_i32m1(elems, 4));
1473}
1474
1475 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1476{
1477 unsigned CV_DECL_ALIGNED(32) elems[4] =
1478 {
1479 tab[idxvec.val[0]],
1480 tab[idxvec.val[1]],
1481 tab[idxvec.val[2]],
1482 tab[idxvec.val[3]]
1483 };
1484 return v_uint32x4(vle_v_u32m1(elems, 4));
1485}
1486
1487 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1488{
1489 float CV_DECL_ALIGNED(32) elems[4] =
1490 {
1491 tab[idxvec.val[0]],
1492 tab[idxvec.val[1]],
1493 tab[idxvec.val[2]],
1494 tab[idxvec.val[3]]
1495 };
1496 return v_float32x4(vle_v_f32m1(elems, 4));
1497}
1498 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1499{
1500 vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]};
1501 return v_float64x2(res);
1502}
1503 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1504{
1505 vint32m1_t index_x = vmul_vx_i32m1(idxvec.val, 4, 4);
1506 vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4);
1507
1508 x.val = vlxe_v_f32m1(tab, index_x, 4);
1509 y.val = vlxe_v_f32m1(tab, index_y, 4);
1510}
1511
1512 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1513{
1514 int CV_DECL_ALIGNED(32) idx[4];
1515 v_store_aligned(idx, idxvec);
1516
1517 x = v_float64x2(tab[idx[0]], tab[idx[1]]);
1518 y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
1519}
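// Usage sketch for the v_lut family above, assuming the portable wrappers declared in
// "opencv2/core/hal/intrin.hpp"; the table size and index values are illustrative only.
#include "opencv2/core/hal/intrin.hpp"
inline void lut_gather_example(const short* table)   // table must hold at least 10 elements here
{
    using namespace cv;
    int idx[8] = {3, 1, 4, 1, 5, 9, 2, 6};    // per-lane gather indices
    v_int16x8 v = v_lut(table, idx);           // v[i] = table[idx[i]]
    v_int16x8 p = v_lut_pairs(table, idx);     // table[idx[0]], table[idx[0]+1], table[idx[1]], ...
    short out[8];
    v_store(out, v);                           // write the gathered lanes back to memory
    v_store(out, p);
}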
1520
1521 #define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type) \
1522 inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
1523 { \
1524 v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1525 tmp = vset_##_T2##m2(tmp, 0, a.val); \
1526 tmp = vset_##_T2##m2(tmp, 1, b.val); \
1527 return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
1528 }\
1529 template<int n> inline \
1530 v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
1531 { \
1532 v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1533 tmp = vset_##_T2##m2(tmp, 0, a.val); \
1534 tmp = vset_##_T2##m2(tmp, 1, b.val); \
1535 return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
1536 }\
1537 inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
1538 { \
1539 v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1540 tmp = vset_##_T2##m2(tmp, 0, a.val); \
1541 tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
1542 asm("" ::: "memory"); \
1543 vse_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
1544}\
1545template<int n> inline \
1546void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
1547{ \
1548 v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1549 tmp = vset_##_T2##m2(tmp, 0, a.val); \
1550 tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
1551 vse_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
1552}
1553OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_vx, vnclip_vx, signed char)
1554OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_vx, vnclip_vx, signed short)
1555OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_vx, vnsra_vx, int)
1556OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_vx, vnclipu_vx, unsigned char)
1557OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_vx, vnclipu_vx, unsigned short)
1558OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_vx, vnsrl_vx, unsigned int)
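// Usage sketch for the pack helpers generated above, assuming the portable API from
// "opencv2/core/hal/intrin.hpp"; the input values are illustrative only.
#include "opencv2/core/hal/intrin.hpp"
inline void pack_example()
{
    using namespace cv;
    v_int16x8 a = v_setall_s16(300), b = v_setall_s16(-300);
    v_int8x16 narrow  = v_pack(a, b);          // saturates to [-128,127]: eight 127, eight -128
    v_int8x16 shifted = v_rshr_pack<2>(a, b);  // rounding shift right by 2, then saturate: 75, -75
    schar out[16];
    v_pack_store(out, a);                      // narrow and store only the first operand (8 lanes)
    v_store(out, narrow);
    v_store(out, shifted);
}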
1559
1560 // pack boolean
1561 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
1562{
1563 vuint16m2_t tmp = vundefined_u16m2(); \
1564 tmp = vset_u16m2(tmp, 0, a.val); \
1565 tmp = vset_u16m2(tmp, 1, b.val); \
1566 return v_uint8x16(vnsrl_vx_u8m1(tmp, 0, 16));
1567}
1568
1569 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
1570 const v_uint32x4& c, const v_uint32x4& d)
1571{
1572 vuint32m4_t vabcd = vundefined_u32m4(); \
1573 vuint16m2_t v16 = vundefined_u16m2(); \
1574 vabcd = vset_u32m4(vabcd, 0, a.val); \
1575 vabcd = vset_u32m4(vabcd, 1, b.val); \
1576 vabcd = vset_u32m4(vabcd, 2, c.val); \
1577 vabcd = vset_u32m4(vabcd, 3, d.val); \
1578 v16 = vnsrl_vx_u16m2(vabcd, 0, 16);
1579 return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));
1580}
1581
1582 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
1583 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
1584 const v_uint64x2& g, const v_uint64x2& h)
1585{
1586 vuint64m8_t v64 = vundefined_u64m8(); \
1587 vuint32m4_t v32 = vundefined_u32m4(); \
1588 vuint16m2_t v16 = vundefined_u16m2(); \
1589 v64 = vset_u64m8(v64, 0, a.val); \
1590 v64 = vset_u64m8(v64, 1, b.val); \
1591 v64 = vset_u64m8(v64, 2, c.val); \
1592 v64 = vset_u64m8(v64, 3, d.val); \
1593 v64 = vset_u64m8(v64, 4, e.val); \
1594 v64 = vset_u64m8(v64, 5, f.val); \
1595 v64 = vset_u64m8(v64, 6, g.val); \
1596 v64 = vset_u64m8(v64, 7, h.val); \
1597 v32 = vnsrl_vx_u32m4(v64, 0, 16);
1598 v16 = vnsrl_vx_u16m2(v32, 0, 16);
1599 return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));
1600}
1601
1602 //inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) \
1603 //{ \
1604 // int16xm2_u tmp; \
1605 // tmp.m1[0] = (vint16m1_t)a.val; \
1606 // tmp.m1[1] = (vint16m1_t)b.val; \
1607 // e8xm1_t mask = (e8xm1_t)vmsge_vx_e16xm2_i16m2(tmp.v, 0, 16);\
1608 // return v_uint8x16(vnclipuvi_mask_u8m1_u16m2(vmv_v_x_u8m1(0, 16), (vuint16m2_t)tmp.v, 0, mask, 16));
1609 //}
1610
1611 #define OPENCV_HAL_IMPL_RISCVV_PACK_U(tp1, num1, tp2, num2, _Tp) \
1612 inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
1613 { \
1614 vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
1615 tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
1616 tmp = vset_##i##tp2##m2(tmp, 1, b.val); \
1617 vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
1618 return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1)); \
1619 } \
1620 inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
1621 { \
1622 vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
1623 tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
1624 vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
1625 return vse_v_u##tp1##m1(ptr, vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1), num2); \
1626 } \
1627 template<int n> inline \
1628 v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
1629 { \
1630 vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
1631 tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
1632 tmp = vset_##i##tp2##m2(tmp, 1, b.val); \
1633 vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
1634 return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, n, num1)); \
1635 } \
1636 template<int n> inline \
1637 void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
1638 { \
1639 vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
1640 tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
1641 vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\
1642 vuint##tp1##m1_t val = vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val_, n, num1); \
1643 return vse_v_u##tp1##m1(ptr, val, num2);\
1644 }
1645OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char )
1646OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
1647
1648 #ifdef __GNUC__
1649 #pragma GCC diagnostic push
1650 #pragma GCC diagnostic ignored "-Wuninitialized"
1651 #endif
1652
1653 // saturating multiply 8-bit, 16-bit
1654 #define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, _Tpwvec) \
1655 inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
1656 { \
1657 _Tpwvec c, d; \
1658 v_mul_expand(a, b, c, d); \
1659 return v_pack(c, d); \
1660 } \
1661 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
1662 { a = a * b; return a; }
1663
1664OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, v_int16x8)
1665OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, v_uint16x8)
1666OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8, v_int32x4)
1667OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, v_uint32x4)
1668
1669 #ifdef __GNUC__
1670 #pragma GCC diagnostic pop
1671 #endif
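// Usage sketch: the operator* defined above widens, multiplies, then saturates back,
// so 8/16-bit products never wrap. Assuming the portable API from
// "opencv2/core/hal/intrin.hpp"; the values are illustrative only.
#include "opencv2/core/hal/intrin.hpp"
inline void saturating_mul_example()
{
    using namespace cv;
    v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(2);
    v_uint8x16 c = a * b;                      // 400 saturates to 255 in every lane
    uchar out[16];
    v_store(out, c);
}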
1672 static const signed char popCountTable[256] =
1673{
1674 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1675 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1676 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1677 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1678 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1679 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1680 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1681 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1682 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1683 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1684 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1685 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1686 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1687 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1688 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1689 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
1690};
1691
1692 inline vuint8m1_t vcnt_u8(vuint8m1_t val){
1693 vuint8m1_t v0 = val & 1;
1694 return vlxe_v_u8m1((unsigned char*)popCountTable, val >> 1, 16)+v0;
1695}
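// The helper above relies on the identity popcount(v) == popcount(v >> 1) + (v & 1),
// so the gather index into popCountTable stays within 0..127. A scalar sketch of the
// same idea (illustrative only):
inline int popcount_byte_scalar(unsigned char v)
{
    return popCountTable[v >> 1] + (v & 1);    // same identity the vector code uses
}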
1696
1697 inline v_uint8x16
1698 v_popcount(const v_uint8x16& a)
1699{
1700 return v_uint8x16(vcnt_u8(a.val));
1701}
1702
1703 inline v_uint8x16
1704 v_popcount(const v_int8x16& a)
1705{
1706 return v_uint8x16(vcnt_u8((vuint8m1_t)a.val));
1707}
1708
1709 inline v_uint16x8
1710 v_popcount(const v_uint16x8& a)
1711{
1712 vuint8m2_t tmp = vundefined_u8m2();
1713 tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1714 vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
1715 tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \
1716 vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);
1717 return v_uint16x8(vget_u16m2_u16m1(res, 0));
1718}
1719
1720 inline v_uint16x8
1721 v_popcount(const v_int16x8& a)
1722{
1723 vuint8m2_t tmp = vundefined_u8m2();
1724 tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1725 vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
1726 tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \
1727 vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);
1728 return v_uint16x8(vget_u16m2_u16m1(res, 0));
1729}
1730
1731 inline v_uint32x4
1732 v_popcount(const v_uint32x4& a)
1733{
1734 vuint8m2_t tmp = vundefined_u8m2();
1735 tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1736 vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,
1737 0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};
1738 tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \
1739 vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16);
1740 vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8);
1741 return v_uint32x4(vget_u32m2_u32m1(res, 0));
1742}
1743
1744 inline v_uint32x4
1745 v_popcount(const v_int32x4& a)
1746{
1747 vuint8m2_t tmp = vundefined_u8m2();
1748 tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1749 vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,
1750 0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};
1751 tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \
1752 vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16);
1753 vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8);
1754 return v_uint32x4(vget_u32m2_u32m1(res, 0));
1755}
1756
1757 inline v_uint64x2
1758 v_popcount(const v_uint64x2& a)
1759{
1760 vuint8m2_t tmp = vundefined_u8m2();
1761 tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1762 vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,
1763 0x0F0E0D0C0B0A0908, 0x0000000000000000};
1764 tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \
1765 vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
1766 vuint8m1_t res1 = zero;
1767 vuint8m1_t res2 = zero;
1768 res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8);
1769 res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8);
1770
1771 return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8));
1772}
1773
1774 inline v_uint64x2
1775 v_popcount(const v_int64x2& a)
1776{
1777 vuint8m2_t tmp = vundefined_u8m2();
1778 tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1779 vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,
1780 0x0F0E0D0C0B0A0908, 0x0000000000000000};
1781 tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \
1782 vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
1783 vuint8m1_t res1 = zero;
1784 vuint8m1_t res2 = zero;
1785 res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8);
1786 res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8);
1787
1788 return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8));
1789}
1790
1791 #define SMASK 1, 2, 4, 8, 16, 32, 64, 128
1792 inline int v_signmask(const v_uint8x16& a)
1793{
1794 vuint8m1_t t0 = vsrl_vx_u8m1(a.val, 7, 16);
1795 vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK};
1796 vuint16m2_t t1 = vwmulu_vv_u16m2(t0, m1, 16);
1797 vuint32m1_t res = vmv_v_x_u32m1(0, 4);
1798 vuint32m2_t t2 = vwmulu_vx_u32m2(vget_u16m2_u16m1(t1, 1), 256, 8);
1799 res = vredsum_vs_u32m2_u32m1(res, t2, res, 8);
1800 res = vwredsumu_vs_u16m1_u32m1(res, vget_u16m2_u16m1(t1, 0), res, 8);
1801 return vmv_x_s_u32m1_u32(res, 8);
1802}
1803 inline int v_signmask(const v_int8x16& a)
1804{
1805 vuint8m1_t t0 = vsrl_vx_u8m1((vuint8m1_t)a.val, 7, 16);
1806 vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK};
1807 vint16m2_t t1 = (vint16m2_t)vwmulu_vv_u16m2(t0, m1, 16);
1808 vint32m1_t res = vmv_v_x_i32m1(0, 4);
1809 vint32m2_t t2 = vwmul_vx_i32m2(vget_i16m2_i16m1(t1, 1), 256, 8);
1810 res = vredsum_vs_i32m2_i32m1(res, t2, res, 8);
1811 res = vwredsum_vs_i16m1_i32m1(res, vget_i16m2_i16m1(t1, 0), res, 8);
1812 return vmv_x_s_i32m1_i32(res, 8);
1813}
1814
1815 inline int v_signmask(const v_int16x8& a)
1816{
1817 vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
1818 vint16m1_t m1 = (vint16m1_t){SMASK};
1819 vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8);
1820 vint16m1_t res = vmv_v_x_i16m1(0, 8);
1821 res = vredsum_vs_i16m1_i16m1(res, t1, res, 8);
1822 return vmv_x_s_i16m1_i16(res, 8);
1823}
1824 inline int v_signmask(const v_uint16x8& a)
1825{
1826 vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
1827 vint16m1_t m1 = (vint16m1_t){SMASK};
1828 vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8);
1829 vint16m1_t res = vmv_v_x_i16m1(0, 8);
1830 res = vredsum_vs_i16m1_i16m1(res, t1, res, 8);
1831 return vmv_x_s_i16m1_i16(res, 8);
1832}
1833 inline int v_signmask(const v_int32x4& a)
1834{
1835 vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
1836 vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
1837 vint32m1_t res = vmv_v_x_i32m1(0, 4);
1838 vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
1839 res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
1840 return vmv_x_s_i32m1_i32(res, 4);
1841}
1842 inline int v_signmask(const v_uint32x4& a)
1843{
1844 vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1(a.val, 31, 4);
1845 vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
1846 vint32m1_t res = vmv_v_x_i32m1(0, 4);
1847 vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
1848 res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
1849 return vmv_x_s_i32m1_i32(res, 4);
1850}
1851 inline int v_signmask(const v_uint64x2& a)
1852{
1853 vuint64m1_t v0 = vsrl_vx_u64m1(a.val, 63, 2);
1854 int res = (int)vext_x_v_u64m1_u64(v0, 0, 2) + ((int)vext_x_v_u64m1_u64(v0, 1, 2) << 1);
1855 return res;
1856}
1857 inline int v_signmask(const v_int64x2& a)
1858{ return v_signmask(v_reinterpret_as_u64(a)); }
1859 inline int v_signmask(const v_float64x2& a)
1860{ return v_signmask(v_reinterpret_as_u64(a)); }
1861 inline int v_signmask(const v_float32x4& a)
1862{
1863 vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
1864 vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
1865 vint32m1_t res = vmv_v_x_i32m1(0, 4);
1866 vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
1867 res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
1868 return vmv_x_s_i32m1_i32(res, 4);
1869}
1870
1871 inline int v_scan_forward(const v_int8x16& a) {
1872 int val = v_signmask(a);
1873 if(val==0) return 0;
1874 else return trailingZeros32(val); }
1875 inline int v_scan_forward(const v_uint8x16& a) {
1876 int val = v_signmask(a);
1877 if(val==0) return 0;
1878 else return trailingZeros32(val); }
1879 inline int v_scan_forward(const v_int16x8& a) {
1880 int val = v_signmask(a);
1881 if(val==0) return 0;
1882 else return trailingZeros32(val); }
1883 inline int v_scan_forward(const v_uint16x8& a) {
1884 int val = v_signmask(a);
1885 if(val==0) return 0;
1886 else return trailingZeros32(val); }
1887 inline int v_scan_forward(const v_int32x4& a) {
1888 int val = v_signmask(a);
1889 if(val==0) return 0;
1890 else return trailingZeros32(val); }
1891 inline int v_scan_forward(const v_uint32x4& a) {
1892 int val = v_signmask(a);
1893 if(val==0) return 0;
1894 else return trailingZeros32(val); }
1895 inline int v_scan_forward(const v_float32x4& a) {
1896 int val = v_signmask(a);
1897 if(val==0) return 0;
1898 else return trailingZeros32(val); }
1899 inline int v_scan_forward(const v_int64x2& a) {
1900 int val = v_signmask(a);
1901 if(val==0) return 0;
1902 else return trailingZeros32(val); }
1903 inline int v_scan_forward(const v_uint64x2& a) {
1904 int val = v_signmask(a);
1905 if(val==0) return 0;
1906 else return trailingZeros32(val); }
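// Usage sketch: v_signmask packs the sign bit of every lane into an integer, and
// v_scan_forward returns the index of the first set lane (0 when none is set), so it is
// normally guarded by v_check_any. Assuming the portable API from
// "opencv2/core/hal/intrin.hpp"; the threshold logic is illustrative only.
#include "opencv2/core/hal/intrin.hpp"
inline int first_below_threshold(const short* data, short thresh)
{
    using namespace cv;
    v_int16x8 v  = v_load(data);
    v_int16x8 lt = v < v_setall_s16(thresh);   // all-ones lanes where data[i] < thresh
    if (!v_check_any(lt))
        return -1;                             // no lane matched
    return v_scan_forward(lt);                 // lane index of the first match
}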
1907
1908 #define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num) \
1909 inline bool v_check_all(const v_##_Tpvec& a) \
1910 { \
1911 suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \
1912 vuint64m1_t v1 = vuint64m1_t(v0); \
1913 return (v1[0] | v1[1]) == 0; \
1914 } \
1915 inline bool v_check_any(const v_##_Tpvec& a) \
1916 { \
1917 suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \
1918 vuint64m1_t v1 = vuint64m1_t(v0); \
1919 return (v1[0] | v1[1]) != 0; \
1920 }
1921
1922OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16)
1923OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8)
1924OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4)
1925OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2)
1926
1927inline bool v_check_all(const v_int8x16& a)
1928{ return v_check_all(v_reinterpret_as_u8(a)); }
1929 inline bool v_check_all(const v_int16x8& a)
1930{ return v_check_all(v_reinterpret_as_u16(a)); }
1931 inline bool v_check_all(const v_int32x4& a)
1932{ return v_check_all(v_reinterpret_as_u32(a)); }
1933 inline bool v_check_all(const v_float32x4& a)
1934{ return v_check_all(v_reinterpret_as_u32(a)); }
1935 inline bool v_check_all(const v_int64x2& a)
1936{ return v_check_all(v_reinterpret_as_u64(a)); }
1937 inline bool v_check_all(const v_float64x2& a)
1938{ return v_check_all(v_reinterpret_as_u64(a)); }
1939
1940 inline bool v_check_any(const v_int8x16& a)
1941{ return v_check_any(v_reinterpret_as_u8(a)); }
1942 inline bool v_check_any(const v_int16x8& a)
1943{ return v_check_any(v_reinterpret_as_u16(a)); }
1944 inline bool v_check_any(const v_int32x4& a)
1945{ return v_check_any(v_reinterpret_as_u32(a)); }
1946 inline bool v_check_any(const v_float32x4& a)
1947{ return v_check_any(v_reinterpret_as_u32(a)); }
1948 inline bool v_check_any(const v_int64x2& a)
1949{ return v_check_any(v_reinterpret_as_u64(a)); }
1950 inline bool v_check_any(const v_float64x2& a)
1951{ return v_check_any(v_reinterpret_as_u64(a)); }
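// Usage sketch: v_check_all / v_check_any test the sign bit of every lane, so they are
// normally applied to comparison results. Assuming the portable API from
// "opencv2/core/hal/intrin.hpp"; the range check is illustrative only.
#include "opencv2/core/hal/intrin.hpp"
inline bool all_in_range(const float* data, float lo, float hi)
{
    using namespace cv;
    v_float32x4 v  = v_load(data);
    v_float32x4 ok = (v >= v_setall_f32(lo)) & (v <= v_setall_f32(hi));
    return v_check_all(ok);                    // true only if every lane passed both tests
}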
1952
1953 #define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num) \
1954 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1955 { \
1956 return _Tpvec(vmerge_vvm_##suffix(_Tpvec2(mask.val), b.val, a.val, num)); \
1957 }
1958
1959OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16)
1960OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8)
1961OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4)
1962OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16)
1963OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8)
1964OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4)
1965inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b)
1966{
1967 return v_float32x4((vfloat32m1_t)vmerge_vvm_u32m1((vbool32_t)mask.val, (vuint32m1_t)b.val, (vuint32m1_t)a.val, 4));
1968}
1969 inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b)
1970{
1971 return v_float64x2((vfloat64m1_t)vmerge_vvm_u64m1((vbool64_t)mask.val, (vuint64m1_t)b.val, (vuint64m1_t)a.val, 2));
1972}
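// Usage sketch: v_select takes lanes from the second operand where the mask lane is
// all-ones and from the third otherwise, which pairs naturally with the comparison
// operators. Assuming the portable API from "opencv2/core/hal/intrin.hpp".
#include "opencv2/core/hal/intrin.hpp"
inline void clamp_to_zero(const float* src, float* dst)
{
    using namespace cv;
    v_float32x4 v    = v_load(src);
    v_float32x4 zero = v_setall_f32(0.f);
    v_store(dst, v_select(v < zero, zero, v)); // negative lanes become 0, others pass through
}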
1973
1974 #define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2) \
1975 inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \
1976 { \
1977 _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \
1978 b0.val = vget_##_Tp2##m2_##_Tp2##m1(b, 0); \
1979 b1.val = vget_##_Tp2##m2_##_Tp2##m1(b, 1); \
1980 } \
1981 inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \
1982 { \
1983 _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num2), num2); \
1984 return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
1985 } \
1986 inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \
1987 { \
1988 _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \
1989 return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 1)); \
1990 } \
1991 inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \
1992 { \
1993 _T2##_t val = vle##_v_##_Tp1(ptr, num2); \
1994 _T1##_t b = vw##add##_vv_##_Tp2##m2(val, vmv_v_x_##_Tp1(0, num2), num2); \
1995 return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
1996 }
1997
1998OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1)
1999OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1)
2000OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1)
2001OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1)
2002OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1)
2003OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1)
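// Usage sketch for the widening helpers generated above, assuming the portable API
// from "opencv2/core/hal/intrin.hpp".
#include "opencv2/core/hal/intrin.hpp"
inline unsigned sum_bytes_16(const uchar* data)
{
    using namespace cv;
    v_uint8x16 v = v_load(data);
    v_uint16x8 lo, hi;
    v_expand(v, lo, hi);                       // widen 16 x u8 into two vectors of 8 x u16
    return v_reduce_sum(lo + hi);              // total of all 16 bytes without overflow
}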
2004
2005 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
2006{
2007 vuint16m2_t b = vundefined_u16m2();
2008 vuint32m2_t c = vundefined_u32m2();
2009 vuint8m1_t val = vle_v_u8m1(ptr, 4); \
2010 b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4); \
2011 c = vwaddu_vv_u32m2(vget_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4); \
2012 return v_uint32x4(vget_u32m2_u32m1(c, 0));
2013}
2014
2015 inline v_int32x4 v_load_expand_q(const schar* ptr)
2016{
2017 vint16m2_t b = vundefined_i16m2();
2018 vint32m2_t c = vundefined_i32m2();
2019 vint8m1_t val = vle_v_i8m1(ptr, 4); \
2020 b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4); \
2021 c = vwadd_vv_i32m2(vget_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4); \
2022 return v_int32x4(vget_i32m2_i32m1(c, 0));
2023}
2024 #define VITL_16 (vuint64m2_t){0x1303120211011000, 0x1707160615051404, 0x1B0B1A0A19091808, 0x1F0F1E0E1D0D1C0C}
2025 #define VITL_8 (vuint64m2_t){0x0009000100080000, 0x000B0003000A0002, 0x000D0005000C0004, 0x000F0007000E0006}
2026 #define VITL_4 (vuint64m2_t){0x0000000400000000, 0x0000000500000001, 0x0000000600000002, 0x0000000700000003}
2027 #define VITL_2 (vuint64m2_t){0, 2, 1, 3}
2028 #define LOW_4 0x0000000100000000, 0x0000000500000004
2029 #define LOW_8 0x0003000200010000, 0x000B000A00090008
2030 #define LOW_16 0x0706050403020100, 0x1716151413121110
2031 #define HIGH_4 0x0000000300000002, 0x0000000700000006
2032 #define HIGH_8 0x0007000600050004, 0x000F000E000D000C
2033 #define HIGH_16 0x0F0E0D0C0B0A0908, 0x1F1E1D1C1B1A1918
2034 #define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) \
2035 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
2036 { \
2037 v##_Tp##m2_t tmp = vundefined_##_T##m2();\
2038 tmp = vset_##_T##m2(tmp, 0, a0.val); \
2039 tmp = vset_##_T##m2(tmp, 1, a1.val); \
2040 vuint64m2_t mask = VITL_##num; \
2041 tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2); \
2042 b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \
2043 b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \
2044 } \
2045 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2046 { \
2047 v##_Tp##m1_t b0 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
2048 return v_##_Tpvec(b0);\
2049 } \
2050 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2051 { \
2052 v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \
2053 v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \
2054 v##_Tp##m1_t b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
2055 return v_##_Tpvec(b1);\
2056 } \
2057 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
2058 { \
2059 c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
2060 v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \
2061 v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \
2062 d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
2063 }
2064
2065OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8)
2066OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8)
2067OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4)
2068OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4)
2069OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2)
2070OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2)
2071OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2)
2072OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1)
2073
2074inline v_uint8x16 v_reverse(const v_uint8x16 &a)
2075{
2076 vuint64m1_t mask = (vuint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
2077 return v_uint8x16(vrgather_vv_u8m1(a.val, (vuint8m1_t)mask, 16));
2078}
2079 inline v_int8x16 v_reverse(const v_int8x16 &a)
2080{
2081 vint64m1_t mask = (vint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
2082 return v_int8x16(vrgather_vv_i8m1(a.val, (vuint8m1_t)mask, 16));
2083}
2084
2085 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
2086{
2087 vuint64m1_t mask = (vuint64m1_t){0x0004000500060007, 0x0000000100020003};
2088 return v_uint16x8(vrgather_vv_u16m1(a.val, (vuint16m1_t)mask, 8));
2089}
2090
2091 inline v_int16x8 v_reverse(const v_int16x8 &a)
2092{
2093 vint64m1_t mask = (vint64m1_t){0x0004000500060007, 0x0000000100020003};
2094 return v_int16x8(vrgather_vv_i16m1(a.val, (vuint16m1_t)mask, 8));
2095}
2096 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
2097{
2098 return v_uint32x4(vrgather_vv_u32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4));
2099}
2100
2101 inline v_int32x4 v_reverse(const v_int32x4 &a)
2102{
2103 return v_int32x4(vrgather_vv_i32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4));
2104}
2105
2106 inline v_float32x4 v_reverse(const v_float32x4 &a)
2107{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
2108
2109 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
2110{
2111 return v_uint64x2(a.val[1], a.val[0]);
2112}
2113
2114 inline v_int64x2 v_reverse(const v_int64x2 &a)
2115{
2116 return v_int64x2(a.val[1], a.val[0]);
2117}
2118
2119 inline v_float64x2 v_reverse(const v_float64x2 &a)
2120{
2121 return v_float64x2(a.val[1], a.val[0]);
2122}
2123
2124 #define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \
2125 template <int n> \
2126 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2127 { return v_rotate_right<n>(a, b);}
2128OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint8x16, u8, 0)
2129OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int8x16, s8, 0)
2130OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint16x8, u16, 1)
2131OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int16x8, s16, 1)
2132OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint32x4, u32, 2)
2133OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int32x4, s32, 2)
2134OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint64x2, u64, 3)
2135OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int64x2, s64, 3)
2136OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2)
2137OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3)
2138
2139
2140 #define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix) \
2141 template<int i> inline _Tp v_extract_n(_Tpvec v) { return v.val[i]; }
2142
2143OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8)
2144OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8)
2145OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16)
2146OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16)
2147OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32)
2148OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32)
2149OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64)
2150OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64)
2151OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32)
2152OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64)
2153
2154 #define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \
2155 template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); }
2156
2157OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint8x16, u8, 16)
2158OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int8x16, i8, 16)
2159OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint16x8, u16, 8)
2160OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int16x8, i16, 8)
2161OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint32x4, u32, 4)
2162OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4)
2163OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2)
2164OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2)
2165OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4)
2166inline v_int32x4 v_round(const v_float32x4& a)
2167{
2168 __builtin_riscv_fsrm(0);
2169 vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
2170 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2171 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2172 __builtin_riscv_fsrm(0);
2173 return v_int32x4(val);
2174}
2175 inline v_int32x4 v_floor(const v_float32x4& a)
2176{
2177 __builtin_riscv_fsrm(2);
2178 vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
2179 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2180 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2181 __builtin_riscv_fsrm(0);
2182 return v_int32x4(val);
2183}
2184
2185 inline v_int32x4 v_ceil(const v_float32x4& a)
2186{
2187 __builtin_riscv_fsrm(3);
2188 vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
2189 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2190 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2191 __builtin_riscv_fsrm(0);
2192 return v_int32x4(val);
2193}
2194
2195 inline v_int32x4 v_trunc(const v_float32x4& a)
2196{
2197 __builtin_riscv_fsrm(1);
2198 vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
2199 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2200 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2201 __builtin_riscv_fsrm(0);
2202 return v_int32x4(val);
2203}
2204
2205 inline v_int32x4 v_round(const v_float64x2& a)
2206{
2207 __builtin_riscv_fsrm(0);
2208 vfloat64m2_t _val = vundefined_f64m2();
2209 _val = vset_f64m2(_val, 0, a.val);
2210 //_val = vset_f64m2(_val, 1, a.val);
2211 _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2));
2212 vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4);
2213 __builtin_riscv_fsrm(0);
2214 return v_int32x4(val);
2215}
2216 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2217{
2218 __builtin_riscv_fsrm(0);
2219 vfloat64m2_t _val = vundefined_f64m2();
2220 _val = vset_f64m2(_val, 0, a.val);
2221 _val = vset_f64m2(_val, 1, b.val);
2222 vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4);
2223 __builtin_riscv_fsrm(0);
2224 return v_int32x4(val);
2225}
2226 inline v_int32x4 v_floor(const v_float64x2& a)
2227{
2228 __builtin_riscv_fsrm(2);
2229 vfloat64m2_t _val = vundefined_f64m2();
2230 _val = vset_f64m2(_val, 0, a.val);
2231 vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
2232
2233 vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
2234 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2235 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2236 __builtin_riscv_fsrm(0);
2237 return v_int32x4(val);
2238}
2239
2240 inline v_int32x4 v_ceil(const v_float64x2& a)
2241{
2242 __builtin_riscv_fsrm(3);
2243 vfloat64m2_t _val = vundefined_f64m2();
2244 _val = vset_f64m2(_val, 0, a.val);
2245 vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
2246
2247 vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
2248 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2249 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2250 __builtin_riscv_fsrm(0);
2251 return v_int32x4(val);
2252}
2253
2254 inline v_int32x4 v_trunc(const v_float64x2& a)
2255{
2256 __builtin_riscv_fsrm(1);
2257 vfloat64m2_t _val = vundefined_f64m2();
2258 _val = vset_f64m2(_val, 0, a.val);
2259 vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
2260
2261 vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
2262 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2263 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2264 __builtin_riscv_fsrm(0);
2265 return v_int32x4(val);
2266}
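// Usage sketch: the four conversions above differ only in the rounding mode written to
// FRM before vfcvt (0 = nearest-even, 1 = toward zero, 2 = toward -inf, 3 = toward +inf).
// Assuming the portable API from "opencv2/core/hal/intrin.hpp"; values are illustrative.
#include "opencv2/core/hal/intrin.hpp"
inline void rounding_example()
{
    using namespace cv;
    float src[4] = {1.5f, -1.5f, 2.5f, -2.7f};
    v_float32x4 v = v_load(src);
    v_int32x4 r = v_round(v);   // nearest, ties to even:  2, -2,  2, -3
    v_int32x4 f = v_floor(v);   // toward -inf:            1, -2,  2, -3
    v_int32x4 c = v_ceil(v);    // toward +inf:            2, -1,  3, -2
    v_int32x4 t = v_trunc(v);   // toward zero:            1, -1,  2, -2
    (void)r; (void)f; (void)c; (void)t;
}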
2267
2268 #define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \
2269 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
2270 { \
2271 v##_Tpvec##m1x2_t ret = intrin##2e_v_##_T##m1x2(ptr, num);\
2272 a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \
2273 b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \
2274 } \
2275 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
2276 { \
2277 v##_Tpvec##m1x3_t ret = intrin##3e_v_##_T##m1x3(ptr, num);\
2278 a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \
2279 b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \
2280 c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \
2281 }\
2282 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
2283 v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
2284 { \
2285 v##_Tpvec##m1x4_t ret = intrin##4e_v_##_T##m1x4(ptr, num);\
2286 a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \
2287 b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \
2288 c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \
2289 d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \
2290 } \
2291
2292 #define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \
2293 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2294 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
2295 { \
2296 v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \
2297 ret = vset_##_T##m1x2(ret, 0, a.val); \
2298 ret = vset_##_T##m1x2(ret, 1, b.val); \
2299 intrin##2e_v_##_T##m1x2(ptr, ret, num); \
2300 } \
2301 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2302 const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
2303 { \
2304 v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \
2305 ret = vset_##_T##m1x3(ret, 0, a.val); \
2306 ret = vset_##_T##m1x3(ret, 1, b.val); \
2307 ret = vset_##_T##m1x3(ret, 2, c.val); \
2308 intrin##3e_v_##_T##m1x3(ptr, ret, num); \
2309 } \
2310 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2311 const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
2312 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED ) \
2313 { \
2314 v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \
2315 ret = vset_##_T##m1x4(ret, 0, a.val); \
2316 ret = vset_##_T##m1x4(ret, 1, b.val); \
2317 ret = vset_##_T##m1x4(ret, 2, c.val); \
2318 ret = vset_##_T##m1x4(ret, 3, d.val); \
2319 intrin##4e_v_##_T##m1x4(ptr, ret, num); \
2320 }
2321
2322 #define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T) \
2323 OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T) \
2324 OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T)
2325
2326 //OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, uchar, )
2327OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8)
2328OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16)
2329OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32)
2330
2331OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8)
2332OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16)
2333OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32)
2334
2335 #define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T) \
2336 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
2337 { \
2338 v##_Tpvec##m1x2_t ret = vlseg2e_v_##_T##m1x2(ptr, num); \
2339 a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \
2340 b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \
2341 } \
2342 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
2343 { \
2344 v##_Tpvec##m1x3_t ret = vlseg3e_v_##_T##m1x3(ptr, num); \
2345 a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \
2346 b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \
2347 c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \
2348 }\
2349 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
2350 v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
2351 { \
2352 v##_Tpvec##m1x4_t ret = vlseg4e_v_##_T##m1x4(ptr, num); \
2353 a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \
2354 b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \
2355 c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \
2356 d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \
2357 } \
2358 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2359 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
2360 { \
2361 v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \
2362 ret = vset_##_T##m1x2(ret, 0, a.val); \
2363 ret = vset_##_T##m1x2(ret, 1, b.val); \
2364 vsseg2e_v_##_T##m1x2(ptr, ret, num); \
2365 } \
2366 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2367 const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
2368 { \
2369 v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \
2370 ret = vset_##_T##m1x3(ret, 0, a.val); \
2371 ret = vset_##_T##m1x3(ret, 1, b.val); \
2372 ret = vset_##_T##m1x3(ret, 2, c.val); \
2373 vsseg3e_v_##_T##m1x3(ptr, ret, num); \
2374 } \
2375 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2376 const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
2377 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED ) \
2378 { \
2379 v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \
2380 ret = vset_##_T##m1x4(ret, 0, a.val); \
2381 ret = vset_##_T##m1x4(ret, 1, b.val); \
2382 ret = vset_##_T##m1x4(ret, 2, c.val); \
2383 ret = vset_##_T##m1x4(ret, 3, d.val); \
2384 vsseg4e_v_##_T##m1x4(ptr, ret, num); \
2385 }
2386OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32)
2387OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64)
2388
2389OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64)
2390OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64)
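// Usage sketch for the segment load/store wrappers above: deinterleave packed RGB bytes
// into three planes. Assuming the portable API from "opencv2/core/hal/intrin.hpp";
// rgb must hold 48 interleaved bytes (16 pixels).
#include "opencv2/core/hal/intrin.hpp"
inline void split_rgb_16px(const uchar* rgb, uchar* r, uchar* g, uchar* b)
{
    using namespace cv;
    v_uint8x16 vr, vg, vb;
    v_load_deinterleave(rgb, vr, vg, vb);      // R, G and B lanes land in separate registers
    v_store(r, vr);
    v_store(g, vg);
    v_store(b, vb);
}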
2391
2392 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2393{
2394 return v_float32x4(vfcvt_f_x_v_f32m1(a.val, 4));
2395}
2396
2397 #if CV_SIMD128_64F
2398 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2399{
2400 vfloat64m2_t _val = vundefined_f64m2();
2401 _val = vset_f64m2(_val, 0, a.val);
2402 vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
2403 return v_float32x4(aval);
2404}
2405
2406 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2407{
2408 vfloat64m2_t _val = vundefined_f64m2();
2409 _val = vset_f64m2(_val, 0, a.val);
2410 _val = vset_f64m2(_val, 1, b.val);
2411 vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4);
2412 return v_float32x4(aval);
2413}
2414
2415 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
2416{
2417 vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
2418 vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
2419 return v_float64x2(vget_f64m2_f64m1(_val, 0));
2420}
2421
2422 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
2423{
2424 vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
2425 vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
2426 return v_float64x2(vget_f64m2_f64m1(_val, 1));
2427}
2428
2429 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
2430{
2431 vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
2432 return v_float64x2(vget_f64m2_f64m1(_val, 0));
2433}
2434
2435 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
2436{
2437 vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
2438 return v_float64x2(vget_f64m2_f64m1(_val, 1));
2439}
2440
2441 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
2442{
2443 return v_float64x2(vfcvt_f_x_v_f64m1(a.val, 2));
2444}
2445
2446 #endif
2447 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
2448{
2449 vuint64m1_t m0 = {0x0705060403010200, 0x0F0D0E0C0B090A08};
2450 return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
2451}
2452 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
2453{
2454 return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec)));
2455}
2456
2457 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
2458{
2459 vuint64m1_t m0 = {0x0703060205010400, 0x0F0B0E0A0D090C08};
2460 return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
2461}
2462 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
2463{
2464 return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec)));
2465}
2466
2467 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
2468{
2469 vuint64m1_t m0 = {0x0706030205040100, 0x0F0E0B0A0D0C0908};
2470 return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vec.val, (vuint8m1_t)m0, 16));
2471}
2472 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
2473 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
2474{
2475 vuint64m1_t m0 = {0x0B0A030209080100, 0x0F0E07060D0C0504};
2476 return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
2477}
2478 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
2479
2480 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
2481{
2482 vuint64m1_t m0 = {0x0B0A090803020100, 0x0F0E0D0C07060504};
2483 return v_int32x4((vint32m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
2484}
2485 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2486 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2487 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
2488{
2489 vuint64m1_t m0 = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
2490 return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
2491}
2492 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
2493
2494 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
2495{
2496 vuint64m1_t m0 = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A};
2497 return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
2498}
2499 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
2500
2501 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
2502 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
2503 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
2504
2505 #if CV_SIMD128_64F
2506 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
2507{ return v_cvt_f64(v_dotprod(a, b)); }
2508 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
2509 const v_float64x2& c)
2510{ return v_dotprod_expand(a, b) + c; }
2511 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
2512{
2513 vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
2514 vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), 2);
2515 return v_float64x2(res);
2516}
2517 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
2518{ v_float64x2 res = v_dotprod_expand_fast(a, b);
2519 return res + c; }
2520 #endif
2522 inline v_float32x4 v_load_expand(const float16_t* ptr)
2523{
2524 vfloat16m1_t v = vle_v_f16m1((__fp16*)ptr, 4);
2525 vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4);
2526 return v_float32x4(vget_f32m2_f32m1(v32, 0));
2527}
2528
2529 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
2530{
2531 vfloat32m2_t v32 = vundefined_f32m2();
2532 v32 = vset_f32m2(v32, 0, v.val);
2533 vfloat16m1_t hv = vfncvt_f_f_v_f16m1(v32, 4);
2534 vse_v_f16m1((__fp16*)ptr, hv, 4);
2535}
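// Usage sketch for the float16 helpers above: widen fp16 data to fp32, process it, and
// narrow it back. Assuming the portable API from "opencv2/core/hal/intrin.hpp".
#include "opencv2/core/hal/intrin.hpp"
inline void scale_fp16(const cv::float16_t* src, cv::float16_t* dst, float scale)
{
    using namespace cv;
    v_float32x4 v = v_load_expand(src);        // 4 x fp16 -> 4 x fp32
    v = v * v_setall_f32(scale);               // arithmetic is done in fp32
    v_pack_store(dst, v);                      // 4 x fp32 -> 4 x fp16
}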
2536
2537
2538 inline void v_cleanup() {}
2539
2540CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2541
2543
2544}
2545 #endif