OpenCV 4.5.3 (Japanese machine-translated documentation)
intrin_rvv.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4
5 // The original implementation has been contributed by Yin Zhang.
6 // Copyright (C) 2020, Institute of Software, Chinese Academy of Sciences.
7
8 #ifndef OPENCV_HAL_INTRIN_RVV_HPP
9 #define OPENCV_HAL_INTRIN_RVV_HPP
10
11 #include <algorithm>
12
13 namespace cv
14{
15
16CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
17
18 #define CV_SIMD128 1
19 #define CV_SIMD128_64F 1
20
22
23 struct vuint8mf2_t
24{
25 uchar val[8] = {0};
26 vuint8mf2_t() {}
27 vuint8mf2_t(const uchar* ptr)
28 {
29 for (int i = 0; i < 8; ++i)
30 {
31 val[i] = ptr[i];
32 }
33 }
34};
35 struct vint8mf2_t
36{
37 schar val[8] = {0};
38 vint8mf2_t() {}
39 vint8mf2_t(const schar* ptr)
40 {
41 for (int i = 0; i < 8; ++i)
42 {
43 val[i] = ptr[i];
44 }
45 }
46};
47 struct vuint16mf2_t
48{
49 ushort val[4] = {0};
50 vuint16mf2_t() {}
51 vuint16mf2_t(const ushort* ptr)
52 {
53 for (int i = 0; i < 4; ++i)
54 {
55 val[i] = ptr[i];
56 }
57 }
58};
59 struct vint16mf2_t
60{
61 short val[4] = {0};
62 vint16mf2_t() {}
63 vint16mf2_t(const short* ptr)
64 {
65 for (int i = 0; i < 4; ++i)
66 {
67 val[i] = ptr[i];
68 }
69 }
70};
71 struct vuint32mf2_t
72{
73 unsigned val[2] = {0};
74 vuint32mf2_t() {}
75 vuint32mf2_t(const unsigned* ptr)
76 {
77 val[0] = ptr[0];
78 val[1] = ptr[1];
79 }
80};
81 struct vint32mf2_t
82{
83 int val[2] = {0};
84 vint32mf2_t() {}
85 vint32mf2_t(const int* ptr)
86 {
87 val[0] = ptr[0];
88 val[1] = ptr[1];
89 }
90};
91 struct vfloat32mf2_t
92{
93 float val[2] = {0};
94 vfloat32mf2_t() {}
95 vfloat32mf2_t(const float* ptr)
96 {
97 val[0] = ptr[0];
98 val[1] = ptr[1];
99 }
100};
101 struct vuint64mf2_t
102{
103 uint64 val[1] = {0};
104 vuint64mf2_t() {}
105 vuint64mf2_t(const uint64* ptr)
106 {
107 val[0] = ptr[0];
108 }
109};
110 struct vint64mf2_t
111{
112 int64 val[1] = {0};
113 vint64mf2_t() {}
114 vint64mf2_t(const int64* ptr)
115 {
116 val[0] = ptr[0];
117 }
118};
119 struct vfloat64mf2_t
120{
121 double val[1] = {0};
122 vfloat64mf2_t() {}
123 vfloat64mf2_t(const double* ptr)
124 {
125 val[0] = ptr[0];
126 }
127};
128 struct vuint8mf4_t
129{
130 uchar val[4] = {0};
131 vuint8mf4_t() {}
132 vuint8mf4_t(const uchar* ptr)
133 {
134 for (int i = 0; i < 4; ++i)
135 {
136 val[i] = ptr[i];
137 }
138 }
139};
140 struct vint8mf4_t
141{
142 schar val[4] = {0};
143 vint8mf4_t() {}
144 vint8mf4_t(const schar* ptr)
145 {
146 for (int i = 0; i < 4; ++i)
147 {
148 val[i] = ptr[i];
149 }
150 }
151};
152
153 #define OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(_Tpvec, _Tp, suffix, width, n) \
154 inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr) \
155 { \
156 return _Tpvec(ptr); \
157 } \
158 inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v) \
159 { \
160 for (int i = 0; i < n; ++i) \
161 { \
162 ptr[i] = v.val[i]; \
163 } \
164 }
165
166OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint8mf2_t, uint8_t, u8, 8, 8)
167OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint8mf2_t, int8_t, i8, 8, 8)
168OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint16mf2_t, uint16_t, u16, 16, 4)
169OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint16mf2_t, int16_t, i16, 16, 4)
170OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint32mf2_t, uint32_t, u32, 32, 2)
171OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint32mf2_t, int32_t, i32, 32, 2)
172OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat32mf2_t, float32_t, f32, 32, 2)
173OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint64mf2_t, uint64_t, u64, 64, 1)
174OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint64mf2_t, int64_t, i64, 64, 1)
175OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat64mf2_t, float64_t, f64, 64, 1)
176
177
178 #define OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(_Tpwvec, _Tpvec, _wTp, wcvt, suffix, width, n) \
179 inline _Tpwvec wcvt (_Tpvec v) \
180 { \
181 _wTp tmp[n]; \
182 for (int i = 0; i < n; ++i) \
183 { \
184 tmp[i] = (_wTp)v.val[i]; \
185 } \
186 vsetvlmax_e##width##m1(); \
187 return vle##width##_v_##suffix##m1(tmp); \
188 }
189
190OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint16m1_t, vuint8mf2_t, ushort, vwcvtu_x_x_v_u16m1, u16, 16, 8)
191OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint16m1_t, vint8mf2_t, short, vwcvt_x_x_v_i16m1, i16, 16, 8)
192OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint32m1_t, vuint16mf2_t, unsigned, vwcvtu_x_x_v_u32m1, u32, 32, 4)
193OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint32m1_t, vint16mf2_t, int, vwcvt_x_x_v_i32m1, i32, 32, 4)
194OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint64m1_t, vuint32mf2_t, uint64, vwcvtu_x_x_v_u64m1, u64, 64, 2)
195OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint64m1_t, vint32mf2_t, int64, vwcvt_x_x_v_i64m1, i64, 64, 2)
196
197inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base)
198{
199 return vuint8mf4_t(base);
200}
201 inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base)
202{
203 return vint8mf4_t(base);
204}
205
206 inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src)
207{
208 ushort tmp[4];
209 for (int i = 0; i < 4; ++i)
210 {
211 tmp[i] = (ushort)src.val[i];
212 }
213 return vle16_v_u16mf2(tmp);
214}
215 inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src)
216{
217 short tmp[4];
218 for (int i = 0; i < 4; ++i)
219 {
220 tmp[i] = (short)src.val[i];
221 }
222 return vle16_v_i16mf2(tmp);
223}
224
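Note: the structs and helpers above emulate the fractional-LMUL RVV types (vuint8mf2_t, vuint8mf4_t, ...) and their load/widen intrinsics with plain scalar loops, for toolchains that do not yet provide them. A rough, purely illustrative sketch of how the emulated widening path behaves (internal to this header, not a user-facing API):

    uchar src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    vuint8mf2_t half = vle8_v_u8mf2(src);         // emulated 8-lane load into val[]
    vuint16m1_t wide = vwcvtu_x_x_v_u16m1(half);  // scalar loop widens to 16-bit lanes
    ushort dst[8];
    vsetvlmax_e16m1();
    vse16_v_u16m1(dst, wide);                     // dst = {1, 2, ..., 8}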
226
227 struct v_uint8x16
228{
229 typedef uchar lane_type;
230 enum { nlanes = 16 };
231
232 v_uint8x16() {}
233 explicit v_uint8x16(vuint8m1_t v)
234 {
235 vsetvlmax_e8m1();
236 vse8_v_u8m1(val, v);
237 }
238 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
239 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
240 {
241 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
242 for (int i = 0; i < nlanes; ++i)
243 {
244 val[i] = v[i];
245 }
246 }
247 operator vuint8m1_t() const
248 {
249 vsetvlmax_e8m1();
250 return vle8_v_u8m1(val);
251 }
252 uchar get0() const
253 {
254 return val[0];
255 }
256
257 uchar val[16];
258};
259
260 struct v_int8x16
261{
262 typedef schar lane_type;
263 enum { nlanes = 16 };
264
265 v_int8x16() {}
266 explicit v_int8x16(vint8m1_t v)
267 {
268 vsetvlmax_e8m1();
269 vse8_v_i8m1(val, v);
270 }
271 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
272 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
273 {
274 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
275 for (int i = 0; i < nlanes; ++i)
276 {
277 val[i] = v[i];
278 }
279 }
280 operator vint8m1_t() const
281 {
282 vsetvlmax_e8m1();
283 return vle8_v_i8m1(val);
284 }
285 schar get0() const
286 {
287 return val[0];
288 }
289
290 schar val[16];
291};
292
293 struct v_uint16x8
294{
295 typedef ushort lane_type;
296 enum { nlanes = 8 };
297
298 v_uint16x8() {}
299 explicit v_uint16x8(vuint16m1_t v)
300 {
301 vsetvlmax_e16m1();
302 vse16_v_u16m1(val, v);
303 }
304 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
305 {
306 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
307 for (int i = 0; i < nlanes; ++i)
308 {
309 val[i] = v[i];
310 }
311 }
312 operator vuint16m1_t() const
313 {
314 vsetvlmax_e16m1();
315 return vle16_v_u16m1(val);
316 }
317 ushort get0() const
318 {
319 return val[0];
320 }
321
322 ushort val[8];
323};
324
325 struct v_int16x8
326{
327 typedef short lane_type;
328 enum { nlanes = 8 };
329
330 v_int16x8() {}
331 explicit v_int16x8(vint16m1_t v)
332 {
333 vsetvlmax_e16m1();
334 vse16_v_i16m1(val, v);
335 }
336 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
337 {
338 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
339 for (int i = 0; i < nlanes; ++i)
340 {
341 val[i] = v[i];
342 }
343 }
344 operator vint16m1_t() const
345 {
346 vsetvlmax_e16m1();
347 return vle16_v_i16m1(val);
348 }
349 short get0() const
350 {
351 return val[0];
352 }
353
354 short val[8];
355};
356
357 struct v_uint32x4
358{
359 typedef unsigned lane_type;
360 enum { nlanes = 4 };
361
362 v_uint32x4() {}
363 explicit v_uint32x4(vuint32m1_t v)
364 {
365 vsetvlmax_e32m1();
366 vse32_v_u32m1(val, v);
367 }
368 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
369 {
370 unsigned v[] = {v0, v1, v2, v3};
371 for (int i = 0; i < nlanes; ++i)
372 {
373 val[i] = v[i];
374 }
375 }
376 operator vuint32m1_t() const
377 {
378 vsetvlmax_e32m1();
379 return vle32_v_u32m1(val);
380 }
381 unsigned get0() const
382 {
383 return val[0];
384 }
385
386 unsigned val[4];
387};
388
389 struct v_int32x4
390{
391 typedef int lane_type;
392 enum { nlanes = 4 };
393
394 v_int32x4() {}
395 explicit v_int32x4(vint32m1_t v)
396 {
397 vsetvlmax_e32m1();
398 vse32_v_i32m1(val, v);
399 }
400 v_int32x4(int v0, int v1, int v2, int v3)
401 {
402 int v[] = {v0, v1, v2, v3};
403 for (int i = 0; i < nlanes; ++i)
404 {
405 val[i] = v[i];
406 }
407 }
408 operator vint32m1_t() const
409 {
410 vsetvlmax_e32m1();
411 return vle32_v_i32m1(val);
412 }
413 int get0() const
414 {
415 return val[0];
416 }
417 int val[4];
418};
419
420 struct v_float32x4
421{
422 typedef float lane_type;
423 enum { nlanes = 4 };
424
425 v_float32x4() {}
426 explicit v_float32x4(vfloat32m1_t v)
427 {
428 vsetvlmax_e32m1();
429 vse32_v_f32m1(val, v);
430 }
431 v_float32x4(float v0, float v1, float v2, float v3)
432 {
433 float v[] = {v0, v1, v2, v3};
434 for (int i = 0; i < nlanes; ++i)
435 {
436 val[i] = v[i];
437 }
438 }
439 operator vfloat32m1_t() const
440 {
441 vsetvlmax_e32m1();
442 return vle32_v_f32m1(val);
443 }
444 float get0() const
445 {
446 return val[0];
447 }
448 float val[4];
449};
450
451 struct v_uint64x2
452{
453 typedef uint64 lane_type;
454 enum { nlanes = 2 };
455
456 v_uint64x2() {}
457 explicit v_uint64x2(vuint64m1_t v)
458 {
459 vsetvlmax_e64m1();
460 vse64_v_u64m1(val, v);
461 }
462 v_uint64x2(uint64 v0, uint64 v1)
463 {
464 uint64 v[] = {v0, v1};
465 for (int i = 0; i < nlanes; ++i)
466 {
467 val[i] = v[i];
468 }
469 }
470 operator vuint64m1_t() const
471 {
472 vsetvlmax_e64m1();
473 return vle64_v_u64m1(val);
474 }
475 uint64 get0() const
476 {
477 return val[0];
478 }
479
480 uint64 val[2];
481};
482
483 struct v_int64x2
484{
485 typedef int64 lane_type;
486 enum { nlanes = 2 };
487
488 v_int64x2() {}
489 explicit v_int64x2(vint64m1_t v)
490 {
491 vsetvlmax_e64m1();
492 vse64_v_i64m1(val, v);
493 }
494 v_int64x2(int64 v0, int64 v1)
495 {
496 int64 v[] = {v0, v1};
497 for (int i = 0; i < nlanes; ++i)
498 {
499 val[i] = v[i];
500 }
501 }
502 operator vint64m1_t() const
503 {
504 vsetvlmax_e64m1();
505 return vle64_v_i64m1(val);
506 }
507 int64 get0() const
508 {
509 return val[0];
510 }
511
512 int64 val[2];
513};
514
515 #if CV_SIMD128_64F
516 struct v_float64x2
517{
518 typedef double lane_type;
519 enum { nlanes = 2 };
520
521 v_float64x2() {}
522 explicit v_float64x2(vfloat64m1_t v)
523 {
524 vsetvlmax_e64m1();
525 vse64_v_f64m1(val, v);
526 }
527 v_float64x2(double v0, double v1)
528 {
529 double v[] = {v0, v1};
530 for (int i = 0; i < nlanes; ++i)
531 {
532 val[i] = v[i];
533 }
534 }
535 operator vfloat64m1_t() const
536 {
537 vsetvlmax_e64m1();
538 return vle64_v_f64m1(val);
539 }
540 double get0() const
541 {
542 return val[0];
543 }
544
545 double val[2];
546};
547 #endif
548
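The v_uint8x16 ... v_float64x2 structs above are the 128-bit universal-intrinsic register types of this backend: the lanes are mirrored in a plain val[] array and converted to/from the native m1 RVV types on demand. A minimal usage sketch, assuming an RVV-enabled build where opencv2/core/hal/intrin.hpp selects this header:

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    void demo()
    {
        v_float32x4 v(1.f, 2.f, 3.f, 4.f);  // per-lane constructor defined above
        float first = v.get0();             // first lane -> 1.f
        vfloat32m1_t native = v;            // implicit conversion to the native type
        v_float32x4 back(native);           // explicit conversion back
        (void)first; (void)back;
    }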
549
551
552 #define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, width, suffix1, suffix2) \
553 inline v_##_Tpvec v_setzero_##suffix1() \
554 { \
555 vsetvlmax_e##width##m1(); \
556 return v_##_Tpvec(vzero_##suffix2##m1()); \
557 } \
558 inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
559 { \
560 vsetvlmax_e##width##m1(); \
561 return v_##_Tpvec(vmv_v_x_##suffix2##m1(v)); \
562 }
563
564OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, 8, u8, u8)
565OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, 8, s8, i8)
566OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, 16, u16, u16)
567OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, 16, s16, i16)
568OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, 32, u32, u32)
569OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, 32, s32, i32)
570OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, 64, u64, u64)
571OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, 64, s64, i64)
572
573 #define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, width, suffix) \
574 inline v_##_Tpv v_setzero_##suffix() \
575 { \
576 vsetvlmax_e##width##m1(); \
577 return v_##_Tpv(vzero_##suffix##m1()); \
578 } \
579 inline v_##_Tpv v_setall_##suffix(_Tp v) \
580 { \
581 vsetvlmax_e##width##m1(); \
582 return v_##_Tpv(vfmv_v_f_##suffix##m1(v)); \
583 }
584
585OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, 32, f32)
586 #if CV_SIMD128_64F
587OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, 64, f64)
588 #endif
589
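These generated initializers broadcast a scalar (or zero) across all lanes. For example, with the same setup as the sketch above:

    v_int32x4   zeros = v_setzero_s32();     // {0, 0, 0, 0}
    v_float32x4 fours = v_setall_f32(4.0f);  // {4.0f, 4.0f, 4.0f, 4.0f}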
591
592 #define OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(_Tpvec, suffix) \
593 inline v_##_Tpvec v_reinterpret_as_##suffix(const v_##_Tpvec& v) { return v; }
594
595OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint8x16, u8)
596OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int8x16, s8)
597OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint16x8, u16)
598OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int16x8, s16)
599OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint32x4, u32)
600OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int32x4, s32)
601OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float32x4, f32)
602OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint64x2, u64)
603OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int64x2, s64)
604 #if CV_SIMD128_64F
605OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float64x2, f64)
606 #endif
607
608 #define OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(_Tpvec1, _Tpvec2, _nTpvec1, _nTpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
609 inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
610 { \
611 vsetvlmax_e##width2##m1(); \
612 return v_##_Tpvec1((_nTpvec1)vle##width2##_v_##nsuffix2##m1(v.val)); \
613 } \
614 inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
615 { \
616 vsetvlmax_e##width1##m1(); \
617 return v_##_Tpvec2((_nTpvec2)vle##width1##_v_##nsuffix1##m1(v.val)); \
618 }
619
620OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int8x16, vuint8m1_t, vint8m1_t, u8, s8, u8, i8, 8, 8)
621OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int16x8, vuint16m1_t, vint16m1_t, u16, s16, u16, i16, 16, 16)
622OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int32x4, vuint32m1_t, vint32m1_t, u32, s32, u32, i32, 32, 32)
623OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float32x4, vuint32m1_t, vfloat32m1_t, u32, f32, u32, f32, 32, 32)
624OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float32x4, vint32m1_t, vfloat32m1_t, s32, f32, i32, f32, 32, 32)
625OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int64x2, vuint64m1_t, vint64m1_t, u64, s64, u64, i64, 64, 64)
626OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint16x8, vuint8m1_t, vuint16m1_t, u8, u16, u8, u16, 8, 16)
627OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint32x4, vuint8m1_t, vuint32m1_t, u8, u32, u8, u32, 8, 32)
628OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint64x2, vuint8m1_t, vuint64m1_t, u8, u64, u8, u64, 8, 64)
629OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint32x4, vuint16m1_t, vuint32m1_t, u16, u32, u16, u32, 16, 32)
630OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint64x2, vuint16m1_t, vuint64m1_t, u16, u64, u16, u64, 16, 64)
631OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, uint64x2, vuint32m1_t, vuint64m1_t, u32, u64, u32, u64, 32, 64)
632OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int16x8, vint8m1_t, vint16m1_t, s8, s16, i8, i16, 8, 16)
633OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int32x4, vint8m1_t, vint32m1_t, s8, s32, i8, i32, 8, 32)
634OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int64x2, vint8m1_t, vint64m1_t, s8, s64, i8, i64, 8, 64)
635OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int32x4, vint16m1_t, vint32m1_t, s16, s32, i16, i32, 16, 32)
636OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int64x2, vint16m1_t, vint64m1_t, s16, s64, i16, i64, 16, 64)
637OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, int64x2, vint32m1_t, vint64m1_t, s32, s64, i32, i64, 32, 64)
638OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int16x8, vuint8m1_t, vint16m1_t, u8, s16, u8, i16, 8, 16)
639OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int32x4, vuint8m1_t, vint32m1_t, u8, s32, u8, i32, 8, 32)
640OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int64x2, vuint8m1_t, vint64m1_t, u8, s64, u8, i64, 8, 64)
641OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int8x16, vuint16m1_t, vint8m1_t, u16, s8, u16, i8, 16, 8)
642OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int32x4, vuint16m1_t, vint32m1_t, u16, s32, u16, i32, 16, 32)
643OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int64x2, vuint16m1_t, vint64m1_t, u16, s64, u16, i64, 16, 64)
644OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int8x16, vuint32m1_t, vint8m1_t, u32, s8, u32, i8, 32, 8)
645OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int16x8, vuint32m1_t, vint16m1_t, u32, s16, u32, i16, 32, 16)
646OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int64x2, vuint32m1_t, vint64m1_t, u32, s64, u32, i64, 32, 64)
647OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int8x16, vuint64m1_t, vint8m1_t, u64, s8, u64, i8, 64, 8)
648OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int16x8, vuint64m1_t, vint16m1_t, u64, s16, u64, i16, 64, 16)
649OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int32x4, vuint64m1_t, vint32m1_t, u64, s32, u64, i32, 64, 32)
650OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float32x4, vuint8m1_t, vfloat32m1_t, u8, f32, u8, f32, 8, 32)
651OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float32x4, vuint16m1_t, vfloat32m1_t, u16, f32, u16, f32, 16, 32)
652OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float32x4, vuint64m1_t, vfloat32m1_t, u64, f32, u64, f32, 64, 32)
653OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float32x4, vint8m1_t, vfloat32m1_t, s8, f32, i8, f32, 8, 32)
654OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float32x4, vint16m1_t, vfloat32m1_t, s16, f32, i16, f32, 16, 32)
655OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float32x4, vint64m1_t, vfloat32m1_t, s64, f32, i64, f32, 64, 32)
656 #if CV_SIMD128_64F
657OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float64x2, vuint64m1_t, vfloat64m1_t, u64, f64, u64, f64, 64, 64)
658OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float64x2, vint64m1_t, vfloat64m1_t, s64, f64, i64, f64, 64, 64)
659OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float64x2, vuint8m1_t, vfloat64m1_t, u8, f64, u8, f64, 8, 64)
660OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float64x2, vuint16m1_t, vfloat64m1_t, u16, f64, u16, f64, 16, 64)
661OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float64x2, vuint32m1_t, vfloat64m1_t, u32, f64, u32, f64, 32, 64)
662OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float64x2, vint8m1_t, vfloat64m1_t, s8, f64, i8, f64, 8, 64)
663OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float64x2, vint16m1_t, vfloat64m1_t, s16, f64, i16, f64, 16, 64)
664OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float64x2, vint32m1_t, vfloat64m1_t, s32, f64, i32, f64, 32, 64)
665OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(float32x4, float64x2, vfloat32m1_t, vfloat64m1_t, f32, f64, f32, f64, 32, 64)
666 #endif
667
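v_reinterpret_as_* keeps the 128 bits unchanged and only changes how the lanes are typed; in this backend it round-trips through the val[] arrays. A small sketch:

    v_int32x4   bits(0x3f800000, 0, 0, 0);         // IEEE-754 pattern of 1.0f in lane 0
    v_float32x4 f    = v_reinterpret_as_f32(bits); // lane 0 now reads as 1.0f
    v_int32x4   back = v_reinterpret_as_s32(f);    // original bit pattern again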
669
670 #define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, suffix, width, vmv) \
671 template <int s> \
672 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
673 { \
674 vsetvlmax_e##width##m1(); \
675 return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, s), b, _Tpvec::nlanes - s)); \
676 } \
677 template<int i> inline _Tp v_extract_n(_Tpvec v) \
678 { \
679 vsetvlmax_e##width##m1(); \
680 return _Tp(vmv(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), v, i))); \
681 }
682
683
684OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8x16, uchar, u8, 8, vmv_x_s_u8m1_u8)
685OPENCV_HAL_IMPL_RVV_EXTRACT(v_int8x16, schar, i8, 8, vmv_x_s_i8m1_i8)
686OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint16x8, ushort, u16, 16, vmv_x_s_u16m1_u16)
687OPENCV_HAL_IMPL_RVV_EXTRACT(v_int16x8, short, i16, 16, vmv_x_s_i16m1_i16)
688OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32x4, uint, u32, 32, vmv_x_s_u32m1_u32)
689OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32x4, int, i32, 32, vmv_x_s_i32m1_i32)
690OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64x2, uint64, u64, 64, vmv_x_s_u64m1_u64)
691OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64x2, int64, i64, 64, vmv_x_s_i64m1_i64)
692OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32x4, float, f32, 32, vfmv_f_s_f32m1_f32)
693 #if CV_SIMD128_64F
694OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64x2, double, f64, 64, vfmv_f_s_f64m1_f64)
695 #endif
696
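v_extract<s> returns lanes s..s+nlanes-1 of the concatenation (a, b), and v_extract_n<i> reads one lane as a scalar. For example:

    v_int32x4 a(0, 1, 2, 3), b(4, 5, 6, 7);
    int       lane1  = v_extract_n<1>(a);  // 1
    v_int32x4 merged = v_extract<2>(a, b); // {2, 3, 4, 5}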
698
699 #define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, width, suffix) \
700 inline _Tpvec v_load(const _Tp* ptr) \
701 { \
702 vsetvlmax_e8m1(); \
703 return _Tpvec((_nTpvec)vle8_v_u8m1((uchar*)ptr)); \
704 } \
705 inline _Tpvec v_load_aligned(const _Tp* ptr) \
706 { \
707 vsetvlmax_e##width##m1(); \
708 return _Tpvec(vle##width##_v_##suffix##m1(ptr)); \
709 } \
710 inline _Tpvec v_load_low(const _Tp* ptr) \
711 { \
712 vsetvl_e##width##m1(hvl); \
713 _Tpvec res = _Tpvec(vle##width##_v_##suffix##m1(ptr)); \
714 vsetvlmax_e##width##m1(); \
715 return res; \
716 } \
717 inline void v_store(_Tp* ptr, const _Tpvec& a) \
718 { \
719 vsetvlmax_e8m1(); \
720 vse8_v_u8m1((uchar*)ptr, vle8_v_u8m1((uchar*)a.val)); \
721 } \
722 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
723 { \
724 vsetvlmax_e##width##m1(); \
725 vse##width##_v_##suffix##m1(ptr, a); \
726 } \
727 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
728 { \
729 vsetvlmax_e##width##m1(); \
730 vse##width##_v_##suffix##m1(ptr, a); \
731 } \
732 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/ ) \
733 { \
734 vsetvlmax_e##width##m1(); \
735 vse##width##_v_##suffix##m1(ptr, a); \
736 } \
737 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
738 { \
739 _Tp CV_DECL_ALIGNED(32) tmp_ptr[_Tpvec::nlanes] = {0}; \
740 vsetvlmax_e##width##m1(); \
741 vse##width##_v_##suffix##m1(tmp_ptr, a); \
742 for(int i = 0; i < _Tpvec::nlanes/2; ++i) \
743 { \
744 ptr[i] = tmp_ptr[i]; \
745 } \
746 } \
747 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
748 { \
749 _Tp CV_DECL_ALIGNED(32) tmp_ptr[_Tpvec::nlanes] = {0}; \
750 vsetvlmax_e##width##m1(); \
751 vse##width##_v_##suffix##m1(tmp_ptr, a); \
752 for(int i = 0; i < _Tpvec::nlanes/2; ++i) \
753 { \
754 ptr[i] = tmp_ptr[i+_Tpvec::nlanes/2]; \
755 } \
756 }
757
758OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 8, u8)
759OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 8, i8)
760OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 16, u16)
761OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 16, i16)
762OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 32, u32)
763OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 32, i32)
764OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 64, u64)
765OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 64, i64)
766OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 32, f32)
767 #if CV_SIMD128_64F
768OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 64, f64)
769 #endif
770
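A short sketch of the generated load/store helpers (v_load tolerates unaligned pointers, while the *_aligned variants assume 16-byte alignment):

    float buf[4] = {1.f, 2.f, 3.f, 4.f};
    v_float32x4 v = v_load(buf);   // load 4 floats
    float out[4], lo[2];
    v_store(out, v);               // store all 4 lanes
    v_store_low(lo, v);            // lo = {1.f, 2.f}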
771 inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1)
772{
773 schar CV_DECL_ALIGNED(32) elems[16] =
774 {
775 ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr0[4], ptr0[5], ptr0[6], ptr0[7],
776 ptr1[0], ptr1[1], ptr1[2], ptr1[3], ptr1[4], ptr1[5], ptr1[6], ptr1[7]
777 };
778 vsetvlmax_e8m1();
779 return v_int8x16(vle8_v_i8m1(elems));
780}
781 inline v_uint8x16 v_load_halves(const uchar* ptr0, const uchar* ptr1) { return v_reinterpret_as_u8(v_load_halves((schar*)ptr0, (schar*)ptr1)); }
782
783 inline v_int16x8 v_load_halves(const short* ptr0, const short* ptr1)
784{
785 short CV_DECL_ALIGNED(32) elems[8] =
786 {
787 ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr1[0], ptr1[1], ptr1[2], ptr1[3]
788 };
789 vsetvlmax_e16m1();
790 return v_int16x8(vle16_v_i16m1(elems));
791}
792 inline v_uint16x8 v_load_halves(const ushort* ptr0, const ushort* ptr1) { return v_reinterpret_as_u16(v_load_halves((short*)ptr0, (short*)ptr1)); }
793
794 inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1)
795{
796 int CV_DECL_ALIGNED(32) elems[4] =
797 {
798 ptr0[0], ptr0[1], ptr1[0], ptr1[1]
799 };
800 vsetvlmax_e32m1();
801 return v_int32x4(vle32_v_i32m1(elems));
802}
803 inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1)
804{
805 float CV_DECL_ALIGNED(32) elems[4] =
806 {
807 ptr0[0], ptr0[1], ptr1[0], ptr1[1]
808 };
809 vsetvlmax_e32m1();
810 return v_float32x4(vle32_v_f32m1(elems));
811}
812 inline v_uint32x4 v_load_halves(const unsigned* ptr0, const unsigned* ptr1) { return v_reinterpret_as_u32(v_load_halves((int*)ptr0, (int*)ptr1)); }
813
814 inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1)
815{
816 int64 CV_DECL_ALIGNED(32) elems[2] =
817 {
818 ptr0[0], ptr1[0]
819 };
820 vsetvlmax_e64m1();
821 return v_int64x2(vle64_v_i64m1(elems));
822}
823 inline v_uint64x2 v_load_halves(const uint64* ptr0, const uint64* ptr1) { return v_reinterpret_as_u64(v_load_halves((int64*)ptr0, (int64*)ptr1)); }
824
825 #if CV_SIMD128_64F
826 inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1)
827{
828 double CV_DECL_ALIGNED(32) elems[2] =
829 {
830 ptr0[0], ptr1[0]
831 };
832 vsetvlmax_e64m1();
833 return v_float64x2(vle64_v_f64m1(elems));
834}
835 #endif
836
837
839
840 inline v_int8x16 v_lut(const schar* tab, const int* idx)
841{
842 schar CV_DECL_ALIGNED(32) elems[16] =
843 {
844 tab[idx[ 0]],
845 tab[idx[ 1]],
846 tab[idx[ 2]],
847 tab[idx[ 3]],
848 tab[idx[ 4]],
849 tab[idx[ 5]],
850 tab[idx[ 6]],
851 tab[idx[ 7]],
852 tab[idx[ 8]],
853 tab[idx[ 9]],
854 tab[idx[10]],
855 tab[idx[11]],
856 tab[idx[12]],
857 tab[idx[13]],
858 tab[idx[14]],
859 tab[idx[15]]
860 };
861 vsetvlmax_e8m1();
862 return v_int8x16(vle8_v_i8m1(elems));
863}
864 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
865{
866 schar CV_DECL_ALIGNED(32) elems[16] =
867 {
868 tab[idx[0]],
869 tab[idx[0] + 1],
870 tab[idx[1]],
871 tab[idx[1] + 1],
872 tab[idx[2]],
873 tab[idx[2] + 1],
874 tab[idx[3]],
875 tab[idx[3] + 1],
876 tab[idx[4]],
877 tab[idx[4] + 1],
878 tab[idx[5]],
879 tab[idx[5] + 1],
880 tab[idx[6]],
881 tab[idx[6] + 1],
882 tab[idx[7]],
883 tab[idx[7] + 1]
884 };
885 vsetvlmax_e8m1();
886 return v_int8x16(vle8_v_i8m1(elems));
887}
888 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
889{
890 schar CV_DECL_ALIGNED(32) elems[16] =
891 {
892 tab[idx[0]],
893 tab[idx[0] + 1],
894 tab[idx[0] + 2],
895 tab[idx[0] + 3],
896 tab[idx[1]],
897 tab[idx[1] + 1],
898 tab[idx[1] + 2],
899 tab[idx[1] + 3],
900 tab[idx[2]],
901 tab[idx[2] + 1],
902 tab[idx[2] + 2],
903 tab[idx[2] + 3],
904 tab[idx[3]],
905 tab[idx[3] + 1],
906 tab[idx[3] + 2],
907 tab[idx[3] + 3]
908 };
909 vsetvlmax_e8m1();
910 return v_int8x16(vle8_v_i8m1(elems));
911}
912 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
913 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
914 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
915
916 inline v_int16x8 v_lut(const short* tab, const int* idx)
917{
918 short CV_DECL_ALIGNED(32) elems[8] =
919 {
920 tab[idx[0]],
921 tab[idx[1]],
922 tab[idx[2]],
923 tab[idx[3]],
924 tab[idx[4]],
925 tab[idx[5]],
926 tab[idx[6]],
927 tab[idx[7]]
928 };
929 vsetvlmax_e16m1();
930 return v_int16x8(vle16_v_i16m1(elems));
931}
932 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
933{
934 short CV_DECL_ALIGNED(32) elems[8] =
935 {
936 tab[idx[0]],
937 tab[idx[0] + 1],
938 tab[idx[1]],
939 tab[idx[1] + 1],
940 tab[idx[2]],
941 tab[idx[2] + 1],
942 tab[idx[3]],
943 tab[idx[3] + 1]
944 };
945 vsetvlmax_e16m1();
946 return v_int16x8(vle16_v_i16m1(elems));
947}
948 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
949{
950 short CV_DECL_ALIGNED(32) elems[8] =
951 {
952 tab[idx[0]],
953 tab[idx[0] + 1],
954 tab[idx[0] + 2],
955 tab[idx[0] + 3],
956 tab[idx[1]],
957 tab[idx[1] + 1],
958 tab[idx[1] + 2],
959 tab[idx[1] + 3]
960 };
961 vsetvlmax_e16m1();
962 return v_int16x8(vle16_v_i16m1(elems));
963}
964 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
965 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
966 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
967
968 inline v_int32x4 v_lut(const int* tab, const int* idx)
969{
970 int CV_DECL_ALIGNED(32) elems[4] =
971 {
972 tab[idx[0]],
973 tab[idx[1]],
974 tab[idx[2]],
975 tab[idx[3]]
976 };
977 vsetvlmax_e32m1();
978 return v_int32x4(vle32_v_i32m1(elems));
979}
980 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
981{
982 int CV_DECL_ALIGNED(32) elems[4] =
983 {
984 tab[idx[0]],
985 tab[idx[0] + 1],
986 tab[idx[1]],
987 tab[idx[1] + 1]
988 };
989 vsetvlmax_e32m1();
990 return v_int32x4(vle32_v_i32m1(elems));
991}
992 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
993{
994 vsetvlmax_e32m1();
995 return v_int32x4(vle32_v_i32m1(tab + idx[0]));
996}
997
998 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
999 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
1000 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
1001
1002 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1003{
1004 int64_t CV_DECL_ALIGNED(32) elems[2] =
1005 {
1006 tab[idx[0]],
1007 tab[idx[1]]
1008 };
1009 vsetvlmax_e64m1();
1010 return v_int64x2(vle64_v_i64m1(elems));
1011}
1012 inline v_int64x2 v_lut_pairs(const int64* tab, const int* idx)
1013{
1014 vsetvlmax_e64m1();
1015 return v_int64x2(vle64_v_i64m1(tab + idx[0]));
1016}
1017 inline v_uint64x2 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
1018 inline v_uint64x2 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
1019
1020 inline v_float32x4 v_lut(const float* tab, const int* idx)
1021{
1022 float CV_DECL_ALIGNED(32) elems[4] =
1023 {
1024 tab[idx[0]],
1025 tab[idx[1]],
1026 tab[idx[2]],
1027 tab[idx[3]]
1028 };
1029 vsetvlmax_e32m1();
1030 return v_float32x4(vle32_v_f32m1(elems));
1031}
1032 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
1033{
1034 float CV_DECL_ALIGNED(32) elems[4] =
1035 {
1036 tab[idx[0]],
1037 tab[idx[0] + 1],
1038 tab[idx[1]],
1039 tab[idx[1] + 1]
1040 };
1041 vsetvlmax_e32m1();
1042 return v_float32x4(vle32_v_f32m1(elems));
1043}
1044 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
1045{
1046 vsetvlmax_e32m1();
1047 return v_float32x4(vle32_v_f32m1(tab + idx[0]));
1048}
1049
1050 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1051{
1052 int CV_DECL_ALIGNED(32) elems[4] =
1053 {
1054 tab[v_extract_n<0>(idxvec)],
1055 tab[v_extract_n<1>(idxvec)],
1056 tab[v_extract_n<2>(idxvec)],
1057 tab[v_extract_n<3>(idxvec)]
1058 };
1059 vsetvlmax_e32m1();
1060 return v_int32x4(vle32_v_i32m1(elems));
1061}
1062
1063 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1064{
1065 unsigned CV_DECL_ALIGNED(32) elems[4] =
1066 {
1067 tab[v_extract_n<0>(idxvec)],
1068 tab[v_extract_n<1>(idxvec)],
1069 tab[v_extract_n<2>(idxvec)],
1070 tab[v_extract_n<3>(idxvec)]
1071 };
1072 vsetvlmax_e32m1();
1073 return v_uint32x4(vle32_v_u32m1(elems));
1074}
1075
1076 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1077{
1078 float CV_DECL_ALIGNED(32) elems[4] =
1079 {
1080 tab[v_extract_n<0>(idxvec)],
1081 tab[v_extract_n<1>(idxvec)],
1082 tab[v_extract_n<2>(idxvec)],
1083 tab[v_extract_n<3>(idxvec)]
1084 };
1085 vsetvlmax_e32m1();
1086 return v_float32x4(vle32_v_f32m1(elems));
1087}
1088
1089 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1090{
1091 int CV_DECL_ALIGNED(32) idx[4];
1092 v_store_aligned(idx, idxvec);
1093
1094 x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1095 y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
1096}
1097
1098 #if CV_SIMD128_64F
1099 inline v_float64x2 v_lut(const double* tab, const int* idx)
1100{
1101 double CV_DECL_ALIGNED(32) elems[2] =
1102 {
1103 tab[idx[0]],
1104 tab[idx[1]]
1105 };
1106 vsetvlmax_e64m1();
1107 return v_float64x2(vle64_v_f64m1(elems));
1108}
1109
1110 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
1111{
1112 vsetvlmax_e64m1();
1113 return v_float64x2(vle64_v_f64m1(tab + idx[0]));
1114}
1115
1116 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1117{
1118 double CV_DECL_ALIGNED(32) elems[2] =
1119 {
1120 tab[v_extract_n<0>(idxvec)],
1121 tab[v_extract_n<1>(idxvec)]
1122 };
1123 vsetvlmax_e64m1();
1124 return v_float64x2(vle64_v_f64m1(elems));
1125}
1126
1127 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1128{
1129 int CV_DECL_ALIGNED(32) idx[4] = {0};
1130 v_store_aligned(idx, idxvec);
1131
1132 x = v_float64x2(tab[idx[0]], tab[idx[1]]);
1133 y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
1134}
1135 #endif
1136
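The v_lut family gathers elements through an index table: v_lut reads one element per index, while v_lut_pairs/v_lut_quads read two/four consecutive elements per index. For example:

    short table[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    int   idx[8]   = { 7,  6,  5,  4,  3,  2,  1,  0};
    v_int16x8 g = v_lut(table, idx);   // {17, 16, 15, 14, 13, 12, 11, 10}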
1138
1139 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
1140{
1141 ushort CV_DECL_ALIGNED(32) ptr[16] = {0};
1142 v_store(ptr, a);
1143 v_store(ptr + 8, b);
1144 vsetvlmax_e8m1();
1145 return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(ptr), 0));
1146}
1147
1148 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
1149 const v_uint32x4& c, const v_uint32x4& d)
1150{
1151 unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
1152 v_store(ptr, a);
1153 v_store(ptr + 4, b);
1154 v_store(ptr + 8, c);
1155 v_store(ptr + 12, d);
1156 vsetvlmax_e8m1();
1157 return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(ptr), 0), 0));
1158}
1159
1160 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
1161 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
1162 const v_uint64x2& g, const v_uint64x2& h)
1163{
1164 uint64 CV_DECL_ALIGNED(32) ptr[16] = {0};
1165 v_store(ptr, a);
1166 v_store(ptr + 2, b);
1167 v_store(ptr + 4, c);
1168 v_store(ptr + 6, d);
1169 v_store(ptr + 8, e);
1170 v_store(ptr + 10, f);
1171 v_store(ptr + 12, g);
1172 v_store(ptr + 14, h);
1173 vsetvlmax_e8m1();
1174 return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(ptr), 0), 0), 0));
1175}
1176
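v_pack_b narrows wider unsigned lanes down to bytes by keeping the low byte of each lane (the vnsrl-by-0 above). For example:

    v_uint16x8 a = v_setall_u16(1), b = v_setall_u16(0);
    v_uint8x16 packed = v_pack_b(a, b);   // {1 x8, 0 x8}: low bytes of a, then of b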
1178 #define OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, width) \
1179 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
1180 { \
1181 vsetvlmax_e##width##m1(); \
1182 return _Tpvec(intrin(a, b)); \
1183 } \
1184 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
1185 { \
1186 vsetvlmax_e##width##m1(); \
1187 a = _Tpvec(intrin(a, b)); \
1188 return a; \
1189 }
1190
1191OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 8)
1192OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 8)
1193OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 8)
1194OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 8)
1195OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 8)
1196OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 8)
1197OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 16)
1198OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 16)
1199OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 16)
1200OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 16)
1201OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 16)
1202OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 16)
1203OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 32)
1204OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 32)
1205OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 32)
1206OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 32)
1207OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 32)
1208OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 32)
1209OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 32)
1210OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 32)
1211OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 32)
1212OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 32)
1213OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 32)
1214OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 32)
1215OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 64)
1216OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 64)
1217OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 64)
1218OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 64)
1219OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 64)
1220OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 64)
1221OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 64)
1222OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 64)
1223 #if CV_SIMD128_64F
1224OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 64)
1225OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 64)
1226OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 64)
1227OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 64)
1228 #endif
1229
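The operators generated above make the wrapper types usable in ordinary expressions. A small sketch, assuming the same include/namespace as the first sketch:

    void scale_and_bias(const float* a, const float* b, float* dst)
    {
        v_float32x4 va = v_load(a), vb = v_load(b);
        v_store(dst, va * vb + v_setall_f32(0.5f));  // processes 4 lanes at a time
    }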
1230
1232
1233 #define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, width) \
1234 OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, width) \
1235 OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, width) \
1236 OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, width) \
1237 inline _Tpvec operator ~ (const _Tpvec& a) \
1238 { \
1239 vsetvlmax_e##width##m1(); \
1240 return _Tpvec(vnot_v_##suffix##m1(a)); \
1241 }
1242
1243OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 8)
1244OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 8)
1245OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 16)
1246OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, i16, 16)
1247OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 32)
1248OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 32)
1249OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 64)
1250OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 64)
1251
1252 #define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(bin_op, intrin) \
1253 inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
1254 { \
1255 vsetvlmax_e32m1(); \
1256 return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b)))); \
1257 } \
1258 inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
1259 { \
1260 vsetvlmax_e32m1(); \
1261 a = v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b)))); \
1262 return a; \
1263 }
1264
1265OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(&, vand_vv_i32m1)
1266OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(|, vor_vv_i32m1)
1267OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(^, vxor_vv_i32m1)
1268
1269 inline v_float32x4 operator ~ (const v_float32x4& a)
1270{
1271 vsetvlmax_e32m1();
1272 return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a))));
1273}
1274
1275 #if CV_SIMD128_64F
1276 #define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(bin_op, intrin) \
1277 inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
1278 { \
1279 vsetvlmax_e64m1(); \
1280 return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b)))); \
1281 } \
1282 inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
1283 { \
1284 vsetvlmax_e64m1(); \
1285 a = v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b)))); \
1286 return a; \
1287 }
1288
1289OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(&, vand_vv_i64m1)
1290OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(|, vor_vv_i64m1)
1291OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(^, vxor_vv_i64m1)
1292
1293 inline v_float64x2 operator ~ (const v_float64x2& a)
1294{
1295 vsetvlmax_e64m1();
1296 return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a))));
1297}
1298 #endif
1299
1301
1302 #define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, width) \
1303 inline _Tpvec operator << (const _Tpvec& a, int n) \
1304 { \
1305 vsetvlmax_e##width##m1(); \
1306 return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
1307 } \
1308 inline _Tpvec operator >> (const _Tpvec& a, int n) \
1309 { \
1310 vsetvlmax_e##width##m1(); \
1311 return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n))); \
1312 } \
1313 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1314 { \
1315 vsetvlmax_e##width##m1(); \
1316 return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
1317 } \
1318 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1319 { \
1320 vsetvlmax_e##width##m1(); \
1321 return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n))); \
1322 }
1323
1324 #define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, width) \
1325 inline _Tpvec operator << (const _Tpvec& a, int n) \
1326 { \
1327 vsetvlmax_e##width##m1(); \
1328 return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
1329 } \
1330 inline _Tpvec operator >> (const _Tpvec& a, int n) \
1331 { \
1332 vsetvlmax_e##width##m1(); \
1333 return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n))); \
1334 } \
1335 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1336 { \
1337 vsetvlmax_e##width##m1(); \
1338 return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
1339 } \
1340 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1341 { \
1342 vsetvlmax_e##width##m1(); \
1343 return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n))); \
1344 }
1345
1346OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 8)
1347OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 16)
1348OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 32)
1349OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 64)
1350OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 8)
1351OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 16)
1352OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 32)
1353OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 64)
1354
1355
1357
1358 #define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, width) \
1359 inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
1360 { \
1361 vsetvlmax_e##width##m1(); \
1362 return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b), vzero_##suffix##m1(), 1)); \
1363 }
1364
1365 #define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, width) \
1366 inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
1367 { \
1368 vsetvlmax_e##width##m1(); \
1369 return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b), vzero_##suffix##m1(), 1)); \
1370 }
1371
1372 #define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width) \
1373 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, width) \
1374 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, width) \
1375 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, width) \
1376 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, width) \
1377 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, width) \
1378 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, width)
1379
1380 #define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width) \
1381 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, width) \
1382 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, width) \
1383 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, width) \
1384 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, width) \
1385 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, width) \
1386 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, width)
1387
1388 #define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width) \
1389 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, width) \
1390 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, width) \
1391 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, width) \
1392 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, width) \
1393 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, width) \
1394 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, width)
1395
1396
1397OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8)
1398OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16)
1399OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32)
1400OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64)
1401OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8)
1402OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16)
1403OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32)
1404OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64)
1405OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32)
1406 #if CV_SIMD128_64F
1407OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64)
1408 #endif
1409
1410 inline v_float32x4 v_not_nan(const v_float32x4& a)
1411{ return a == a; }
1412
1413 #if CV_SIMD128_64F
1414 inline v_float64x2 v_not_nan(const v_float64x2& a)
1415{ return a == a; }
1416 #endif
1417
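Comparison operators return a vector of the same type, with non-zero lanes where the predicate holds and zero lanes elsewhere (the vmerge with 1 above); v_not_nan reports which lanes are not NaN. For example:

    v_float32x4 a(1.f, 5.f, 3.f, 7.f);
    v_float32x4 gt = a > v_setall_f32(4.f);  // non-zero in lanes 1 and 3
    v_float32x4 ok = v_not_nan(a);           // non-zero in every lane here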
1419
1420 #define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, width) \
1421 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
1422 { \
1423 vsetvlmax_e##width##m1(); \
1424 return _Tpvec(intrin(a, b)); \
1425 }
1426
1427OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 8)
1428OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 8)
1429OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 8)
1430OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 8)
1431OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 16)
1432OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 16)
1433OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 16)
1434OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 16)
1435OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 32)
1436OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 32)
1437OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 32)
1438OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 32)
1439OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 32)
1440OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 32)
1441OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 64)
1442OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 64)
1443OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 64)
1444OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 64)
1445 #if CV_SIMD128_64F
1446OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 64)
1447OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 64)
1448 #endif
1449
1451
1452OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 8)
1453OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 8)
1454OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 16)
1455OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 16)
1456OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 8)
1457OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 8)
1458OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 16)
1459OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 16)
1460OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 8)
1461OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 8)
1462OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 16)
1463OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 16)
1464
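For 8- and 16-bit lanes the + and - operators above use saturating instructions, while the v_add_wrap/v_sub_wrap/v_mul_wrap variants wrap modulo 2^width. For example:

    v_uint8x16 x = v_setall_u8(200), y = v_setall_u8(100);
    v_uint8x16 sat  = x + y;             // vsaddu: every lane saturates to 255
    v_uint8x16 wrap = v_add_wrap(x, y);  // vadd: every lane wraps to 44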
1466
1467 #define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, wwidth, red) \
1468 inline scalartype v_reduce_sum(const _Tpvec& a) \
1469 { \
1470 vsetvlmax_e##wwidth##m1(); \
1471 _nwTpvec zero = vzero_##wsuffix##m1(); \
1472 _nwTpvec res = vzero_##wsuffix##m1(); \
1473 res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero); \
1474 return (scalartype)(_wTpvec(res).get0()); \
1475 }
1476
1477OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8x16, v_uint16x8, vuint16m1_t, unsigned, u8, u16, 16, wredsumu)
1478OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8x16, v_int16x8, vint16m1_t, int, i8, i16, 16, wredsum)
1479OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 32, wredsumu)
1480OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 32, wredsum)
1481OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 64, wredsumu)
1482OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 64, wredsum)
1483OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_float32x4, v_float32x4, vfloat32m1_t, float, f32, f32, 32, fredsum)
1484OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64x2, v_uint64x2, vuint64m1_t, uint64, u64, u64, 64, redsum)
1485OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64x2, v_int64x2, vint64m1_t, int64, i64, i64, 64, redsum)
1486 #if CV_SIMD128_64F
1487OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_float64x2, v_float64x2, vfloat64m1_t, double, f64, f64, 64, fredsum)
1488 #endif
1489
1490
1491 #define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, width, red) \
1492 inline scalartype v_reduce_##func(const _Tpvec& a) \
1493 { \
1494 vsetvlmax_e##width##m1(); \
1495 _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a)); \
1496 return scalartype(res.get0()); \
1497 }
1498
1499OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, min, uchar, u8, 8, redminu)
1500OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, min, schar, i8, 8, redmin)
1501OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, min, ushort, u16, 16, redminu)
1502OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 16, redmin)
1503OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 32, redminu)
1504OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 32, redmin)
1505OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 32, fredmin)
1506OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, max, uchar, u8, 8, redmaxu)
1507OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, max, schar, i8, 8, redmax)
1508OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, max, ushort, u16, 16, redmaxu)
1509OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 16, redmax)
1510OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 32, redmaxu)
1511OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 32, redmax)
1512OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 32, fredmax)
1513
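The reductions above collapse a whole register into a single scalar. For example:

    v_int32x4 v(1, 5, 3, 7);
    int total = v_reduce_sum(v);   // 16
    int lo    = v_reduce_min(v);   // 1
    int hi    = v_reduce_max(v);   // 7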
1514
1515 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1516 const v_float32x4& c, const v_float32x4& d)
1517{
1518 float CV_DECL_ALIGNED(32) elems[4] =
1519 {
1520 v_reduce_sum(a),
1521 v_reduce_sum(b),
1522 v_reduce_sum(c),
1523 v_reduce_sum(d)
1524 };
1525 vsetvlmax_e32m1();
1526 return v_float32x4(vle32_v_f32m1(elems));
1527}
1528
1530
1531 inline v_float32x4 v_sqrt(const v_float32x4& x)
1532{
1533 vsetvlmax_e32m1();
1534 return v_float32x4(vfsqrt_v_f32m1(x));
1535}
1536
1537 inline v_float32x4 v_invsqrt(const v_float32x4& x)
1538{
1539 v_float32x4 one = v_setall_f32(1.0f);
1540 return one / v_sqrt(x);
1541}
1542
1543 #if CV_SIMD128_64F
1544 inline v_float64x2 v_sqrt(const v_float64x2& x)
1545{
1546 vsetvlmax_e64m1();
1547 return v_float64x2(vfsqrt_v_f64m1(x));
1548}
1549
1550 inline v_float64x2 v_invsqrt(const v_float64x2& x)
1551{
1552 v_float64x2 one = v_setall_f64(1.0f);
1553 return one / v_sqrt(x);
1554}
1555 #endif
1556
1557 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
1558{
1559 vsetvlmax_e32m1();
1560 v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a), b, b));
1561 return v_sqrt(x);
1562}
1563
1564 inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
1565{
1566 vsetvlmax_e32m1();
1567 return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a), b, b));
1568}
1569
1570 #if CV_SIMD128_64F
1571 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
1572{
1573 vsetvlmax_e64m1();
1574 v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a), b, b));
1575 return v_sqrt(x);
1576}
1577
1578 inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
1579{
1580 vsetvlmax_e64m1();
1581 return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a), b, b));
1582}
1583 #endif
1584
1586
1587 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1588{
1589 vsetvlmax_e32m1();
1590 return v_float32x4(vfmacc_vv_f32m1(c, a, b));
1591}
1592 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1593{
1594 vsetvlmax_e32m1();
1595 return v_int32x4(vmacc_vv_i32m1(c, a, b));
1596}
1597
1598 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1599{
1600 return v_fma(a, b, c);
1601}
1602
1603 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1604{
1605 return v_fma(a, b, c);
1606}
1607
1608 #if CV_SIMD128_64F
1609 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1610{
1611 vsetvlmax_e64m1();
1612 return v_float64x2(vfmacc_vv_f64m1(c, a, b));
1613}
1614
1615 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1616{
1617 return v_fma(a, b, c);
1618}
1619 #endif
1620
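v_fma computes a*b + c per lane (vfmacc/vmacc above), and v_muladd is simply an alias for it in this backend. For example:

    v_float32x4 a = v_setall_f32(2.f), b = v_setall_f32(3.f), c = v_setall_f32(1.f);
    v_float32x4 r = v_fma(a, b, c);      // 7.f in every lane
    v_float32x4 s = v_muladd(a, b, c);   // same result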
1622
1623 #define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, width) \
1624 inline bool v_check_all(const _Tpvec& a) \
1625 { \
1626 vsetvlmax_e##width##m1(); \
1627 v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a), shift)); \
1628 return (v.val[0] | v.val[1]) == 0; \
1629 } \
1630 inline bool v_check_any(const _Tpvec& a) \
1631 { \
1632 vsetvlmax_e##width##m1(); \
1633 v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(a, shift)); \
1634 return (v.val[0] | v.val[1]) != 0; \
1635 }
1636
1637OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 8)
1638OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 16)
1639OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 32)
1640OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 64)
1641
1642
1643inline bool v_check_all(const v_int8x16& a)
1644{ return v_check_all(v_reinterpret_as_u8(a)); }
1645 inline bool v_check_any(const v_int8x16& a)
1646{ return v_check_any(v_reinterpret_as_u8(a)); }
1647
1648 inline bool v_check_all(const v_int16x8& a)
1649{ return v_check_all(v_reinterpret_as_u16(a)); }
1650 inline bool v_check_any(const v_int16x8& a)
1651{ return v_check_any(v_reinterpret_as_u16(a)); }
1652
1653 inline bool v_check_all(const v_int32x4& a)
1654{ return v_check_all(v_reinterpret_as_u32(a)); }
1655 inline bool v_check_any(const v_int32x4& a)
1656{ return v_check_any(v_reinterpret_as_u32(a)); }
1657
1658 inline bool v_check_all(const v_float32x4& a)
1659{ return v_check_all(v_reinterpret_as_u32(a)); }
1660 inline bool v_check_any(const v_float32x4& a)
1661{ return v_check_any(v_reinterpret_as_u32(a)); }
1662
1663 inline bool v_check_all(const v_int64x2& a)
1664{ return v_check_all(v_reinterpret_as_u64(a)); }
1665 inline bool v_check_any(const v_int64x2& a)
1666{ return v_check_any(v_reinterpret_as_u64(a)); }
1667
1668 #if CV_SIMD128_64F
1669 inline bool v_check_all(const v_float64x2& a)
1670{ return v_check_all(v_reinterpret_as_u64(a)); }
1671 inline bool v_check_any(const v_float64x2& a)
1672{ return v_check_any(v_reinterpret_as_u64(a)); }
1673 #endif
1674
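v_check_all and v_check_any test the most significant bit of each lane (the sign bit of mask vectors). A small sketch:

    v_uint8x16 m = v_setall_u8(0x80);        // MSB set in every lane
    bool all = v_check_all(m);               // true
    bool any = v_check_any(v_setzero_u8());  // false: no lane has its MSB set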
1676
1677 #define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \
1678 inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
1679 { \
1680 return v_max(a, b) - v_min(a, b); \
1681 }
1682
1683OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8x16, absdiff)
1684OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16x8, absdiff)
1685OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32x4, absdiff)
1686OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32x4, absdiff)
1687 #if CV_SIMD128_64F
1688OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64x2, absdiff)
1689 #endif
1690OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs)
1691OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs)
1692
1693 #define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, width) \
1694 inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
1695 { \
1696 vsetvlmax_e##width##m1(); \
1697 return _rTpvec(rshr((_nwTpvec)sub(v_max(a, b), v_min(a, b)), 0)); \
1698 }
1699
1700OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 8)
1701OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 16)
1702OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 32)
1703
1704 #define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
1705 inline _Tprvec v_abs(const _Tpvec& a) \
1706 { \
1707 return v_absdiff(a, v_setzero_##suffix()); \
1708 }
1709
1710OPENCV_HAL_IMPL_RVV_ABS(v_uint8x16, v_int8x16, s8)
1711OPENCV_HAL_IMPL_RVV_ABS(v_uint16x8, v_int16x8, s16)
1712OPENCV_HAL_IMPL_RVV_ABS(v_uint32x4, v_int32x4, s32)
1713OPENCV_HAL_IMPL_RVV_ABS(v_float32x4, v_float32x4, f32)
1714 #if CV_SIMD128_64F
1715OPENCV_HAL_IMPL_RVV_ABS(v_float64x2, v_float64x2, f64)
1716 #endif
1717
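v_absdiff computes |a - b| per lane as max(a, b) - min(a, b) and returns an unsigned vector for integer inputs; v_absdiffs is the saturating signed variant and v_abs is absdiff against zero. A small illustrative helper:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: per-lane |a - b| for 16 unsigned bytes, no overflow possible.
inline cv::v_uint8x16 byte_absdiff(const uchar* a, const uchar* b)
{
    return cv::v_absdiff(cv::v_load(a), cv::v_load(b));
}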
1718
1719 #define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \
1720 inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
1721 { \
1722 return v_reduce_sum(v_absdiff(a, b)); \
1723 }
1724
1725OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8x16, unsigned)
1726OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8x16, unsigned)
1727OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16x8, unsigned)
1728OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16x8, unsigned)
1729OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32x4, unsigned)
1730OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32x4, unsigned)
1731OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32x4, float)
1732
1733
1734
1735 #define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, width) \
1736 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1737 { \
1738 vsetvlmax_e##width##m1(); \
1739 return _Tpvec(merge(ne(mask, 0), b, a)); \
1740 }
1741
1742OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 8)
1743OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 8)
1744OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 16)
1745OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 16)
1746OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 32)
1747OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 32)
1748OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 32)
1749 #if CV_SIMD128_64F
1750OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 64)
1751 #endif
1752
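v_select picks lanes from its second argument where the mask is set and from the third otherwise. An illustrative per-lane clamp (helper name relu4 is not part of the header):

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: keep x[i] where x[i] > 0, otherwise take 0.
inline cv::v_float32x4 relu4(const float* x)
{
    cv::v_float32x4 vx = cv::v_load(x);
    cv::v_float32x4 zero = cv::v_setzero_f32();
    return cv::v_select(vx > zero, vx, zero);  // mask true -> take vx, else zero
}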
1754
1755 #define OPENCV_HAL_IMPL_RVV_ROTATE_OP(_Tpvec, suffix, width) \
1756 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1757 { \
1758 vsetvlmax_e##width##m1(); \
1759 return _Tpvec(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, n)); \
1760 } \
1761 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1762 { \
1763 vsetvlmax_e##width##m1(); \
1764 return _Tpvec(vslideup_vx_##suffix##m1(vzero_##suffix##m1(), a, n)); \
1765 } \
1766 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
1767 { return a; } \
1768 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1769 { \
1770 vsetvlmax_e##width##m1(); \
1771 return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, n), b, _Tpvec::nlanes - n)); \
1772 } \
1773 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1774 { \
1775 vsetvlmax_e##width##m1(); \
1776 return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), b, _Tpvec::nlanes - n), a, n)); \
1777 } \
1778 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
1779 { CV_UNUSED(b); return a; }
1780
1781
1782OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint8x16, u8, 8)
1783OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int8x16, i8, 8)
1784OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint16x8, u16, 16)
1785OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int16x8, i16, 16)
1786OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint32x4, u32, 32)
1787OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int32x4, i32, 32)
1788OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_float32x4, f32, 32)
1789OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint64x2, u64, 64)
1790OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int64x2, i64, 64)
1791 #if CV_SIMD128_64F
1792OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_float64x2, f64, 64)
1793 #endif
1794
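The rotate operations shift lanes by a compile-time count; the two-vector forms pull the vacated lanes from a second register, which is the usual building block for sliding windows. Illustrative sketch:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: single- and two-vector lane shifts on 4 ints.
inline void rotate_demo(const int* p, int* out)
{
    cv::v_int32x4 a = cv::v_load(p);       // a0 a1 a2 a3
    cv::v_int32x4 b = cv::v_load(p + 4);   // b0 b1 b2 b3
    cv::v_store(out,     cv::v_rotate_right<1>(a));     // a1 a2 a3 0
    cv::v_store(out + 4, cv::v_rotate_right<1>(a, b));  // a1 a2 a3 b0
}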
1796
1797 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
1798{
1799 vsetvlmax_e32m1();
1800 return v_float32x4(vfcvt_f_x_v_f32m1(a));
1801}
1802
1803 #if CV_SIMD128_64F
1804 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
1805{
1806 double arr[4] = {a.val[0], a.val[1], 0, 0};
1807 vsetvlmax_e64m2();
1808 vfloat64m2_t tmp = vle64_v_f64m2(arr);
1809 vsetvlmax_e32m1();
1810 return v_float32x4(vfncvt_f_f_w_f32m1(tmp));
1811}
1812
1813 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
1814{
1815 double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
1816 vsetvlmax_e64m2();
1817 vfloat64m2_t tmp = vle64_v_f64m2(arr);
1818 vsetvlmax_e32m1();
1819 return v_float32x4(vfncvt_f_f_w_f32m1(tmp));
1820}
1821
1822 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
1823{
1824 double CV_DECL_ALIGNED(32) ptr[4] = {0};
1825 vsetvlmax_e64m2();
1826 vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a));
1827 double CV_DECL_ALIGNED(32) elems[2] =
1828 {
1829 ptr[0], ptr[1]
1830 };
1831 vsetvlmax_e64m1();
1832 return v_float64x2(vle64_v_f64m1(elems));
1833}
1834
1835 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
1836{
1837 double CV_DECL_ALIGNED(32) ptr[4] = {0};
1838 vsetvlmax_e64m2();
1839 vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a));
1840 double CV_DECL_ALIGNED(32) elems[2] =
1841 {
1842 ptr[2], ptr[3]
1843 };
1844 vsetvlmax_e64m1();
1845 return v_float64x2(vle64_v_f64m1(elems));
1846}
1847
1848 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
1849{
1850 double CV_DECL_ALIGNED(32) ptr[4] = {0};
1851 vsetvlmax_e64m2();
1852 vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a));
1853 double CV_DECL_ALIGNED(32) elems[2] =
1854 {
1855 ptr[0], ptr[1]
1856 };
1857 vsetvlmax_e64m1();
1858 return v_float64x2(vle64_v_f64m1(elems));
1859}
1860
1861 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
1862{
1863 double CV_DECL_ALIGNED(32) ptr[4] = {0};
1864 vsetvlmax_e64m2();
1865 vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a));
1866 double CV_DECL_ALIGNED(32) elems[2] =
1867 {
1868 ptr[2], ptr[3]
1869 };
1870 vsetvlmax_e64m1();
1871 return v_float64x2(vle64_v_f64m1(elems));
1872}
1873
1874 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
1875{
1876 vsetvlmax_e64m1();
1877 return v_float64x2(vfcvt_f_x_v_f64m1(a));
1878}
1879 #endif
1880
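These conversions move between int32/float32 lanes and, when CV_SIMD128_64F is set, between float64 halves of a vector. An illustrative round trip (helper name to_float is hypothetical):

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: convert 4 ints to 4 floats, optionally via double precision.
inline cv::v_float32x4 to_float(const int* p)
{
    cv::v_int32x4 vi = cv::v_load(p);
#if CV_SIMD128_64F
    cv::v_float64x2 lo = cv::v_cvt_f64(vi);       // lanes 0..1 as double
    cv::v_float64x2 hi = cv::v_cvt_f64_high(vi);  // lanes 2..3 as double
    return cv::v_cvt_f32(lo, hi);                 // narrow back to 4 floats
#else
    return cv::v_cvt_f32(vi);
#endif
}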
1882
1883 #define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
1884 template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) \
1885 { \
1886 return v_setall_##suffix(v_extract_n<i>(v)); \
1887 }
1888
1889OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint8x16, u8)
1890OPENCV_HAL_IMPL_RVV_BROADCAST(v_int8x16, s8)
1891OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint16x8, u16)
1892OPENCV_HAL_IMPL_RVV_BROADCAST(v_int16x8, s16)
1893OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32x4, u32)
1894OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32x4, s32)
1895OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint64x2, u64)
1896OPENCV_HAL_IMPL_RVV_BROADCAST(v_int64x2, s64)
1897OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32x4, f32)
1898 #if CV_SIMD128_64F
1899OPENCV_HAL_IMPL_RVV_BROADCAST(v_float64x2, f64)
1900 #endif
1901
1903
1904 #define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, _Tp, suffix) \
1905 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
1906 const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
1907 v_##_Tpvec& b0, v_##_Tpvec& b1, \
1908 v_##_Tpvec& b2, v_##_Tpvec& b3) \
1909 { \
1910 _Tp CV_DECL_ALIGNED(32) elems0[4] = \
1911 { \
1912 v_extract_n<0>(a0), \
1913 v_extract_n<0>(a1), \
1914 v_extract_n<0>(a2), \
1915 v_extract_n<0>(a3) \
1916 }; \
1917 b0 = v_load(elems0); \
1918 _Tp CV_DECL_ALIGNED(32) elems1[4] = \
1919 { \
1920 v_extract_n<1>(a0), \
1921 v_extract_n<1>(a1), \
1922 v_extract_n<1>(a2), \
1923 v_extract_n<1>(a3) \
1924 }; \
1925 b1 = v_load(elems1); \
1926 _Tp CV_DECL_ALIGNED(32) elems2[4] = \
1927 { \
1928 v_extract_n<2>(a0), \
1929 v_extract_n<2>(a1), \
1930 v_extract_n<2>(a2), \
1931 v_extract_n<2>(a3) \
1932 }; \
1933 b2 = v_load(elems2); \
1934 _Tp CV_DECL_ALIGNED(32) elems3[4] = \
1935 { \
1936 v_extract_n<3>(a0), \
1937 v_extract_n<3>(a1), \
1938 v_extract_n<3>(a2), \
1939 v_extract_n<3>(a3) \
1940 }; \
1941 b3 = v_load(elems3); \
1942 }
1943
1944OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(uint32x4, unsigned, u32)
1945OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(int32x4, int, i32)
1946OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32)
1947
1948
1949
1950 #define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, width, suffix) \
1951 inline _Tpvec v_reverse(const _Tpvec& a) \
1952 { \
1953 _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
1954 _Tp CV_DECL_ALIGNED(32) ptra[_Tpvec::nlanes] = {0}; \
1955 v_store(ptra, a); \
1956 for (int i = 0; i < _Tpvec::nlanes; i++) \
1957 { \
1958 ptr[i] = ptra[_Tpvec::nlanes-i-1]; \
1959 } \
1960 return v_load(ptr); \
1961 }
1962
1963OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8x16, uchar, 8, u8)
1964OPENCV_HAL_IMPL_RVV_REVERSE(v_int8x16, schar, 8, i8)
1965OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16x8, ushort, 16, u16)
1966OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, 16, i16)
1967OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, 32, u32)
1968OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, 32, i32)
1969OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, 32, f32)
1970OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64x2, uint64, 64, u64)
1971OPENCV_HAL_IMPL_RVV_REVERSE(v_int64x2, int64, 64, i64)
1972 #if CV_SIMD128_64F
1973OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, 64, f64)
1974 #endif
1975
1977
1978 #define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt) \
1979 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1980 { \
1981 _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
1982 _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
1983 v_store_low(lptr, a); \
1984 v_store_high(hptr, a); \
1985 b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr))); \
1986 b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr))); \
1987 } \
1988 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1989 { \
1990 _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
1991 v_store_low(lptr, a); \
1992 return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr))); \
1993 } \
1994 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1995 { \
1996 _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
1997 v_store_high(hptr, a); \
1998 return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr))); \
1999 } \
2000 inline _Tpwvec v_load_expand(const _Tp* ptr) \
2001 { \
2002 return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr))); \
2003 }
2004
2005OPENCV_HAL_IMPL_RVV_EXPAND(v_uint16x8, uchar, v_uint8x16, 8, u8, vwcvtu_x_x_v_u16m1)
2006OPENCV_HAL_IMPL_RVV_EXPAND(v_int16x8, schar, v_int8x16, 8, i8, vwcvt_x_x_v_i16m1)
2007OPENCV_HAL_IMPL_RVV_EXPAND(v_uint32x4, ushort, v_uint16x8, 16, u16, vwcvtu_x_x_v_u32m1)
2008OPENCV_HAL_IMPL_RVV_EXPAND(v_int32x4, short, v_int16x8, 16, i16, vwcvt_x_x_v_i32m1)
2009OPENCV_HAL_IMPL_RVV_EXPAND(v_uint64x2, uint, v_uint32x4, 32, u32, vwcvtu_x_x_v_u64m1)
2010OPENCV_HAL_IMPL_RVV_EXPAND(v_int64x2, int, v_int32x4, 32, i32, vwcvt_x_x_v_i64m1)
2011
2012 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
2013{
2014 vsetvlmax_e32m1();
2015 return v_uint32x4(vwcvtu_x_x_v_u32m1(vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(ptr))));
2016}
2017
2018 inline v_int32x4 v_load_expand_q(const schar* ptr)
2019{
2020 vsetvlmax_e32m1();
2021 return v_int32x4(vwcvt_x_x_v_i32m1(vwcvt_x_x_v_i16mf2(vle8_v_i8mf4(ptr))));
2022}
2023
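The expand family widens narrow lanes before arithmetic that would overflow, and v_load_expand_q widens by a factor of four directly from memory. Illustrative helper (name widen_demo is not part of the header):

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: widen 8-bit pixels to 16-bit halves and to 32-bit quads.
inline void widen_demo(const uchar* p, ushort* lo16, ushort* hi16, unsigned* q32)
{
    cv::v_uint16x8 lo, hi;
    cv::v_expand(cv::v_load(p), lo, hi);        // 16 x u8 -> two vectors of 8 x u16
    cv::v_store(lo16, lo);
    cv::v_store(hi16, hi);
    cv::v_store(q32, cv::v_load_expand_q(p));   // first 4 bytes widened straight to u32
}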
2024
2025 #define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, shr) \
2026 inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
2027 { \
2028 _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2029 v_store(arr, a); \
2030 v_store(arr + _wTpvec::nlanes, b); \
2031 vsetvlmax_e##width##m2(); \
2032 return _Tpvec(shr(vle##width##_v_##suffix##m2(arr), 0)); \
2033 } \
2034 inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
2035 { \
2036 _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2037 v_store(arr, a); \
2038 v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
2039 vsetvlmax_e##width##m2(); \
2040 v_store(ptr, _Tpvec(shr(vle##width##_v_##suffix##m2(arr), 0))); \
2041 } \
2042 template<int n> inline \
2043 _Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
2044 { \
2045 _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2046 v_store(arr, a); \
2047 v_store(arr + _wTpvec::nlanes, b); \
2048 vsetvlmax_e##width##m2(); \
2049 return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr), n)); \
2050 } \
2051 template<int n> inline \
2052 void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \
2053 { \
2054 _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2055 v_store(arr, a); \
2056 v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
2057 vsetvlmax_e##width##m2(); \
2058 v_store(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr), n))); \
2059 }
2060
2061OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 16, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1)
2062OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 16, i16, vnclip_wx_i8m1, vnclip_wx_i8m1)
2063OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 32, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1)
2064OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 32, i32, vnclip_wx_i16m1, vnclip_wx_i16m1)
2065OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 64, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1)
2066OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 64, i64, vnclip_wx_i32m1, vnsra_wx_i32m1)
2067
2068
2069 #define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, cast) \
2070 inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
2071 { \
2072 _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2073 v_store(arr, a); \
2074 v_store(arr + _wTpvec::nlanes, b); \
2075 vsetvlmax_e##width##m2(); \
2076 return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), 0)); \
2077 } \
2078 inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
2079 { \
2080 _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2081 v_store(arr, a); \
2082 v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
2083 vsetvlmax_e##width##m2(); \
2084 v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), 0))); \
2085 } \
2086 template<int n> inline \
2087 _Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
2088 { \
2089 _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2090 v_store(arr, a); \
2091 v_store(arr + _wTpvec::nlanes, b); \
2092 vsetvlmax_e##width##m2(); \
2093 return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), n)); \
2094 } \
2095 template<int n> inline \
2096 void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \
2097 { \
2098 _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2099 v_store(arr, a); \
2100 v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
2101 vsetvlmax_e##width##m2(); \
2102 v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), n))); \
2103 }
2104
2105OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 16, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2)
2106OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 32, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2)
2107
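The pack family narrows wide intermediates back down with saturation; the v_rshr_pack*<n> variants additionally apply a rounding right shift by n before narrowing. Illustrative helper:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: narrow 16 signed shorts to 16 unsigned bytes with saturation.
inline void narrow_demo(const short* p, uchar* dst)
{
    cv::v_int16x8 a = cv::v_load(p);
    cv::v_int16x8 b = cv::v_load(p + 8);
    cv::v_store(dst,      cv::v_pack_u(a, b));          // signed 16 -> unsigned 8, saturated
    cv::v_store(dst + 16, cv::v_rshr_pack_u<4>(a, b));  // (x + 8) >> 4, then saturate
}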
2108
2109 #define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, width, suffix) \
2110 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
2111 { \
2112 _Tp CV_DECL_ALIGNED(32) ptra0[v_##_Tpvec::nlanes] = {0}; \
2113 _Tp CV_DECL_ALIGNED(32) ptra1[v_##_Tpvec::nlanes] = {0}; \
2114 _Tp CV_DECL_ALIGNED(32) ptrb0[v_##_Tpvec::nlanes] = {0}; \
2115 _Tp CV_DECL_ALIGNED(32) ptrb1[v_##_Tpvec::nlanes] = {0}; \
2116 v_store(ptra0, a0); \
2117 v_store(ptra1, a1); \
2118 int i; \
2119 for( i = 0; i < v_##_Tpvec::nlanes/2; i++ ) \
2120 { \
2121 ptrb0[i*2] = ptra0[i]; \
2122 ptrb0[i*2+1] = ptra1[i]; \
2123 } \
2124 for( ; i < v_##_Tpvec::nlanes; i++ ) \
2125 { \
2126 ptrb1[i*2-v_##_Tpvec::nlanes] = ptra0[i]; \
2127 ptrb1[i*2-v_##_Tpvec::nlanes+1] = ptra1[i]; \
2128 } \
2129 b0 = v_load(ptrb0); \
2130 b1 = v_load(ptrb1); \
2131 } \
2132 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2133 { \
2134 _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \
2135 _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \
2136 v_store_low(ptra, a); \
2137 v_store_low(ptrb, b); \
2138 return v_load_halves(ptra, ptrb); \
2139 } \
2140 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2141 { \
2142 _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \
2143 _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \
2144 v_store_high(ptra, a); \
2145 v_store_high(ptrb, b); \
2146 return v_load_halves(ptra, ptrb); \
2147 } \
2148 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
2149 { \
2150 c = v_combine_low(a, b); \
2151 d = v_combine_high(a, b); \
2152 }
2153
2154OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, 8, u8)
2155OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, 8, i8)
2156OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, 16, u16)
2157OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, 16, i16)
2158OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, 32, u32)
2159OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, 32, i32)
2160OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, 32, f32)
2161 #if CV_SIMD128_64F
2162OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, 64, f64)
2163 #endif
2164
2165
2166 #define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width) \
2167 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
2168 { \
2169 _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2170 _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2171 int i, i2; \
2172 for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
2173 { \
2174 ptra[i] = ptr[i2]; \
2175 ptrb[i] = ptr[i2+1]; \
2176 } \
2177 a = v_load(ptra); \
2178 b = v_load(ptrb); \
2179 } \
2180 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
2181 { \
2182 _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2183 _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2184 _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
2185 int i, i3; \
2186 for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
2187 { \
2188 ptra[i] = ptr[i3]; \
2189 ptrb[i] = ptr[i3+1]; \
2190 ptrc[i] = ptr[i3+2]; \
2191 } \
2192 a = v_load(ptra); \
2193 b = v_load(ptrb); \
2194 c = v_load(ptrc); \
2195 } \
2196 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
2197 v_##_Tpvec& c, v_##_Tpvec& d) \
2198 { \
2199 _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2200 _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2201 _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
2202 _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
2203 int i, i4; \
2204 for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
2205 { \
2206 ptra[i] = ptr[i4]; \
2207 ptrb[i] = ptr[i4+1]; \
2208 ptrc[i] = ptr[i4+2]; \
2209 ptrd[i] = ptr[i4+3]; \
2210 } \
2211 a = v_load(ptra); \
2212 b = v_load(ptrb); \
2213 c = v_load(ptrc); \
2214 d = v_load(ptrd); \
2215 } \
2216 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2217 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
2218 { \
2219 int i, i2; \
2220 _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2221 _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2222 v_store(ptra, a); \
2223 v_store(ptrb, b); \
2224 for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
2225 { \
2226 ptr[i2] = ptra[i]; \
2227 ptr[i2+1] = ptrb[i]; \
2228 } \
2229 } \
2230 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2231 const v_##_Tpvec& c, hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
2232 { \
2233 int i, i3; \
2234 _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2235 _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2236 _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
2237 v_store(ptra, a); \
2238 v_store(ptrb, b); \
2239 v_store(ptrc, c); \
2240 for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
2241 { \
2242 ptr[i3] = ptra[i]; \
2243 ptr[i3+1] = ptrb[i]; \
2244 ptr[i3+2] = ptrc[i]; \
2245 } \
2246 } \
2247 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2248 const v_##_Tpvec& c, const v_##_Tpvec& d, \
2249 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED ) \
2250 { \
2251 int i, i4; \
2252 _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2253 _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2254 _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
2255 _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
2256 v_store(ptra, a); \
2257 v_store(ptrb, b); \
2258 v_store(ptrc, c); \
2259 v_store(ptrd, d); \
2260 for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
2261 { \
2262 ptr[i4] = ptra[i]; \
2263 ptr[i4+1] = ptrb[i]; \
2264 ptr[i4+2] = ptrc[i]; \
2265 ptr[i4+3] = ptrd[i]; \
2266 } \
2267 } \
2268 inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
2269 { \
2270 _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
2271 _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
2272 v_store(ptrvec, vec); \
2273 for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) \
2274 { \
2275 ptr[4*i ] = ptrvec[4*i ]; \
2276 ptr[4*i+1] = ptrvec[4*i+2]; \
2277 ptr[4*i+2] = ptrvec[4*i+1]; \
2278 ptr[4*i+3] = ptrvec[4*i+3]; \
2279 } \
2280 return v_load(ptr); \
2281 } \
2282 inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \
2283 { \
2284 _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
2285 _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
2286 v_store(ptrvec, vec); \
2287 for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) \
2288 { \
2289 ptr[8*i  ] = ptrvec[8*i  ]; \
2290 ptr[8*i+1] = ptrvec[8*i+4]; \
2291 ptr[8*i+2] = ptrvec[8*i+1]; \
2292 ptr[8*i+3] = ptrvec[8*i+5]; \
2293 ptr[8*i+4] = ptrvec[8*i+2]; \
2294 ptr[8*i+5] = ptrvec[8*i+6]; \
2295 ptr[8*i+6] = ptrvec[8*i+3]; \
2296 ptr[8*i+7] = ptrvec[8*i+7]; \
2297 } \
2298 return v_load(ptr); \
2299 }
2300
2301OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar, u8, 8)
2302OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar, i8, 8)
2303OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort, u16, 16)
2304OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short, i16, 16)
2305OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned, u32, 32)
2306OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int, i32, 32)
2307OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float, f32, 32)
2308OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64, u64, 64)
2309OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64, i64, 64)
2310 #if CV_SIMD128_64F
2311OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, double, f64, 64)
2312 #endif
2313
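The (de)interleave functions convert between packed multi-channel memory and per-channel registers. Illustrative helper for 3-channel 8-bit data (name and the trivial channel swap are for demonstration only):

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: deinterleave 16 BGR pixels, process per plane, interleave back.
inline void split_and_merge_bgr(const uchar* bgr, uchar* dst)
{
    cv::v_uint8x16 b, g, r;
    cv::v_load_deinterleave(bgr, b, g, r);   // 48 interleaved bytes -> three 16-lane planes
    // per-plane processing would go here; as a trivial example, swap red and blue
    cv::v_store_interleave(dst, r, g, b);
}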
2315
2316 static const unsigned char popCountTable[] =
2317{
2318 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
2319 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2320 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2321 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2322 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2323 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2324 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2325 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2326 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2327 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2328 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2329 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2330 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2331 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2332 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2333 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
2334};
2335
2336 #define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \
2337 inline _rTpvec v_popcount(const _Tpvec& a) \
2338 { \
2339 uchar CV_DECL_ALIGNED(32) ptra[16] = {0}; \
2340 v_store(ptra, v_reinterpret_as_u8(a)); \
2341 _rTp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
2342 v_store(ptr, v_setzero_##suffix()); \
2343 for (int i = 0; i < _Tpvec::nlanes*(int)sizeof(_Tp); i++) \
2344 ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]; \
2345 return v_load(ptr); \
2346 }
2347
2348OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_uint8x16, uchar, uchar, u8)
2349OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_int8x16, uchar, schar, u8)
2350OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_uint16x8, ushort, ushort, u16)
2351OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_int16x8, ushort, short, u16)
2352OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_uint32x4, unsigned, unsigned, u32)
2353OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_int32x4, unsigned, int, u32)
2354OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_uint64x2, uint64, uint64, u64)
2355OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_int64x2, uint64, int64, u64)
2356
2357
2358
2359 #define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, width, shift) \
2360 inline int v_signmask(const _Tpvec& a) \
2361 { \
2362 int mask = 0; \
2363 vsetvlmax_e##width##m1(); \
2364 _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift)); \
2365 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
2366 mask |= (int)(tmp.val[i]) << i; \
2367 return mask; \
2368 }
2369
2370OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 8, 7)
2371OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 16, 15)
2372OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 32, 31)
2373OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 64, 63)
2374
2375inline int v_signmask(const v_int8x16& a)
2376{ return v_signmask(v_reinterpret_as_u8(a)); }
2377 inline int v_signmask(const v_int16x8& a)
2378{ return v_signmask(v_reinterpret_as_u16(a)); }
2379 inline int v_signmask(const v_int32x4& a)
2380{ return v_signmask(v_reinterpret_as_u32(a)); }
2381 inline int v_signmask(const v_float32x4& a)
2382{ return v_signmask(v_reinterpret_as_u32(a)); }
2383 inline int v_signmask(const v_int64x2& a)
2384{ return v_signmask(v_reinterpret_as_u64(a)); }
2385 #if CV_SIMD128_64F
2386 inline int v_signmask(const v_float64x2& a)
2387{ return v_signmask(v_reinterpret_as_u64(a)); }
2388 #endif
2389
2390
2392
2393 #define OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(_Tpvec, _Tp, suffix) \
2394 inline int v_scan_forward(const _Tpvec& a) \
2395 { \
2396 _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
2397 v_store(ptr, v_reinterpret_as_##suffix(a)); \
2398 for (int i = 0; i < _Tpvec::nlanes; i++) \
2399 if(int(ptr[i]) < 0) \
2400 return i; \
2401 return 0; \
2402 }
2403
2404OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint8x16, uchar, u8)
2405OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int8x16, schar, s8)
2406OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint16x8, ushort, u16)
2407OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int16x8, short, s16)
2408OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint32x4, unsigned, u32)
2409OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int32x4, int, s32)
2410OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float32x4, float, f32)
2411OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint64x2, uint64, u64)
2412OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int64x2, int64, s64)
2413 #if CV_SIMD128_64F
2414OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64)
2415 #endif
2416
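v_signmask collects the per-lane sign bits into an integer bitmask and v_scan_forward returns the index of the first set lane, so together they turn a vector comparison into scalar control flow. Illustrative helper (name first_below is hypothetical):

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: index of the first of 4 floats below thr, or -1 if none.
inline int first_below(const float* p, float thr)
{
    cv::v_float32x4 mask = cv::v_load(p) < cv::v_setall_f32(thr);
    int bits = cv::v_signmask(mask);        // bit i set <=> lane i matched
    return bits ? cv::v_scan_forward(mask)  // first matching lane
                : -1;
}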
2418
2419 #define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, _Tp) \
2420 inline _Tpvec v_pack_triplets(const _Tpvec& vec) \
2421 { \
2422 _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
2423 _Tp CV_DECL_ALIGNED(32) ptrvec[_Tpvec::nlanes] = {0}; \
2424 v_store(ptrvec, vec); \
2425 for (int i = 0; i < _Tpvec::nlanes/4; i++) \
2426 { \
2427 ptr[3*i ] = ptrvec[4*i ]; \
2428 ptr[3*i+1] = ptrvec[4*i+1]; \
2429 ptr[3*i+2] = ptrvec[4*i+2]; \
2430 } \
2431 return v_load(ptr); \
2432 }
2433
2434OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8x16, uchar)
2435OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8x16, schar)
2436OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16x8, ushort)
2437OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16x8, short)
2438OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32x4, unsigned)
2439OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32x4, int)
2440OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32x4, float)
2441
2442
2443
2444
2445 #if CV_FP16
2446 inline v_float32x4 v_load_expand(const float16_t* ptr)
2447{
2448 return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr)));
2449}
2450
2451 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
2452{
2453 vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v));
2454}
2455 #else
2456 inline v_float32x4 v_load_expand(const float16_t* ptr)
2457{
2458 const int N = 4;
2459 float buf[N];
2460 for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
2461 return v_load(buf);
2462}
2463
2464 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
2465{
2466 const int N = 4;
2467 float buf[N];
2468 v_store(buf, v);
2469 for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
2470}
2471 #endif
2472
2474
2475 inline v_int32x4 v_round(const v_float32x4& a)
2476{
2477 vsetvlmax_e32m1();
2478 return v_int32x4(vfcvt_x_f_v_i32m1(a));
2479}
2480
2481 inline v_int32x4 v_floor(const v_float32x4& a)
2482{
2483 v_float32x4 ZP5 = v_setall_f32(0.5f);
2484 v_float32x4 t = a - ZP5;
2485 vsetvlmax_e32m1();
2486 return v_int32x4(vfcvt_x_f_v_i32m1(t));
2487}
2488
2489 inline v_int32x4 v_ceil(const v_float32x4& a)
2490{
2491 v_float32x4 ZP5 = v_setall_f32(0.5f);
2492 v_float32x4 t = a + ZP5;
2493 vsetvlmax_e32m1();
2494 return v_int32x4(vfcvt_x_f_v_i32m1(t));
2495}
2496
2497 inline v_int32x4 v_trunc(const v_float32x4& a)
2498{
2499 vsetvlmax_e32m1();
2500 return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a));
2501}
2502 #if CV_SIMD128_64F
2503 inline v_int32x4 v_round(const v_float64x2& a)
2504{
2505 double arr[4] = {a.val[0], a.val[1], 0, 0};
2506 vsetvlmax_e64m2();
2507 vfloat64m2_t tmp = vle64_v_f64m2(arr);
2508 return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
2509}
2510
2511 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2512{
2513 double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
2514 vsetvlmax_e64m2();
2515 vfloat64m2_t tmp = vle64_v_f64m2(arr);
2516 return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
2517}
2518
2519 inline v_int32x4 v_floor(const v_float64x2& a)
2520{
2521 double arr[4] = {a.val[0]-0.5f, a.val[1]-0.5f, 0, 0};
2522 vsetvlmax_e64m2();
2523 vfloat64m2_t tmp = vle64_v_f64m2(arr);
2524 return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
2525}
2526
2527 inline v_int32x4 v_ceil(const v_float64x2& a)
2528{
2529 double arr[4] = {a.val[0]+0.5f, a.val[1]+0.5f, 0, 0};
2530 vsetvlmax_e64m2();
2531 vfloat64m2_t tmp = vle64_v_f64m2(arr);
2532 return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
2533}
2534
2535 inline v_int32x4 v_trunc(const v_float64x2& a)
2536{
2537 double arr[4] = {a.val[0], a.val[1], 0, 0};
2538 vsetvlmax_e64m2();
2539 vfloat64m2_t tmp = vle64_v_f64m2(arr);
2540 return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp));
2541}
2542 #endif
2543
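v_round/v_floor/v_ceil/v_trunc convert float lanes to int32 with the four rounding modes (note that this backend approximates floor and ceil by rounding a-0.5 and a+0.5 respectively). Illustrative helper; dst must hold 16 ints:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: the four float -> int32 rounding modes on one 4-float block.
inline void round_modes(const float* p, int* dst)
{
    cv::v_float32x4 v = cv::v_load(p);
    cv::v_store(dst,      cv::v_round(v));  // nearest
    cv::v_store(dst + 4,  cv::v_floor(v));
    cv::v_store(dst + 8,  cv::v_ceil(v));
    cv::v_store(dst + 12, cv::v_trunc(v));  // toward zero
}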
2544
2546
2547 // 16 >> 32
2548 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
2549{
2550 int CV_DECL_ALIGNED(32) ptr[8] = {0};
2551 v_int32x4 t1, t2;
2552 vsetvlmax_e32m2();
2553 vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
2554 v_load_deinterleave(ptr, t1, t2);
2555 return t1 + t2;
2556}
2557 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
2558{
2559 int CV_DECL_ALIGNED(32) ptr[8] = {0};
2560 v_int32x4 t1, t2;
2561 vsetvlmax_e32m2();
2562 vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
2563 v_load_deinterleave(ptr, t1, t2);
2564 return t1 + t2 + c;
2565}
2566
2567 // 32 >> 64
2568 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
2569{
2570 int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
2571 v_int64x2 t1, t2;
2572 vsetvlmax_e64m2();
2573 vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
2574 v_load_deinterleave(ptr, t1, t2);
2575 return t1 + t2;
2576}
2577 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
2578{
2579 int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
2580 v_int64x2 t1, t2;
2581 vsetvlmax_e64m2();
2582 vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
2583 v_load_deinterleave(ptr, t1, t2);
2584 return t1 + t2 + c;
2585}
2586
2587 // 8 >> 32
2588 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
2589{
2590 unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
2591 v_uint32x4 t1, t2, t3, t4;
2592 vsetvlmax_e32m4();
2593 vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
2594 v_load_deinterleave(ptr, t1, t2, t3, t4);
2595 return t1 + t2 + t3 + t4;
2596}
2597 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
2598 const v_uint32x4& c)
2599{
2600 unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
2601 v_uint32x4 t1, t2, t3, t4;
2602 vsetvlmax_e32m4();
2603 vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
2604 v_load_deinterleave(ptr, t1, t2, t3, t4);
2605 return t1 + t2 + t3 + t4 + c;
2606}
2607
2608 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
2609{
2610 int CV_DECL_ALIGNED(32) ptr[16] = {0};
2611 v_int32x4 t1, t2, t3, t4;
2612 vsetvlmax_e32m4();
2613 vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
2614 v_load_deinterleave(ptr, t1, t2, t3, t4);
2615 return t1 + t2 + t3 + t4;
2616}
2617 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
2618 const v_int32x4& c)
2619{
2620 int CV_DECL_ALIGNED(32) ptr[16] = {0};
2621 v_int32x4 t1, t2, t3, t4;
2622 vsetvlmax_e32m4();
2623 vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
2624 v_load_deinterleave(ptr, t1, t2, t3, t4);
2625 return t1 + t2 + t3 + t4 + c;
2626}
2627
2628 // 16 >> 64
2629 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
2630{
2631 uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2632 v_uint64x2 t1, t2, t3, t4;
2633 vsetvlmax_e64m4();
2634 vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
2635 v_load_deinterleave(ptr, t1, t2, t3, t4);
2636 return t1 + t2 + t3 + t4;
2637}
2638 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
2639{
2640 uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2641 v_uint64x2 t1, t2, t3, t4;
2642 vsetvlmax_e64m4();
2643 vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
2644 v_load_deinterleave(ptr, t1, t2, t3, t4);
2645 return t1 + t2 + t3 + t4 + c;
2646}
2647
2648 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
2649{
2650 int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2651 v_int64x2 t1, t2, t3, t4;
2652 vsetvlmax_e64m4();
2653 vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
2654 v_load_deinterleave(ptr, t1, t2, t3, t4);
2655 return t1 + t2 + t3 + t4;
2656}
2657 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
2658 const v_int64x2& c)
2659{
2660 int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2661 v_int64x2 t1, t2, t3, t4;
2662 vsetvlmax_e64m4();
2663 vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
2664 v_load_deinterleave(ptr, t1, t2, t3, t4);
2665 return t1 + t2 + t3 + t4 + c;
2666}
2667
2668 // 32 >> 64f
2669 #if CV_SIMD128_64F
2670 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
2671{ return v_cvt_f64(v_dotprod(a, b)); }
2672 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
2673 const v_float64x2& c)
2674{ return v_dotprod_expand(a, b) + c; }
2675 #endif
2676
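v_dotprod multiplies adjacent lane pairs and accumulates into double-width lanes (16 -> 32 and 32 -> 64 bits); the _expand variants widen by a factor of four. Illustrative helper (name dot8 is hypothetical):

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: dot product of 8 shorts with 32-bit accumulation.
inline int dot8(const short* a, const short* b)
{
    cv::v_int32x4 acc = cv::v_dotprod(cv::v_load(a), cv::v_load(b));
    return cv::v_reduce_sum(acc);   // horizontal sum of the four partial sums
}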
2678
2679 // 16 >> 32
2680 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
2681{
2682 int CV_DECL_ALIGNED(32) ptr[8] = {0};
2683 vsetvlmax_e32m2();
2684 vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
2685 v_int32x4 t1 = v_load(ptr);
2686 v_int32x4 t2 = v_load(ptr+4);
2687 return t1 + t2;
2688}
2689 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
2690{
2691 int CV_DECL_ALIGNED(32) ptr[8] = {0};
2692 vsetvlmax_e32m2();
2693 vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
2694 v_int32x4 t1 = v_load(ptr);
2695 v_int32x4 t2 = v_load(ptr+4);
2696 return t1 + t2 + c;
2697}
2698
2699 // 32 >> 64
2700 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
2701{
2702 int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
2703 vsetvlmax_e64m2();
2704 vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
2705 v_int64x2 t1 = v_load(ptr);
2706 v_int64x2 t2 = v_load(ptr+2);
2707 return t1 + t2;
2708}
2709 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
2710{
2711 int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
2712 vsetvlmax_e64m2();
2713 vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
2714 v_int64x2 t1 = v_load(ptr);
2715 v_int64x2 t2 = v_load(ptr+2);
2716 return t1 + t2 + c;
2717}
2718
2719
2720 // 8 >> 32
2721 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
2722{
2723 unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
2724 vsetvlmax_e32m4();
2725 vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
2726 v_uint32x4 t1 = v_load(ptr);
2727 v_uint32x4 t2 = v_load(ptr+4);
2728 v_uint32x4 t3 = v_load(ptr+8);
2729 v_uint32x4 t4 = v_load(ptr+12);
2730 return t1 + t2 + t3 + t4;
2731}
2732 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
2733{
2734 unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
2735 vsetvlmax_e32m4();
2736 vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
2737 v_uint32x4 t1 = v_load(ptr);
2738 v_uint32x4 t2 = v_load(ptr+4);
2739 v_uint32x4 t3 = v_load(ptr+8);
2740 v_uint32x4 t4 = v_load(ptr+12);
2741 return t1 + t2 + t3 + t4 + c;
2742}
2743 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
2744{
2745 int CV_DECL_ALIGNED(32) ptr[16] = {0};
2746 vsetvlmax_e32m4();
2747 vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
2748 v_int32x4 t1 = v_load(ptr);
2749 v_int32x4 t2 = v_load(ptr+4);
2750 v_int32x4 t3 = v_load(ptr+8);
2751 v_int32x4 t4 = v_load(ptr+12);
2752 return t1 + t2 + t3 + t4;
2753}
2754 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
2755{
2756 int CV_DECL_ALIGNED(32) ptr[16] = {0};
2757 vsetvlmax_e32m4();
2758 vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
2759 v_int32x4 t1 = v_load(ptr);
2760 v_int32x4 t2 = v_load(ptr+4);
2761 v_int32x4 t3 = v_load(ptr+8);
2762 v_int32x4 t4 = v_load(ptr+12);
2763 return t1 + t2 + t3 + t4 + c;
2764}
2765
2766 // 16 >> 64
2767 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
2768{
2769 uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2770 vsetvlmax_e64m4();
2771 vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
2772 v_uint64x2 t1 = v_load(ptr);
2773 v_uint64x2 t2 = v_load(ptr+2);
2774 v_uint64x2 t3 = v_load(ptr+4);
2775 v_uint64x2 t4 = v_load(ptr+6);
2776 return t1 + t2 + t3 + t4;
2777}
2778 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
2779{
2780 uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2781 vsetvlmax_e64m4();
2782 vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
2783 v_uint64x2 t1 = v_load(ptr);
2784 v_uint64x2 t2 = v_load(ptr+2);
2785 v_uint64x2 t3 = v_load(ptr+4);
2786 v_uint64x2 t4 = v_load(ptr+6);
2787 return t1 + t2 + t3 + t4 + c;
2788}
2789 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
2790{
2791 int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2792 vsetvlmax_e64m4();
2793 vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
2794 v_int64x2 t1 = v_load(ptr);
2795 v_int64x2 t2 = v_load(ptr+2);
2796 v_int64x2 t3 = v_load(ptr+4);
2797 v_int64x2 t4 = v_load(ptr+6);
2798 return t1 + t2 + t3 + t4;
2799}
2800 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
2801{
2802 int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2803 vsetvlmax_e64m4();
2804 vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
2805 v_int64x2 t1 = v_load(ptr);
2806 v_int64x2 t2 = v_load(ptr+2);
2807 v_int64x2 t3 = v_load(ptr+4);
2808 v_int64x2 t4 = v_load(ptr+6);
2809 return t1 + t2 + t3 + t4 + c;
2810}
2811
2812 // 32 >> 64f
2813 #if CV_SIMD128_64F
2814 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
2815{ return v_cvt_f64(v_dotprod_fast(a, b)); }
2816 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
2817{ return v_dotprod_expand_fast(a, b) + c; }
2818 #endif
2819
2820
2821 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
2822 const v_float32x4& m1, const v_float32x4& m2,
2823 const v_float32x4& m3)
2824{
2825 vsetvlmax_e32m1();
2826 vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v));
2827 res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1);
2828 res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2);
2829 res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3);
2830 return v_float32x4(res);
2831}
2832
2833 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
2834 const v_float32x4& m1, const v_float32x4& m2,
2835 const v_float32x4& a)
2836{
2837 vsetvlmax_e32m1();
2838 vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v));
2839 res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1);
2840 res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2);
2841 return v_float32x4(res) + a;
2842}
2843
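v_matmul treats m0..m3 as matrix columns and computes v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3; v_matmuladd replaces the last column by an additive vector. Illustrative helper (name transform4 is hypothetical; m is assumed column-major):

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: multiply a 4-vector by a 4x4 matrix stored as four columns.
inline cv::v_float32x4 transform4(const cv::v_float32x4& v, const float m[16])
{
    cv::v_float32x4 m0 = cv::v_load(m);
    cv::v_float32x4 m1 = cv::v_load(m + 4);
    cv::v_float32x4 m2 = cv::v_load(m + 8);
    cv::v_float32x4 m3 = cv::v_load(m + 12);
    return cv::v_matmul(v, m0, m1, m2, m3);
}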
2844 #define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width) \
2845 inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
2846 { \
2847 _Tpw CV_DECL_ALIGNED(32) ptr[_Tpwvec::nlanes*2] = {0}; \
2848 vsetvlmax_e##width##m2(); \
2849 vse##width##_v_##suffix##m2(ptr, wmul(a, b)); \
2850 vsetvlmax_e##width##m1(); \
2851 c = _Tpwvec(vle##width##_v_##suffix##m1(ptr)); \
2852 d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes)); \
2853 }
2854
2855OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8x16, v_uint16x8, ushort, u16, vwmulu_vv_u16m2, 16)
2856OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8x16, v_int16x8, short, i16, vwmul_vv_i16m2, 16)
2857OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16x8, v_uint32x4, unsigned, u32, vwmulu_vv_u32m2, 32)
2858OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16x8, v_int32x4, int, i32, vwmul_vv_i32m2, 32)
2859OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32x4, v_uint64x2, uint64, u64, vwmulu_vv_u64m2, 64)
2860
2861
2862inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
2863{
2864 vsetvlmax_e16m1();
2865 return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b), 16));
2866}
2867 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
2868{
2869 vsetvlmax_e16m1();
2870 return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b), 16));
2871}
2872
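v_mul_expand produces the full double-width products, while v_mul_hi keeps only the upper halves. Illustrative helper (name widening_mul is hypothetical):

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: full 16x16 -> 32-bit products plus the high 16-bit halves.
inline void widening_mul(const ushort* a, const ushort* b, unsigned* full, ushort* high)
{
    cv::v_uint16x8 va = cv::v_load(a), vb = cv::v_load(b);
    cv::v_uint32x4 lo, hi;
    cv::v_mul_expand(va, vb, lo, hi);         // 32-bit products of all 8 lanes
    cv::v_store(full, lo);
    cv::v_store(full + 4, hi);
    cv::v_store(high, cv::v_mul_hi(va, vb));  // only the upper 16 bits of each product
}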
2873
2875
2876 #define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _wTpvec) \
2877 inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
2878 { \
2879 _wTpvec c, d; \
2880 v_mul_expand(a, b, c, d); \
2881 return v_pack(c, d); \
2882 } \
2883 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
2884 { \
2885 a = a * b; \
2886 return a; \
2887 }
2888
2889OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8x16, v_uint16x8)
2890OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8x16, v_int16x8)
2891OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16x8, v_uint32x4)
2892OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16x8, v_int32x4)
2893
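For 8- and 16-bit vectors, operator* widens internally and packs back with saturation, so products clip instead of wrapping. Illustrative helper (name scale_bytes is hypothetical):

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: saturating per-lane doubling of 16 unsigned bytes (200*2 -> 255).
inline cv::v_uint8x16 scale_bytes(const uchar* p)
{
    return cv::v_load(p) * cv::v_setall_u8(2);
}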
2894
2895 inline void v_cleanup() {}
2896
2897CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2898
2899
2900}
2901
2902 #endif