#ifndef OPENCV_HAL_INTRIN_WASM_HPP
#define OPENCV_HAL_INTRIN_WASM_HPP

#include "opencv2/core/saturate.hpp"

#define CV_SIMD128_64F 0
#define CV_SIMD128_FP16 0

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) < (1038046)
// Emscripten SDKs older than 1.38.46 expose these intrinsics under their previous names.
#define wasm_i32x4_trunc_saturate_f32x4 wasm_trunc_saturate_i32x4_f32x4
#define wasm_u32x4_trunc_saturate_f32x4 wasm_trunc_saturate_u32x4_f32x4
#define wasm_i64x2_trunc_saturate_f64x2 wasm_trunc_saturate_i64x2_f64x2
#define wasm_u64x2_trunc_saturate_f64x2 wasm_trunc_saturate_u64x2_f64x2
#define wasm_f32x4_convert_i32x4 wasm_convert_f32x4_i32x4
#define wasm_f32x4_convert_u32x4 wasm_convert_f32x4_u32x4
#define wasm_f64x2_convert_i64x2 wasm_convert_f64x2_i64x2
#define wasm_f64x2_convert_u64x2 wasm_convert_f64x2_u64x2
#endif
struct v_uint8x16
{
    typedef uchar lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 16 };

    explicit v_uint8x16(v128_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = wasm_v128_load(v);
    }

    uchar get0() const
    {
        return (uchar)wasm_i8x16_extract_lane(val, 0);
    }

    v128_t val;
};
struct v_int8x16
{
    typedef schar lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 16 };

    explicit v_int8x16(v128_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = wasm_v128_load(v);
    }

    schar get0() const
    {
        return wasm_i8x16_extract_lane(val, 0);
    }

    v128_t val;
};
struct v_uint16x8
{
    typedef ushort lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 8 };

    explicit v_uint16x8(v128_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = wasm_v128_load(v);
    }

    ushort get0() const
    {
        return (ushort)wasm_i16x8_extract_lane(val, 0);
    }

    v128_t val;
};
struct v_int16x8
{
    typedef short lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 8 };

    explicit v_int16x8(v128_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = wasm_v128_load(v);
    }

    short get0() const
    {
        return wasm_i16x8_extract_lane(val, 0);
    }

    v128_t val;
};
struct v_uint32x4
{
    typedef unsigned lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 4 };

    explicit v_uint32x4(v128_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = wasm_v128_load(v);
    }

    unsigned get0() const
    {
        return (unsigned)wasm_i32x4_extract_lane(val, 0);
    }

    v128_t val;
};
struct v_int32x4
{
    typedef int lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 4 };

    explicit v_int32x4(v128_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = wasm_v128_load(v);
    }

    int get0() const
    {
        return wasm_i32x4_extract_lane(val, 0);
    }

    v128_t val;
};
struct v_float32x4
{
    typedef float lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 4 };

    explicit v_float32x4(v128_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = wasm_v128_load(v);
    }

    float get0() const
    {
        return wasm_f32x4_extract_lane(val, 0);
    }

    v128_t val;
};
struct v_uint64x2
{
    typedef uint64 lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 2 };

    explicit v_uint64x2(v128_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = wasm_v128_load(v);
    }

    uint64 get0() const
    {
        return (uint64)wasm_i64x2_extract_lane(val, 0);
    }

    v128_t val;
};
struct v_int64x2
{
    typedef int64 lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 2 };

    explicit v_int64x2(v128_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = wasm_v128_load(v);
    }

    int64 get0() const
    {
        return wasm_i64x2_extract_lane(val, 0);
    }

    v128_t val;
};
struct v_float64x2
{
    typedef double lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 2 };

    explicit v_float64x2(v128_t v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = wasm_v128_load(v);
    }

    double get0() const
    {
        return wasm_f64x2_extract_lane(val, 0);
    }

    v128_t val;
};
#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }
OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
static const unsigned char popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
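// popCountTable[b] holds the number of set bits in the byte value b; it backs the
// scalar fallback used by v_popcount for 64-bit lanes further below.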
static v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23);
}

static v128_t wasm_unpacklo_i16x8(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,1,16,17,2,3,18,19,4,5,20,21,6,7,22,23);
}

static v128_t wasm_unpacklo_i32x4(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23);
}

static v128_t wasm_unpacklo_i64x2(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
}

static v128_t wasm_unpackhi_i8x16(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31);
}

static v128_t wasm_unpackhi_i16x8(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31);
}

static v128_t wasm_unpackhi_i32x4(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31);
}

static v128_t wasm_unpackhi_i64x2(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
}
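// The helpers above emulate SSE-style unpacklo/unpackhi element interleaves with a single
// wasm_v8x16_shuffle: shuffle indices 0..15 select bytes from a, 16..31 select bytes from b.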
inline v128_t v128_cvtu8x16_i16x8(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i8x16(a, z);
}
inline v128_t v128_cvti8x16_i16x8(const v128_t& a)
{ return wasm_i16x8_shr(wasm_unpacklo_i8x16(a, a), 8); }

inline v128_t v128_cvtu8x16_i32x4(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i16x8(wasm_unpacklo_i8x16(a, z), z);
}
inline v128_t v128_cvti8x16_i32x4(const v128_t& a)
{
    v128_t r = wasm_unpacklo_i8x16(a, a);
    r = wasm_unpacklo_i8x16(r, r);
    return wasm_i32x4_shr(r, 24);
}
inline v128_t v128_cvtu16x8_i32x4(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i16x8(a, z);
}
inline v128_t v128_cvti16x8_i32x4(const v128_t& a)
{ return wasm_i32x4_shr(wasm_unpacklo_i16x8(a, a), 16); }

inline v128_t v128_cvtu32x4_i64x2(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i32x4(a, z);
}
inline v128_t v128_cvti32x4_i64x2(const v128_t& a)
{ return wasm_unpacklo_i32x4(a, wasm_i32x4_shr(a, 31)); }

inline v128_t v128_cvtu8x16_i16x8_high(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpackhi_i8x16(a, z);
}
inline v128_t v128_cvti8x16_i16x8_high(const v128_t& a)
{ return wasm_i16x8_shr(wasm_unpackhi_i8x16(a, a), 8); }

inline v128_t v128_cvtu16x8_i32x4_high(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpackhi_i16x8(a, z);
}
inline v128_t v128_cvti16x8_i32x4_high(const v128_t& a)
{ return wasm_i32x4_shr(wasm_unpackhi_i16x8(a, a), 16); }

inline v128_t v128_cvtu32x4_i64x2_high(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpackhi_i32x4(a, z);
}
inline v128_t v128_cvti32x4_i64x2_high(const v128_t& a)
{ return wasm_unpackhi_i32x4(a, wasm_i32x4_shr(a, 31)); }
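// The v128_cvt* helpers widen the low (or, for the *_high variants, the upper) half of a
// vector to the next lane width: unsigned versions zero-extend by interleaving with zeros,
// signed versions duplicate the lane and sign-extend with an arithmetic right shift.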
#define OPENCV_HAL_IMPL_WASM_INITVEC(_Tpvec, _Tp, suffix, zsuffix, _Tps) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(wasm_##zsuffix##_splat((_Tps)0)); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(wasm_##zsuffix##_splat((_Tps)v)); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(a.val); }

OPENCV_HAL_IMPL_WASM_INITVEC(v_uint8x16, uchar, u8, i8x16, schar)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int8x16, schar, s8, i8x16, schar)
OPENCV_HAL_IMPL_WASM_INITVEC(v_uint16x8, ushort, u16, i16x8, short)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int16x8, short, s16, i16x8, short)
OPENCV_HAL_IMPL_WASM_INITVEC(v_uint32x4, unsigned, u32, i32x4, int)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int32x4, int, s32, i32x4, int)
OPENCV_HAL_IMPL_WASM_INITVEC(v_float32x4, float, f32, f32x4, float)
OPENCV_HAL_IMPL_WASM_INITVEC(v_uint64x2, uint64, u64, i64x2, int64)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int64x2, int64, s64, i64x2, int64)
OPENCV_HAL_IMPL_WASM_INITVEC(v_float64x2, double, f64, f64x2, double)
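// v_setzero_*/v_setall_* broadcast a scalar into all lanes; v_reinterpret_as_* is a pure
// bit reinterpretation, since every vector type wraps the same v128_t value.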
inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval));
    return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}

inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval));
    return v_int8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}

inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval));
    return v_uint16x8(wasm_v8x16_shuffle(a1, b1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}

inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
{
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval));
    return v_int16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}

inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    return v_uint32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}

inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{
    return v_int32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}

inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval));
    return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}

inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval));
    return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
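// Pattern used throughout the pack family above: clamp each wide lane into the destination
// range with wasm_v128_bitselect, then gather the low bytes/halfwords of both inputs into a
// single vector with one byte shuffle.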
template<int n>
inline v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t b1 = wasm_u16x8_shr(wasm_i16x8_add(b.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u16x8_gt(b1, maxval));
    return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}

template<int n>
inline v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
    return v_int8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}

template<int n>
inline v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t b1 = wasm_u32x4_shr(wasm_i32x4_add(b.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u32x4_gt(b1, maxval));
    return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}

template<int n>
inline v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);  // splat at 32-bit width to match the i32x4 comparisons below
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
    return v_int16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}

template<int n>
inline v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t b1 = wasm_u64x2_shr(wasm_i64x2_add(b.val, delta), n);
    return v_uint32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}

template<int n>
inline v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t b1 = wasm_i64x2_shr(wasm_i64x2_add(b.val, delta), n);
    return v_int32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}

template<int n>
inline v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
    return v_uint8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}

template<int n>
inline v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
    return v_uint16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
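// The v_rshr_pack* variants add a rounding bias of 1 << (n-1) before shifting right by n,
// i.e. they compute (x + (1 << (n-1))) >> n per lane, then saturate and pack as above.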
inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}

inline void v_pack_store(schar* ptr, const v_int16x8& a)
{
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    schar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}

inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}

inline void v_pack_store(short* ptr, const v_int32x4& a)
{
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    short t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}

inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    unsigned t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}

inline void v_pack_store(int* ptr, const v_int64x2& a)
{
    v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    int t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}

inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}

inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
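// The *_store variants above pack a single vector, spill it to a small stack buffer and
// copy only the low half of the lanes to the destination pointer.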
template<int n>
inline void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
{
    v128_t delta = wasm_i16x8_splat((short)(1 << (n-1)));
    v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}

template<int n>
inline void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    schar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}

template<int n>
inline void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}

template<int n>
inline void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    short t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}

template<int n>
inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    unsigned t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}

template<int n>
inline void v_rshr_pack_store(int* ptr, const v_int64x2& a)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    int t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}

template<int n>
inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}

template<int n>
inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval));
    return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    v128_t maxval = wasm_i32x4_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval));
    v128_t c1 = wasm_v128_bitselect(maxval, c.val, wasm_u32x4_gt(c.val, maxval));
    v128_t d1 = wasm_v128_bitselect(maxval, d.val, wasm_u32x4_gt(d.val, maxval));
    v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
    v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
    return v_uint8x16(wasm_v8x16_shuffle(ab, cd, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    v128_t maxval = wasm_i32x4_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, ((__u64x2)(a.val) > (__u64x2)maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, ((__u64x2)(b.val) > (__u64x2)maxval));
    v128_t c1 = wasm_v128_bitselect(maxval, c.val, ((__u64x2)(c.val) > (__u64x2)maxval));
    v128_t d1 = wasm_v128_bitselect(maxval, d.val, ((__u64x2)(d.val) > (__u64x2)maxval));
    v128_t e1 = wasm_v128_bitselect(maxval, e.val, ((__u64x2)(e.val) > (__u64x2)maxval));
    v128_t f1 = wasm_v128_bitselect(maxval, f.val, ((__u64x2)(f.val) > (__u64x2)maxval));
    v128_t g1 = wasm_v128_bitselect(maxval, g.val, ((__u64x2)(g.val) > (__u64x2)maxval));
    v128_t h1 = wasm_v128_bitselect(maxval, h.val, ((__u64x2)(h.val) > (__u64x2)maxval));
    v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t ef = wasm_v8x16_shuffle(e1, f1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t gh = wasm_v8x16_shuffle(g1, h1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t abcd = wasm_v8x16_shuffle(ab, cd, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
    v128_t efgh = wasm_v8x16_shuffle(ef, gh, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
    return v_uint8x16(wasm_v8x16_shuffle(abcd, efgh, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
}
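// v_pack_b narrows 2, 4 or 8 wide vectors down to a single byte vector, clamping each lane
// to [0, 255] first; the multi-stage shuffles merge the partial results.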
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0));
    v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1));
    v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2));
    v128_t v3 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 3));
    v0 = wasm_f32x4_mul(v0, m0.val);
    v1 = wasm_f32x4_mul(v1, m1.val);
    v2 = wasm_f32x4_mul(v2, m2.val);
    v3 = wasm_f32x4_mul(v3, m3.val);

    return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, v3)));
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0));
    v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1));
    v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2));
    v0 = wasm_f32x4_mul(v0, m0.val);
    v1 = wasm_f32x4_mul(v1, m1.val);
    v2 = wasm_f32x4_mul(v2, m2.val);

    return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, a.val)));
}
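// v_matmul computes m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3], i.e. a 4x4 matrix given by the
// columns m0..m3 applied to the vector v; v_matmuladd replaces the last term with the
// constant vector a.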
#define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div)

#define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpwvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
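// OPENCV_HAL_IMPL_WASM_MUL_SAT defines operator* for narrow lane types by widening with
// v_mul_expand and saturating back down with v_pack.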
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);

    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);

    c.val = wasm_i32x4_mul(a0.val, b0.val);
    d.val = wasm_i32x4_mul(a1.val, b1.val);

    c.val = wasm_i32x4_mul(a0.val, b0.val);
    d.val = wasm_i32x4_mul(a1.val, b1.val);

    c.val = ((__u64x2)(a0.val) * (__u64x2)(b0.val));
    d.val = ((__u64x2)(a1.val) * (__u64x2)(b1.val));
    v128_t c = wasm_i32x4_mul(a0.val, b0.val);
    v128_t d = wasm_i32x4_mul(a1.val, b1.val);
    return v_int16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));

    v128_t c = wasm_i32x4_mul(a0.val, b0.val);
    v128_t d = wasm_i32x4_mul(a1.val, b1.val);
    return v_uint16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
    v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
    v128_t a1 = wasm_i32x4_shr(a.val, 16);
    v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
    v128_t b1 = wasm_i32x4_shr(b.val, 16);
    v128_t c = wasm_i32x4_mul(a0, b0);
    v128_t d = wasm_i32x4_mul(a1, b1);

    v128_t a0 = wasm_i64x2_shr(wasm_i64x2_shl(a.val, 32), 32);
    v128_t a1 = wasm_i64x2_shr(a.val, 32);
    v128_t b0 = wasm_i64x2_shr(wasm_i64x2_shl(b.val, 32), 32);
    v128_t b1 = wasm_i64x2_shr(b.val, 32);
    v128_t c = (v128_t)((__i64x2)a0 * (__i64x2)b0);
    v128_t d = (v128_t)((__i64x2)a1 * (__i64x2)b1);

    v128_t a0 = wasm_u16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
    v128_t a1 = wasm_u16x8_shr(a.val, 8);
    v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
    v128_t b1 = wasm_u16x8_shr(b.val, 8);

    v128_t a0 = wasm_i16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
    v128_t a1 = wasm_i16x8_shr(a.val, 8);
    v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
    v128_t b1 = wasm_i16x8_shr(b.val, 8);

    v128_t a0 = wasm_u32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
    v128_t a1 = wasm_u32x4_shr(a.val, 16);
    v128_t b0 = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
    v128_t b1 = wasm_u32x4_shr(b.val, 16);

    v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
    v128_t a1 = wasm_i32x4_shr(a.val, 16);
    v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
    v128_t b1 = wasm_i32x4_shr(b.val, 16);
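// In the snippets above, each shl/shr pair sign- or zero-extends the even-indexed narrow
// elements of a vector in place, while the bare shr extracts the odd-indexed elements;
// these split halves feed the widening multiplies.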
#define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \
OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \
OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \
OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(wasm_v128_not(a.val)); \
}
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    const v128_t _1_0 = wasm_f32x4_splat(1.0);
    return v_float32x4(wasm_f32x4_div(_1_0, wasm_f32x4_sqrt(x.val)));
}

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    const v128_t _1_0 = wasm_f64x2_splat(1.0);
    return v_float64x2(wasm_f64x2_div(_1_0, wasm_f64x2_sqrt(x.val)));
}
#define OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(_Tpuvec, _Tpsvec, suffix, zsuffix, shiftWidth) \
inline _Tpuvec v_abs(const _Tpsvec& x) \
{ \
    v128_t s = wasm_##suffix##_shr(x.val, shiftWidth); \
    v128_t f = wasm_##zsuffix##_shr(x.val, shiftWidth); \
    return _Tpuvec(wasm_##zsuffix##_add(wasm_v128_xor(x.val, f), s)); \
}
#define OPENCV_HAL_IMPL_WASM_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_min, wasm_f32x4_min)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_max, wasm_f32x4_max)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_min, wasm_f64x2_min)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_max, wasm_f64x2_max)
#define OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(_Tpvec, suffix) \
inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(wasm_v128_bitselect(b.val, a.val, wasm_##suffix##_gt(a.val, b.val))); \
} \
inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(wasm_v128_bitselect(a.val, b.val, wasm_##suffix##_gt(a.val, b.val))); \
}

OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int8x16, i8x16)
OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int16x8, i16x8)
OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int32x4, i32x4)
#define OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(_Tpvec, suffix, deltaNum) \
inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
{ \
    v128_t delta = wasm_##suffix##_splat(deltaNum); \
    v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \
    return _Tpvec(wasm_v128_bitselect(b.val, a.val, mask)); \
} \
inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
{ \
    v128_t delta = wasm_##suffix##_splat(deltaNum); \
    v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \
    return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask)); \
}

OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint8x16, i8x16, (schar)0x80)
OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000)
OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000)
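// Unsigned min/max reuse the signed greater-than comparison by XOR-ing both operands with
// the sign bit (0x80...), which maps unsigned ordering onto signed ordering.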
#define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); }

OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int8x16, i8x16, i8x16)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint16x8, u16x8, i16x8)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int16x8, i16x8, i16x8)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint32x4, u32x4, i32x4)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int32x4, i32x4, i32x4)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2)
#define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }

OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
inline v_float32x4 v_not_nan(const v_float32x4& a)
{
    v128_t z = wasm_i32x4_splat(0x7fffffff);
    v128_t t = wasm_i32x4_splat(0x7f800000);
    return v_float32x4(wasm_u32x4_lt(wasm_v128_and(a.val, z), t));
}
inline v_float64x2 v_not_nan(const v_float64x2& a)
{
    v128_t z = wasm_i64x2_splat(0x7fffffffffffffff);
    v128_t t = wasm_i64x2_splat(0x7ff0000000000000);
    return v_float64x2((__u64x2)(wasm_v128_and(a.val, z)) < (__u64x2)t);
}
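// A lane is not-NaN when its absolute bit pattern (sign bit masked off) is strictly below
// the all-ones-exponent pattern (0x7f800000 for float, 0x7ff0000000000000 for double).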
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_add_wrap, wasm_i8x16_add)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_add_wrap, wasm_i8x16_add)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_add_wrap, wasm_i16x8_add)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_add_wrap, wasm_i16x8_add)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_sub_wrap, wasm_i8x16_sub)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_sub_wrap, wasm_i8x16_sub)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_sub_wrap, wasm_i16x8_sub)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_sub_wrap, wasm_i16x8_sub)
#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) >= (1039012)
// For Emscripten >= 1.39.12, 8-bit wrap-around multiplication is done with a scalar loop
// instead of wasm_i8x16_mul.
inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
{
    uchar a_[16], b_[16];
    wasm_v128_store(a_, a.val);
    wasm_v128_store(b_, b.val);
    for (int i = 0; i < 16; i++)
        a_[i] = (uchar)(a_[i] * b_[i]);
    return v_uint8x16(wasm_v128_load(a_));
}
inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
{
    schar a_[16], b_[16];
    wasm_v128_store(a_, a.val);
    wasm_v128_store(b_, b.val);
    for (int i = 0; i < 16; i++)
        a_[i] = (schar)(a_[i] * b_[i]);
    return v_int8x16(wasm_v128_load(a_));
}
#else
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_mul_wrap, wasm_i8x16_mul)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_mul_wrap, wasm_i8x16_mul)
#endif
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_mul_wrap, wasm_i16x8_mul)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul)
inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{ return v_max(a, b) - v_min(a, b); }

inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{
    v_int8x16 d = v_sub_wrap(a, b);
    v_int8x16 m = a < b;
    return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{
    return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
}
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
    v_int32x4 d = a - b;
    v_int32x4 m = a < b;
    return v_reinterpret_as_u32((d ^ m) - m);
}

{ return v_max(a, b) - v_min(a, b); }
    return v_fma(a, b, c);

inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{
    v128_t absmask_vec = wasm_i32x4_splat(0x7fffffff);
    return v_float32x4(wasm_v128_and(wasm_f32x4_sub(a.val, b.val), absmask_vec));
}
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{
    v128_t absmask_vec = wasm_u64x2_shr(wasm_i32x4_splat(-1), 1);
    return v_float64x2(wasm_v128_and(wasm_f64x2_sub(a.val, b.val), absmask_vec));
}
#define OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(_Tpvec, suffix) \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
    v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
    return _Tpvec(wasm_##suffix##_sqrt(wasm_##suffix##_add(a_Square, b_Square))); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
    v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
    return _Tpvec(wasm_##suffix##_add(a_Square, b_Square)); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
    return _Tpvec(wasm_##suffix##_add(wasm_##suffix##_mul(a.val, b.val), c.val)); \
}

OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4)
OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2)
#define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
} \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
} \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
} \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
{ \
    return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shl(const _Tpsvec& a) \
{ \
    return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shr(const _Tpuvec& a) \
{ \
    return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shr(const _Tpsvec& a) \
{ \
    return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
}
namespace hal_wasm_internal
{
    template <int imm,
        bool is_invalid = ((imm < 0) || (imm > 16)),
        bool is_first = (imm == 0),
        bool is_second = (imm == 16),
        bool is_other = (((imm > 0) && (imm < 16)))>
    class v_wasm_palignr_u8_class;

    template <int imm>
    class v_wasm_palignr_u8_class<imm, true, false, false, false>;

    template <int imm>
    class v_wasm_palignr_u8_class<imm, false, true, false, false>
    {
    public:
        inline v128_t operator()(const v128_t& a, const v128_t&) const
        {
            return a;
        }
    };

    template <int imm>
    class v_wasm_palignr_u8_class<imm, false, false, true, false>
    {
    public:
        inline v128_t operator()(const v128_t&, const v128_t& b) const
        {
            return b;
        }
    };

    template <int imm>
    class v_wasm_palignr_u8_class<imm, false, false, false, true>
    {
    public:
        inline v128_t operator()(const v128_t& a, const v128_t& b) const
        {
            enum { imm2 = (sizeof(v128_t) - imm) };
            return wasm_v8x16_shuffle(a, b,
                                      imm, imm+1, imm+2, imm+3,
                                      imm+4, imm+5, imm+6, imm+7,
                                      imm+8, imm+9, imm+10, imm+11,
                                      imm+12, imm+13, imm+14, imm+15);
        }
    };

    template <int imm>
    inline v128_t v_wasm_palignr_u8(const v128_t& a, const v128_t& b)
    {
        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_wasm_palignr_u8.");
        return v_wasm_palignr_u8_class<imm>()(a, b);
    }
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a)
{
    using namespace hal_wasm_internal;
    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    v128_t z = wasm_i8x16_splat(0);
    return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, z));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a)
{
    using namespace hal_wasm_internal;
    enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
    v128_t z = wasm_i8x16_splat(0);
    return _Tpvec(v_wasm_palignr_u8<imm2>(z, a.val));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
{
    using namespace hal_wasm_internal;
    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, b.val));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
{
    using namespace hal_wasm_internal;
    enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_wasm_palignr_u8<imm2>(b.val, a.val));
}
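// The rotate helpers express lane rotation as a byte-level palignr: the element count is
// converted to a byte offset (imm2), and the shifted-in lanes come either from a zero
// vector (single-argument forms) or from the second operand.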
#define OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(_Tpvec, _Tp) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(wasm_v128_load(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(wasm_v128_load(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
    _Tp tmp[_Tpvec::nlanes] = {0}; \
    for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
        tmp[i] = ptr[i]; \
    } \
    return _Tpvec(wasm_v128_load(tmp)); \
} \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    _Tp tmp[_Tpvec::nlanes]; \
    for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
        tmp[i] = ptr0[i]; \
        tmp[i+_Tpvec::nlanes/2] = ptr1[i]; \
    } \
    return _Tpvec(wasm_v128_load(tmp)); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ wasm_v128_store(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ wasm_v128_store(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ wasm_v128_store(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode) \
{ \
    wasm_v128_store(ptr, a.val); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ \
    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
    wasm_v128_store(a_, a.val); \
    for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
        ptr[i] = a_[i]; \
} \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
    wasm_v128_store(a_, a.val); \
    for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
        ptr[i] = a_[i + (_Tpvec::nlanes / 2)]; \
}

OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint8x16, uchar)
OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int8x16, schar)
OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint16x8, ushort)
OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int16x8, short)
OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int32x4, int)
OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint64x2, uint64)
OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int64x2, int64)
OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float32x4, float)
OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float64x2, double)
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{ return v_uint8x16(wasm_v8x16_shuffle(a.val, a.val, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); }

inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{ return v_uint16x8(wasm_v8x16_shuffle(a.val, a.val, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); }

inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{ return v_uint32x4(wasm_v8x16_shuffle(a.val, a.val, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); }

inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{ return v_uint64x2(wasm_v8x16_shuffle(a.val, a.val, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); }

inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
#define OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    regtype val = a.val; \
    val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \
    val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3)); \
    return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \
}

OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_uint32x4, unsigned, v128_t, i32x4, i32x4)
OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_int32x4, int, v128_t, i32x4, i32x4)
OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_float32x4, float, v128_t, f32x4, f32x4)
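// Four-lane sums are reduced in two shuffle-and-add steps: each step folds the upper half
// of the remaining lanes onto the lower half, so lane 0 ends up holding the total.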
#define OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(_Tpvec, scalartype) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
    wasm_v128_store(a_, a.val); \
    scalartype c = a_[0]; \
    for (int i = 1; i < _Tpvec::nlanes; i++) \
        c += a_[i]; \
    return c; \
}

OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint8x16, unsigned)
OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int8x16, int)
OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint16x8, unsigned)
OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int16x8, int)
#define OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    regtype val = a.val; \
    val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \
    return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \
}

OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_uint64x2, uint64, v128_t, i64x2, i64x2)
OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_int64x2, int64, v128_t, i64x2, i64x2)
OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_float64x2, double, v128_t, f64x2, f64x2)
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    v128_t ac = wasm_f32x4_add(wasm_unpacklo_i32x4(a.val, c.val), wasm_unpackhi_i32x4(a.val, c.val));
    v128_t bd = wasm_f32x4_add(wasm_unpacklo_i32x4(b.val, d.val), wasm_unpackhi_i32x4(b.val, d.val));
    return v_float32x4(wasm_f32x4_add(wasm_unpacklo_i32x4(ac, bd), wasm_unpackhi_i32x4(ac, bd)));
}
#define OPENCV_HAL_IMPL_WASM_REDUCE_OP(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    scalartype buf[_Tpvec::nlanes]; \
    v_store(buf, a); \
    scalartype tmp = buf[0]; \
    for (int i=1; i<_Tpvec::nlanes; ++i) { \
        tmp = scalar_func(tmp, buf[i]); \
    } \
    return tmp; \
}
    v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32;

    v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32;
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{
    v128_t m1 = wasm_i32x4_splat(0x55555555);
    v128_t m2 = wasm_i32x4_splat(0x33333333);
    v128_t m4 = wasm_i32x4_splat(0x0f0f0f0f);
    v128_t p = a.val;
    p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 1), m1), wasm_v128_and(p, m1));
    p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 2), m2), wasm_v128_and(p, m2));
    p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 4), m4), wasm_v128_and(p, m4));
    return v_uint8x16(p);
}
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{
    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
    p += v_rotate_right<1>(p);
    return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
}
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{
    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
    p += v_rotate_right<1>(p);
    p += v_rotate_right<2>(p);
    return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
}
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{
    uint64 a_[2], b_[2] = { 0 };
    wasm_v128_store(a_, a.val);
    for (int i = 0; i < 16; i++)
        b_[i / 8] += popCountTable[((uint8_t*)a_)[i]];
    return v_uint64x2(b_[0], b_[1]);
}
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }
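// v_popcount counts set bits per lane: the 8-bit version uses the SWAR bit-slicing above,
// the wider unsigned lanes accumulate per-byte counts with rotates, and the 64-bit variant
// falls back to the popCountTable lookup.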
#define OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(_Tpvec, suffix, scalarType) \
inline int v_signmask(const _Tpvec& a) \
{ \
    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
    wasm_v128_store(a_, a.val); \
    int mask = 0; \
    for (int i = 0; i < _Tpvec::nlanes; i++) \
        mask |= (reinterpret_int(a_[i]) < 0) << i; \
    return mask; \
} \
inline bool v_check_all(const _Tpvec& a) \
{ return wasm_i8x16_all_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); } \
inline bool v_check_any(const _Tpvec& a) \
{ return wasm_i8x16_any_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); }

OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint8x16, i8x16, schar)
OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int8x16, i8x16, schar)
OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint16x8, i16x8, short)
OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int16x8, i16x8, short)
OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint32x4, i32x4, int)
OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int32x4, i32x4, int)
OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float32x4, i32x4, float)
OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float64x2, f64x2, double)
#define OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(_Tpvec, suffix, esuffix) \
inline bool v_check_all(const _Tpvec& a) \
{ \
    v128_t masked = v_reinterpret_as_##esuffix(a).val; \
    masked = wasm_i32x4_replace_lane(masked, 0, 0xffffffff); \
    masked = wasm_i32x4_replace_lane(masked, 2, 0xffffffff); \
    return wasm_i8x16_all_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
} \
inline bool v_check_any(const _Tpvec& a) \
{ \
    v128_t masked = v_reinterpret_as_##esuffix(a).val; \
    masked = wasm_i32x4_replace_lane(masked, 0, 0x0); \
    masked = wasm_i32x4_replace_lane(masked, 2, 0x0); \
    return wasm_i8x16_any_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
}

OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_int64x2, i32x4, s32)
OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_uint64x2, i32x4, u32)
#define OPENCV_HAL_IMPL_WASM_SELECT(_Tpvec) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask.val)); \
}
#define OPENCV_HAL_IMPL_WASM_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    b0.val = intrin(a.val); \
    b1.val = __CV_CAT(intrin, _high)(a.val); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ return _Tpwvec(intrin(a.val)); } \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    v128_t a = wasm_v128_load(ptr); \
    return _Tpwvec(intrin(a)); \
}

#define OPENCV_HAL_IMPL_WASM_EXPAND_Q(_Tpvec, _Tp, intrin) \
inline _Tpvec v_load_expand_q(const _Tp* ptr) \
{ \
    v128_t a = wasm_v128_load(ptr); \
    return _Tpvec(intrin(a)); \
}

OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_uint32x4, uchar, v128_cvtu8x16_i32x4)
OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_int32x4, schar, v128_cvti8x16_i32x4)
#define OPENCV_HAL_IMPL_WASM_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
{ \
    b0.val = wasm_unpacklo_##suffix(a0.val, a1.val); \
    b1.val = wasm_unpackhi_##suffix(a0.val, a1.val); \
} \
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(wasm_unpacklo_i64x2(a.val, b.val)); \
} \
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(wasm_unpackhi_i64x2(a.val, b.val)); \
} \
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
{ \
    c.val = wasm_unpacklo_i64x2(a.val, b.val); \
    d.val = wasm_unpackhi_i64x2(a.val, b.val); \
}

OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint8x16, i8x16)
OPENCV_HAL_IMPL_WASM_UNPACKS(v_int8x16, i8x16)
OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint16x8, i16x8)
OPENCV_HAL_IMPL_WASM_UNPACKS(v_int16x8, i16x8)
OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint32x4, i32x4)
OPENCV_HAL_IMPL_WASM_UNPACKS(v_int32x4, i32x4)
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{
    return v_rotate_right<s>(a, b);
}

inline v_int32x4 v_round(const v_float32x4& a)
{
    v128_t h = wasm_f32x4_splat(0.5);
    return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(wasm_f32x4_add(a.val, h)));
}

inline v_int32x4 v_floor(const v_float32x4& a)
{
    v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
    v128_t mask = wasm_f32x4_lt(a.val, wasm_f32x4_convert_i32x4(a1));
    return v_int32x4(wasm_i32x4_add(a1, mask));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
    v128_t mask = wasm_f32x4_gt(a.val, wasm_f32x4_convert_i32x4(a1));
    return v_int32x4(wasm_i32x4_sub(a1, mask));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(a.val)); }
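// The comparison masks are all-ones (-1) where they hold, so adding the mask in v_floor
// decrements truncated values that were rounded up, and subtracting it in v_ceil
// increments values where truncation rounded down.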
#define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc) \
inline v_int32x4 func(const v_float64x2& a) \
{ \
    double a_[2]; \
    wasm_v128_store(a_, a.val); \
    int c_[4]; \
    c_[0] = cfunc(a_[0]); \
    c_[1] = cfunc(a_[1]); \
    c_[2] = 0; \
    c_[3] = 0; \
    return v_int32x4(wasm_v128_load(c_)); \
}

OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int)

    double a_[2], b_[2];
    wasm_v128_store(a_, a.val);
    wasm_v128_store(b_, b.val);
#define OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                           const _Tpvec& a2, const _Tpvec& a3, \
                           _Tpvec& b0, _Tpvec& b1, \
                           _Tpvec& b2, _Tpvec& b3) \
{ \
    v128_t t0 = wasm_unpacklo_##suffix(a0.val, a1.val); \
    v128_t t1 = wasm_unpacklo_##suffix(a2.val, a3.val); \
    v128_t t2 = wasm_unpackhi_##suffix(a0.val, a1.val); \
    v128_t t3 = wasm_unpackhi_##suffix(a2.val, a3.val); \
\
    b0.val = wasm_unpacklo_i64x2(t0, t1); \
    b1.val = wasm_unpackhi_i64x2(t0, t1); \
    b2.val = wasm_unpacklo_i64x2(t2, t3); \
    b3.val = wasm_unpackhi_i64x2(t2, t3); \
}

OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_uint32x4, i32x4)
OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_int32x4, i32x4)
OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_float32x4, i32x4)
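// Standard 4x4 transpose: interleave the rows pairwise at 32-bit granularity, then
// recombine the 64-bit halves to produce the transposed columns.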
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
{
    v128_t t00 = wasm_v128_load(ptr);
    v128_t t01 = wasm_v128_load(ptr + 16);

    a.val = wasm_v8x16_shuffle(t00, t01, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30);
    b.val = wasm_v8x16_shuffle(t00, t01, 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31);
}

inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
    v128_t t00 = wasm_v128_load(ptr);
    v128_t t01 = wasm_v128_load(ptr + 16);
    v128_t t02 = wasm_v128_load(ptr + 32);

    v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,3,6,9,12,15,18,21,24,27,30,1,2,4,5,7);
    v128_t t11 = wasm_v8x16_shuffle(t00, t01, 1,4,7,10,13,16,19,22,25,28,31,0,2,3,5,6);
    v128_t t12 = wasm_v8x16_shuffle(t00, t01, 2,5,8,11,14,17,20,23,26,29,0,1,3,4,6,7);

    a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29);
    b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30);
    c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31);
}

inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
{
    v128_t u0 = wasm_v128_load(ptr);
    v128_t u1 = wasm_v128_load(ptr + 16);
    v128_t u2 = wasm_v128_load(ptr + 32);
    v128_t u3 = wasm_v128_load(ptr + 48);

    v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
    v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
    v128_t v2 = wasm_v8x16_shuffle(u0, u1, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
    v128_t v3 = wasm_v8x16_shuffle(u2, u3, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);

    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
    b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
    c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
    d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
{
    v128_t v0 = wasm_v128_load(ptr);
    v128_t v1 = wasm_v128_load(ptr + 8);

    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29);
    b.val = wasm_v8x16_shuffle(v0, v1, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31);
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
    v128_t t00 = wasm_v128_load(ptr);
    v128_t t01 = wasm_v128_load(ptr + 8);
    v128_t t02 = wasm_v128_load(ptr + 16);

    v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,6,7,12,13,18,19,24,25,30,31,2,3,4,5);
    v128_t t11 = wasm_v8x16_shuffle(t00, t01, 2,3,8,9,14,15,20,21,26,27,0,1,4,5,6,7);
    v128_t t12 = wasm_v8x16_shuffle(t00, t01, 4,5,10,11,16,17,22,23,28,29,0,1,2,3,6,7);

    a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,26,27);
    b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,16,17,22,23,28,29);
    c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,18,19,24,25,30,31);
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
{
    v128_t u0 = wasm_v128_load(ptr);
    v128_t u1 = wasm_v128_load(ptr + 8);
    v128_t u2 = wasm_v128_load(ptr + 16);
    v128_t u3 = wasm_v128_load(ptr + 24);

    v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27);
    v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27);
    v128_t v2 = wasm_v8x16_shuffle(u0, u1, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31);
    v128_t v3 = wasm_v8x16_shuffle(u2, u3, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31);

    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
    b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
    c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
    d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
{
    v128_t v0 = wasm_v128_load(ptr);
    v128_t v1 = wasm_v128_load(ptr + 4);

    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
    b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
    v128_t t00 = wasm_v128_load(ptr);
    v128_t t01 = wasm_v128_load(ptr + 4);
    v128_t t02 = wasm_v128_load(ptr + 8);

    v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
    v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
    v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);

    a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
    b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
    c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
}

inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
    v128_t v0 = wasm_v128_load(ptr);
    v128_t v1 = wasm_v128_load((ptr + 4));

    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
    b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31);
}

inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
{
    v128_t t00 = wasm_v128_load(ptr);
    v128_t t01 = wasm_v128_load(ptr + 4);
    v128_t t02 = wasm_v128_load(ptr + 8);

    v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
    v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
    v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);

    a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
    b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
    c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
}

inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b)
{
    v128_t t0 = wasm_v128_load(ptr);
    v128_t t1 = wasm_v128_load(ptr + 2);

    a.val = wasm_unpacklo_i64x2(t0, t1);
    b.val = wasm_unpackhi_i64x2(t0, t1);
}

inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
{
    v128_t t0 = wasm_v128_load(ptr);
    v128_t t1 = wasm_v128_load(ptr + 2);
    v128_t t2 = wasm_v128_load(ptr + 4);

    a.val = wasm_v8x16_shuffle(t0, t1, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
    b.val = wasm_v8x16_shuffle(t0, t2, 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23);
    c.val = wasm_v8x16_shuffle(t1, t2, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
}

inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
{
    v128_t t0 = wasm_v128_load(ptr);
    v128_t t1 = wasm_v128_load(ptr + 2);
    v128_t t2 = wasm_v128_load(ptr + 4);
    v128_t t3 = wasm_v128_load(ptr + 6);

    a.val = wasm_unpacklo_i64x2(t0, t2);
    b.val = wasm_unpackhi_i64x2(t0, t2);
    c.val = wasm_unpacklo_i64x2(t1, t3);
    d.val = wasm_unpackhi_i64x2(t1, t3);
}
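// Illustrative sketch (not part of the original header): deinterleaving packed
// RGB bytes with the 3-channel uchar overload above; 'rgb' is a hypothetical
// pointer to at least 48 interleaved bytes.
//
//   v_uint8x16 r, g, b;
//   v_load_deinterleave(rgb, r, g, b);   // r = R0..R15, g = G0..G15, b = B0..B15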
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                hal::StoreMode = hal::STORE_UNALIGNED)
{
    v128_t v0 = wasm_unpacklo_i8x16(a.val, b.val);
    v128_t v1 = wasm_unpackhi_i8x16(a.val, b.val);

    wasm_v128_store(ptr, v0);
    wasm_v128_store(ptr + 16, v1);
}

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, hal::StoreMode = hal::STORE_UNALIGNED)
{
    v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5);
    v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 21,0,6,22,0,7,23,0,8,24,0,9,25,0,10,26);
    v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0);

    v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15);
    v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15);
    v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31);

    wasm_v128_store(ptr, t10);
    wasm_v128_store(ptr + 16, t11);
    wasm_v128_store(ptr + 32, t12);
}

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, const v_uint8x16& d,
                                hal::StoreMode = hal::STORE_UNALIGNED)
{
    v128_t u0 = wasm_unpacklo_i8x16(a.val, c.val);
    v128_t u1 = wasm_unpackhi_i8x16(a.val, c.val);
    v128_t u2 = wasm_unpacklo_i8x16(b.val, d.val);
    v128_t u3 = wasm_unpackhi_i8x16(b.val, d.val);

    v128_t v0 = wasm_unpacklo_i8x16(u0, u2);
    v128_t v1 = wasm_unpackhi_i8x16(u0, u2);
    v128_t v2 = wasm_unpacklo_i8x16(u1, u3);
    v128_t v3 = wasm_unpackhi_i8x16(u1, u3);

    wasm_v128_store(ptr, v0);
    wasm_v128_store(ptr + 16, v1);
    wasm_v128_store(ptr + 32, v2);
    wasm_v128_store(ptr + 48, v3);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                hal::StoreMode = hal::STORE_UNALIGNED)
{
    v128_t v0 = wasm_unpacklo_i16x8(a.val, b.val);
    v128_t v1 = wasm_unpackhi_i16x8(a.val, b.val);

    wasm_v128_store(ptr, v0);
    wasm_v128_store(ptr + 8, v1);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                const v_uint16x8& c, hal::StoreMode = hal::STORE_UNALIGNED)
{
    v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,16,17,0,0,2,3,18,19,0,0,4,5,20,21);
    v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 0,0,6,7,22,23,0,0,8,9,24,25,0,0,10,11);
    v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 26,27,0,0,12,13,28,29,0,0,14,15,30,31,0,0);

    v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,16,17,6,7,8,9,18,19,12,13,14,15);
    v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 20,21,2,3,4,5,22,23,8,9,10,11,24,25,14,15);
    v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 0,1,26,27,4,5,6,7,28,29,10,11,12,13,30,31);

    wasm_v128_store(ptr, t10);
    wasm_v128_store(ptr + 8, t11);
    wasm_v128_store(ptr + 16, t12);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                const v_uint16x8& c, const v_uint16x8& d,
                                hal::StoreMode = hal::STORE_UNALIGNED)
{
    v128_t u0 = wasm_unpacklo_i16x8(a.val, c.val);
    v128_t u1 = wasm_unpackhi_i16x8(a.val, c.val);
    v128_t u2 = wasm_unpacklo_i16x8(b.val, d.val);
    v128_t u3 = wasm_unpackhi_i16x8(b.val, d.val);

    v128_t v0 = wasm_unpacklo_i16x8(u0, u2);
    v128_t v1 = wasm_unpackhi_i16x8(u0, u2);
    v128_t v2 = wasm_unpacklo_i16x8(u1, u3);
    v128_t v3 = wasm_unpackhi_i16x8(u1, u3);

    wasm_v128_store(ptr, v0);
    wasm_v128_store(ptr + 8, v1);
    wasm_v128_store(ptr + 16, v2);
    wasm_v128_store(ptr + 24, v3);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                hal::StoreMode = hal::STORE_UNALIGNED)
{
    v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val);
    v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val);

    wasm_v128_store(ptr, v0);
    wasm_v128_store(ptr + 4, v1);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                const v_uint32x4& c, hal::StoreMode = hal::STORE_UNALIGNED)
{
    v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
    v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
    v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);

    v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
    v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
    v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);

    wasm_v128_store(ptr, t10);
    wasm_v128_store(ptr + 4, t11);
    wasm_v128_store(ptr + 8, t12);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                const v_uint32x4& c, const v_uint32x4& d,
                                hal::StoreMode = hal::STORE_UNALIGNED)
{
    v_uint32x4 v0, v1, v2, v3;
    v_transpose4x4(a, b, c, d, v0, v1, v2, v3);

    wasm_v128_store(ptr, v0.val);
    wasm_v128_store(ptr + 4, v1.val);
    wasm_v128_store(ptr + 8, v2.val);
    wasm_v128_store(ptr + 12, v3.val);
}

inline void v_store_interleave( float* ptr, const v_float32x4& a, const v_float32x4& b,
                                hal::StoreMode = hal::STORE_UNALIGNED)
{
    v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val);
    v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val);

    wasm_v128_store(ptr, v0);
    wasm_v128_store(ptr + 4, v1);
}

inline void v_store_interleave( float* ptr, const v_float32x4& a, const v_float32x4& b,
                                const v_float32x4& c, hal::StoreMode = hal::STORE_UNALIGNED)
{
    v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
    v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
    v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);

    v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
    v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
    v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);

    wasm_v128_store(ptr, t10);
    wasm_v128_store(ptr + 4, t11);
    wasm_v128_store(ptr + 8, t12);
}

inline void v_store_interleave( float* ptr, const v_float32x4& a, const v_float32x4& b,
                                const v_float32x4& c, const v_float32x4& d,
                                hal::StoreMode = hal::STORE_UNALIGNED)
{
    v_float32x4 v0, v1, v2, v3;
    v_transpose4x4(a, b, c, d, v0, v1, v2, v3);

    wasm_v128_store(ptr, v0.val);
    wasm_v128_store(ptr + 4, v1.val);
    wasm_v128_store(ptr + 8, v2.val);
    wasm_v128_store(ptr + 12, v3.val);
}

inline void v_store_interleave( uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
                                hal::StoreMode = hal::STORE_UNALIGNED)
{
    v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val);
    v128_t v1 = wasm_unpackhi_i64x2(a.val, b.val);

    wasm_v128_store(ptr, v0);
    wasm_v128_store(ptr + 2, v1);
}

inline void v_store_interleave( uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
                                const v_uint64x2& c, hal::StoreMode = hal::STORE_UNALIGNED)
{
    v128_t v0 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
    v128_t v1 = wasm_v8x16_shuffle(a.val, c.val, 16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15);
    v128_t v2 = wasm_v8x16_shuffle(b.val, c.val, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);

    wasm_v128_store(ptr, v0);
    wasm_v128_store(ptr + 2, v1);
    wasm_v128_store(ptr + 4, v2);
}

inline void v_store_interleave( uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
                                const v_uint64x2& c, const v_uint64x2& d,
                                hal::StoreMode = hal::STORE_UNALIGNED)
{
    v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val);
    v128_t v1 = wasm_unpacklo_i64x2(c.val, d.val);
    v128_t v2 = wasm_unpackhi_i64x2(a.val, b.val);
    v128_t v3 = wasm_unpackhi_i64x2(c.val, d.val);

    wasm_v128_store(ptr, v0);
    wasm_v128_store(ptr + 2, v1);
    wasm_v128_store(ptr + 4, v2);
    wasm_v128_store(ptr + 6, v3);
}
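// Illustrative sketch (not part of the original header): the inverse of the
// deinterleave sketch above, re-packing the hypothetical r/g/b planes into
// interleaved RGB bytes.
//
//   v_store_interleave(rgb, r, g, b);    // writes R0 G0 B0 R1 G1 B1 ...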
#define OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
    _Tpvec1 a1, b1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
    _Tpvec1 a1, b1, c1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
    _Tpvec1 a1, b1, c1, d1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, const _Tpvec0& d0, \
                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}
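// The macro above forwards other element types through the unsigned
// implementations via v_reinterpret_as_*. The instantiation list is not
// reproduced in this excerpt; one assumed example of how it would be used:
//
//   OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)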
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(wasm_f32x4_convert_i32x4(a.val));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    double a_[2];
    float c_[4];
    wasm_v128_store(a_, a.val);
    c_[0] = (float)(a_[0]);
    c_[1] = (float)(a_[1]);
    c_[2] = 0;
    c_[3] = 0;
    return v_float32x4(wasm_v128_load(c_));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    double a_[2], b_[2];
    float c_[4];
    wasm_v128_store(a_, a.val);
    wasm_v128_store(b_, b.val);
    c_[0] = (float)(a_[0]);
    c_[1] = (float)(a_[1]);
    c_[2] = (float)(b_[0]);
    c_[3] = (float)(b_[1]);
    return v_float32x4(wasm_v128_load(c_));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
#ifdef __wasm_unimplemented_simd128__
    v128_t p = v128_cvti32x4_i64x2(a.val);
    return v_float64x2(wasm_f64x2_convert_i64x2(p));
#else
    int a_[4];
    double c_[2];
    wasm_v128_store(a_, a.val);
    c_[0] = (double)(a_[0]);
    c_[1] = (double)(a_[1]);
    return v_float64x2(wasm_v128_load(c_));
#endif
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
#ifdef __wasm_unimplemented_simd128__
    v128_t p = v128_cvti32x4_i64x2_high(a.val);
    return v_float64x2(wasm_f64x2_convert_i64x2(p));
#else
    int a_[4];
    double c_[2];
    wasm_v128_store(a_, a.val);
    c_[0] = (double)(a_[2]);
    c_[1] = (double)(a_[3]);
    return v_float64x2(wasm_v128_load(c_));
#endif
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    float a_[4];
    double c_[2];
    wasm_v128_store(a_, a.val);
    c_[0] = (double)(a_[0]);
    c_[1] = (double)(a_[1]);
    return v_float64x2(wasm_v128_load(c_));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    float a_[4];
    double c_[2];
    wasm_v128_store(a_, a.val);
    c_[0] = (double)(a_[2]);
    c_[1] = (double)(a_[3]);
    return v_float64x2(wasm_v128_load(c_));
}

inline v_float64x2 v_cvt_f64(const v_int64x2& a)
{
#ifdef __wasm_unimplemented_simd128__
    return v_float64x2(wasm_f64x2_convert_i64x2(a.val));
#else
    int64 a_[2];
    double c_[2];
    wasm_v128_store(a_, a.val);
    c_[0] = (double)(a_[0]);
    c_[1] = (double)(a_[1]);
    return v_float64x2(wasm_v128_load(c_));
#endif
}
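// Illustrative sketch (not part of the original header): widening two double
// vectors into one float vector with the two-argument v_cvt_f32 above.
//
//   v_float64x2 lo(0.5, 1.5), hi(2.5, 3.5);
//   v_float32x4 f = v_cvt_f32(lo, hi);   // f = {0.5f, 1.5f, 2.5f, 3.5f}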
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
    return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
                     tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
    return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1], tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1],
                     tab[idx[4]], tab[idx[4]+1], tab[idx[5]], tab[idx[5]+1], tab[idx[6]], tab[idx[6]+1], tab[idx[7]], tab[idx[7]+1]);
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
    return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3], tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3],
                     tab[idx[2]], tab[idx[2]+1], tab[idx[2]+2], tab[idx[2]+3], tab[idx[3]], tab[idx[3]+1], tab[idx[3]+2], tab[idx[3]+3]);
}
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }

inline v_int16x8 v_lut(const short* tab, const int* idx)
{
    return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
                     tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
    return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1],
                     tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1]);
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3],
                     tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3]);
}
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }

inline v_int32x4 v_lut(const int* tab, const int* idx)
{
    return v_int32x4(tab[idx[0]], tab[idx[1]],
                     tab[idx[2]], tab[idx[3]]);
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    return v_int32x4(tab[idx[0]], tab[idx[0]+1],
                     tab[idx[1]], tab[idx[1]+1]);
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
    return v_int32x4(wasm_v128_load(tab + idx[0]));
}
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }

inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
    return v_int64x2(tab[idx[0]], tab[idx[1]]);
}
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
    return v_int64x2(wasm_v128_load(tab + idx[0]));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }

inline v_float32x4 v_lut(const float* tab, const int* idx)
{
    return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); }
inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }
inline
v_float64x2
v_lut(
const
double* tab,
const
int* idx)
2625
inline
v_float64x2
v_lut_pairs(
const
double* tab,
const
int* idx)
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    return v_int32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
                     tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
                     tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
                     tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
}

inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    return v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
                       tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
                       tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
                       tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
}

inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    return v_float64x2(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
                       tab[wasm_i32x4_extract_lane(idxvec.val, 1)]);
}

inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    x = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
                    tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
                    tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
                    tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
    y = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)+1],
                    tab[wasm_i32x4_extract_lane(idxvec.val, 1)+1],
                    tab[wasm_i32x4_extract_lane(idxvec.val, 2)+1],
                    tab[wasm_i32x4_extract_lane(idxvec.val, 3)+1]);
}

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    v128_t xy0 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 0));
    v128_t xy1 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 1));
    x.val = wasm_unpacklo_i64x2(xy0, xy1);
    y.val = wasm_unpackhi_i64x2(xy0, xy1);
}
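// Illustrative sketch (not part of the original header): gathering four floats
// either through an index array or an index vector; 'tab' is a hypothetical
// table with valid entries at the given indices.
//
//   int idx[4] = {0, 5, 2, 7};
//   v_float32x4 g1 = v_lut(tab, idx);                    // tab[0],tab[5],tab[2],tab[7]
//   v_float32x4 g2 = v_lut(tab, v_int32x4(0, 5, 2, 7));  // same result, index-vector form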
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }

inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    return v_int32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
{
    return v_float32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
}

inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,4,5,6,8,9,10,12,13,14,16,16,16,16));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,4,5,8,9,10,11,12,13,14,15,6,7));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
template<int i, typename _Tp>
inline typename _Tp::lane_type v_extract_n(const _Tp& a)
{
    return v_rotate_right<i>(a).get0();
}

template<int i>
inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
{
    return v_setall_u32(v_extract_n<i>(a));
}
template<int i>
inline v_int32x4 v_broadcast_element(const v_int32x4& a)
{
    return v_setall_s32(v_extract_n<i>(a));
}
template<int i>
inline v_float32x4 v_broadcast_element(const v_float32x4& a)
{
    return v_setall_f32(v_extract_n<i>(a));
}
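// Illustrative sketch (not part of the original header): reading lane 2 of a
// vector and splatting it across all lanes.
//
//   v_int32x4 v(10, 20, 30, 40);
//   int x = v_extract_n<2>(v);                 // x == 30
//   v_int32x4 s = v_broadcast_element<2>(v);   // {30, 30, 30, 30}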
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    float a[4];
    for (int i = 0; i < 4; i++)
        a[i] = ptr[i];
    return v_float32x4(wasm_v128_load(a));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    float v_[4];
    wasm_v128_store(v_, v.val);
    ptr[0] = float16_t(v_[0]);
    ptr[1] = float16_t(v_[1]);
    ptr[2] = float16_t(v_[2]);
    ptr[3] = float16_t(v_[3]);
}
inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END