OpenCV 4.5.3 (Japanese machine translation)
intrin_wasm.hpp
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_HAL_INTRIN_WASM_HPP
#define OPENCV_HAL_INTRIN_WASM_HPP

#include <limits>
#include <cstring>
#include <algorithm>
#include "opencv2/core/saturate.hpp"

#define CV_SIMD128 1
#define CV_SIMD128_64F 0 // All f64 implementations currently use the fallback, so this stays disabled.
#define CV_SIMD128_FP16 0

namespace cv
{

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) < (1038046)
// handle renames: https://github.com/emscripten-core/emscripten/pull/9440 (https://github.com/emscripten-core/emscripten/commit/755d5b46cb84d0aa120c10981b11d05646c29673)
#define wasm_i32x4_trunc_saturate_f32x4 wasm_trunc_saturate_i32x4_f32x4
#define wasm_u32x4_trunc_saturate_f32x4 wasm_trunc_saturate_u32x4_f32x4
#define wasm_i64x2_trunc_saturate_f64x2 wasm_trunc_saturate_i64x2_f64x2
#define wasm_u64x2_trunc_saturate_f64x2 wasm_trunc_saturate_u64x2_f64x2
#define wasm_f32x4_convert_i32x4 wasm_convert_f32x4_i32x4
#define wasm_f32x4_convert_u32x4 wasm_convert_f32x4_u32x4
#define wasm_f64x2_convert_i64x2 wasm_convert_f64x2_i64x2
#define wasm_f64x2_convert_u64x2 wasm_convert_f64x2_u64x2
#endif // COMPATIBILITY: <1.38.46

struct v_uint8x16
{
    typedef uchar lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(v128_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = wasm_v128_load(v);
    }

    uchar get0() const
    {
        return (uchar)wasm_i8x16_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_int8x16
{
    typedef schar lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(v128_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = wasm_v128_load(v);
    }

    schar get0() const
    {
        return wasm_i8x16_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(v128_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = wasm_v128_load(v);
    }

    ushort get0() const
    {
        return (ushort)wasm_i16x8_extract_lane(val, 0); // wasm_u16x8_extract_lane() is not implemented yet
    }

    v128_t val;
};

struct v_int16x8
{
    typedef short lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(v128_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = wasm_v128_load(v);
    }

    short get0() const
    {
        return wasm_i16x8_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(v128_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = wasm_v128_load(v);
    }

    unsigned get0() const
    {
        return (unsigned)wasm_i32x4_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_int32x4
{
    typedef int lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(v128_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = wasm_v128_load(v);
    }

    int get0() const
    {
        return wasm_i32x4_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_float32x4
{
    typedef float lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(v128_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = wasm_v128_load(v);
    }

    float get0() const
    {
        return wasm_f32x4_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(v128_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = wasm_v128_load(v);
    }

    uint64 get0() const
    {
        return (uint64)wasm_i64x2_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(v128_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = wasm_v128_load(v);
    }

    int64 get0() const
    {
        return wasm_i64x2_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_float64x2
{
    typedef double lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(v128_t v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = wasm_v128_load(v);
    }

    double get0() const
    {
        return wasm_f64x2_extract_lane(val, 0);
    }

    v128_t val;
};

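// Usage sketch (editorial illustration, not part of the original header): each
// v_* struct above wraps a single 128-bit WebAssembly SIMD register. The
// multi-argument constructor loads lanes in order and get0() reads lane 0.
//   v_int32x4 v(1, 2, 3, 4);   // lanes: {1, 2, 3, 4}
//   int first = v.get0();      // 1
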
namespace
{
#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }
OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)

static const unsigned char popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
} // namespace

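// Illustrative note (editorial): popCountTable maps a byte value to the number
// of bits set in it, enabling a table-driven per-byte population count.
//   int bits = popCountTable[0xF0];  // 4: 0xF0 = 0b11110000 has four set bits
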
static v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23);
}

static v128_t wasm_unpacklo_i16x8(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,1,16,17,2,3,18,19,4,5,20,21,6,7,22,23);
}

static v128_t wasm_unpacklo_i32x4(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23);
}

static v128_t wasm_unpacklo_i64x2(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
}

static v128_t wasm_unpackhi_i8x16(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31);
}

static v128_t wasm_unpackhi_i16x8(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31);
}

static v128_t wasm_unpackhi_i32x4(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31);
}

static v128_t wasm_unpackhi_i64x2(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
}

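// Illustrative note (editorial): these helpers emulate the SSE
// punpcklo/punpckhi pattern with a byte shuffle, interleaving the low (or
// high) halves of two registers.
//   // a = {a0..a15}, b = {b0..b15}
//   // wasm_unpacklo_i8x16(a, b) -> {a0,b0,a1,b1,...,a7,b7}
//   // wasm_unpackhi_i8x16(a, b) -> {a8,b8,a9,b9,...,a15,b15}
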
// 8 >> 16
inline v128_t v128_cvtu8x16_i16x8(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i8x16(a, z);
}
inline v128_t v128_cvti8x16_i16x8(const v128_t& a)
{ return wasm_i16x8_shr(wasm_unpacklo_i8x16(a, a), 8); }
// 8 >> 32
inline v128_t v128_cvtu8x16_i32x4(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i16x8(wasm_unpacklo_i8x16(a, z), z);
}
inline v128_t v128_cvti8x16_i32x4(const v128_t& a)
{
    v128_t r = wasm_unpacklo_i8x16(a, a);
    r = wasm_unpacklo_i8x16(r, r);
    return wasm_i32x4_shr(r, 24);
}
// 16 >> 32
inline v128_t v128_cvtu16x8_i32x4(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i16x8(a, z);
}
inline v128_t v128_cvti16x8_i32x4(const v128_t& a)
{ return wasm_i32x4_shr(wasm_unpacklo_i16x8(a, a), 16); }
// 32 >> 64
inline v128_t v128_cvtu32x4_i64x2(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i32x4(a, z);
}
inline v128_t v128_cvti32x4_i64x2(const v128_t& a)
{ return wasm_unpacklo_i32x4(a, wasm_i32x4_shr(a, 31)); }

// 16 << 8
inline v128_t v128_cvtu8x16_i16x8_high(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpackhi_i8x16(a, z);
}
inline v128_t v128_cvti8x16_i16x8_high(const v128_t& a)
{ return wasm_i16x8_shr(wasm_unpackhi_i8x16(a, a), 8); }
// 32 << 16
inline v128_t v128_cvtu16x8_i32x4_high(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpackhi_i16x8(a, z);
}
inline v128_t v128_cvti16x8_i32x4_high(const v128_t& a)
{ return wasm_i32x4_shr(wasm_unpackhi_i16x8(a, a), 16); }
// 64 << 32
inline v128_t v128_cvtu32x4_i64x2_high(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpackhi_i32x4(a, z);
}
inline v128_t v128_cvti32x4_i64x2_high(const v128_t& a)
{ return wasm_unpackhi_i32x4(a, wasm_i32x4_shr(a, 31)); }

#define OPENCV_HAL_IMPL_WASM_INITVEC(_Tpvec, _Tp, suffix, zsuffix, _Tps) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(wasm_##zsuffix##_splat((_Tps)0)); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(wasm_##zsuffix##_splat((_Tps)v)); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(a.val); }

OPENCV_HAL_IMPL_WASM_INITVEC(v_uint8x16, uchar, u8, i8x16, schar)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int8x16, schar, s8, i8x16, schar)
OPENCV_HAL_IMPL_WASM_INITVEC(v_uint16x8, ushort, u16, i16x8, short)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int16x8, short, s16, i16x8, short)
OPENCV_HAL_IMPL_WASM_INITVEC(v_uint32x4, unsigned, u32, i32x4, int)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int32x4, int, s32, i32x4, int)
OPENCV_HAL_IMPL_WASM_INITVEC(v_float32x4, float, f32, f32x4, float)
OPENCV_HAL_IMPL_WASM_INITVEC(v_uint64x2, uint64, u64, i64x2, int64)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int64x2, int64, s64, i64x2, int64)
OPENCV_HAL_IMPL_WASM_INITVEC(v_float64x2, double, f64, f64x2, double)

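// Usage sketch (editorial illustration): the macro above generates
// setzero/setall constructors and bit-pattern reinterpretation for every
// vector type.
//   v_uint8x16 z = v_setzero_u8();               // all lanes 0
//   v_float32x4 ones = v_setall_f32(1.f);        // all lanes 1.0f
//   v_int32x4 bits = v_reinterpret_as_s32(ones); // lanes 0x3f800000
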
inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval));
    return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval));
    return v_int8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval));
    return v_uint16x8(wasm_v8x16_shuffle(a1, b1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
{
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval));
    return v_int16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    return v_uint32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}
inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{
    return v_int32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}
inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval));
    return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval));
    return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}

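// Usage sketch (editorial illustration): v_pack narrows two wide vectors into
// one with saturation; v_pack_u narrows signed input to unsigned output.
//   v_uint16x8 a = v_setall_u16(300), b = v_setall_u16(100);
//   v_uint8x16 r = v_pack(a, b);  // lanes 0..7 = 255 (clamped), lanes 8..15 = 100
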
template<int n>
inline v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t b1 = wasm_u16x8_shr(wasm_i16x8_add(b.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u16x8_gt(b1, maxval));
    return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
template<int n>
inline v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
    return v_int8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
template<int n>
inline v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t b1 = wasm_u32x4_shr(wasm_i32x4_add(b.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u32x4_gt(b1, maxval));
    return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
template<int n>
inline v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(32767);
    // Fixed: the original used wasm_i16x8_splat(-32768), which produces 0x80008000
    // per 32-bit lane and breaks the lower clamp; the splat must match the lane width.
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
    return v_int16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
template<int n>
inline v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t b1 = wasm_u64x2_shr(wasm_i64x2_add(b.val, delta), n);
    return v_uint32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}
template<int n>
inline v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t b1 = wasm_i64x2_shr(wasm_i64x2_add(b.val, delta), n);
    return v_int32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}
template<int n>
inline v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
    return v_uint8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
template<int n>
inline v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0); // was wasm_i16x8_splat(0): same bits, but the splat width should match the 32-bit lanes compared below
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
    return v_uint16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}

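// Usage sketch (editorial illustration): v_rshr_pack<n> adds 2^(n-1), shifts
// right by n (i.e. divides by 2^n with rounding to nearest), then packs with
// saturation.
//   v_int32x4 a = v_setall_s32(10), b = v_setall_s32(-10);
//   v_int16x8 r = v_rshr_pack<2>(a, b);  // (10+2)>>2 = 3, (-10+2)>>2 = -2
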
inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(schar* ptr, const v_int16x8& a)
{
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    schar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(short* ptr, const v_int32x4& a)
{
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    short t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    unsigned t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(int* ptr, const v_int64x2& a)
{
    v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    int t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}

template<int n>
inline void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
{
    v128_t delta = wasm_i16x8_splat((short)(1 << (n-1)));
    v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    schar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    short t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    unsigned t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(int* ptr, const v_int64x2& a)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    int t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}

inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval));
    return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    v128_t maxval = wasm_i32x4_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval));
    v128_t c1 = wasm_v128_bitselect(maxval, c.val, wasm_u32x4_gt(c.val, maxval));
    v128_t d1 = wasm_v128_bitselect(maxval, d.val, wasm_u32x4_gt(d.val, maxval));
    v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
    v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
    return v_uint8x16(wasm_v8x16_shuffle(ab, cd, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    v128_t maxval = wasm_i32x4_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, ((__u64x2)(a.val) > (__u64x2)maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, ((__u64x2)(b.val) > (__u64x2)maxval));
    v128_t c1 = wasm_v128_bitselect(maxval, c.val, ((__u64x2)(c.val) > (__u64x2)maxval));
    v128_t d1 = wasm_v128_bitselect(maxval, d.val, ((__u64x2)(d.val) > (__u64x2)maxval));
    v128_t e1 = wasm_v128_bitselect(maxval, e.val, ((__u64x2)(e.val) > (__u64x2)maxval));
    v128_t f1 = wasm_v128_bitselect(maxval, f.val, ((__u64x2)(f.val) > (__u64x2)maxval));
    v128_t g1 = wasm_v128_bitselect(maxval, g.val, ((__u64x2)(g.val) > (__u64x2)maxval));
    v128_t h1 = wasm_v128_bitselect(maxval, h.val, ((__u64x2)(h.val) > (__u64x2)maxval));
    v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t ef = wasm_v8x16_shuffle(e1, f1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t gh = wasm_v8x16_shuffle(g1, h1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t abcd = wasm_v8x16_shuffle(ab, cd, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
    v128_t efgh = wasm_v8x16_shuffle(ef, gh, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
    return v_uint8x16(wasm_v8x16_shuffle(abcd, efgh, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
}

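// Usage sketch (editorial illustration): v_pack_b collapses 2, 4 or 8 wide
// vectors into one byte vector, clamping each lane to [0, 255]. It is
// typically applied to comparison masks, whose lanes are 0 or all-ones.
//   v_uint32x4 m = (v_setall_u32(5) > v_setall_u32(3)); // each lane 0xffffffff
//   // v_pack_b(m, m, m, m) -> sixteen 0xff bytes
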
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0));
    v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1));
    v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2));
    v128_t v3 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 3));
    v0 = wasm_f32x4_mul(v0, m0.val);
    v1 = wasm_f32x4_mul(v1, m1.val);
    v2 = wasm_f32x4_mul(v2, m2.val);
    v3 = wasm_f32x4_mul(v3, m3.val);

    return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, v3)));
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0));
    v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1));
    v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2));
    v0 = wasm_f32x4_mul(v0, m0.val);
    v1 = wasm_f32x4_mul(v1, m1.val);
    v2 = wasm_f32x4_mul(v2, m2.val);

    return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, a.val)));
}

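// Usage sketch (editorial illustration): v_matmul computes
// m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3], i.e. a 4x4 matrix given as four
// column vectors m0..m3 times the vector v.
//   v_float32x4 col0(1,0,0,0), col1(0,1,0,0), col2(0,0,1,0), col3(0,0,0,1);
//   v_float32x4 r = v_matmul(v_float32x4(1,2,3,4), col0, col1, col2, col3);
//   // identity matrix: r = {1, 2, 3, 4}
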
#define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div)

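// Usage sketch (editorial illustration): note that 8- and 16-bit + and -
// saturate, while 32/64-bit integer ops wrap.
//   v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//   v_uint8x16 s = a + b;   // 255 in every lane (saturated, not 44)
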
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpwvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }

OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint16x8, v_uint32x4)
OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int16x8, v_int32x4)

// Multiply and expand
inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    v_uint16x8 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    v_int16x8 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    v_int32x4 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c.val = wasm_i32x4_mul(a0.val, b0.val);
    d.val = wasm_i32x4_mul(a1.val, b1.val);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    v_uint32x4 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c.val = wasm_i32x4_mul(a0.val, b0.val);
    d.val = wasm_i32x4_mul(a1.val, b1.val);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    v_uint64x2 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c.val = ((__u64x2)(a0.val) * (__u64x2)(b0.val));
    d.val = ((__u64x2)(a1.val) * (__u64x2)(b1.val));
}

inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    v_int32x4 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    v128_t c = wasm_i32x4_mul(a0.val, b0.val);
    v128_t d = wasm_i32x4_mul(a1.val, b1.val);
    return v_int16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    v_uint32x4 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    v128_t c = wasm_i32x4_mul(a0.val, b0.val);
    v128_t d = wasm_i32x4_mul(a1.val, b1.val);
    return v_uint16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
}


inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
    v128_t a1 = wasm_i32x4_shr(a.val, 16);
    v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
    v128_t b1 = wasm_i32x4_shr(b.val, 16);
    v128_t c = wasm_i32x4_mul(a0, b0);
    v128_t d = wasm_i32x4_mul(a1, b1);
    return v_int32x4(wasm_i32x4_add(c, d));
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b) + c; }

inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    v128_t a0 = wasm_i64x2_shr(wasm_i64x2_shl(a.val, 32), 32);
    v128_t a1 = wasm_i64x2_shr(a.val, 32);
    v128_t b0 = wasm_i64x2_shr(wasm_i64x2_shl(b.val, 32), 32);
    v128_t b1 = wasm_i64x2_shr(b.val, 32);
    v128_t c = (v128_t)((__i64x2)a0 * (__i64x2)b0);
    v128_t d = (v128_t)((__i64x2)a1 * (__i64x2)b1);
    return v_int64x2(wasm_i64x2_add(c, d));
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    return v_dotprod(a, b) + c;
}

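// Usage sketch (editorial illustration): v_dotprod multiplies adjacent lane
// pairs and adds them, halving the lane count while doubling the lane width.
//   v_int16x8 a(1,2,3,4,5,6,7,8), b = v_setall_s16(1);
//   v_int32x4 d = v_dotprod(a, b);   // {1+2, 3+4, 5+6, 7+8} = {3, 7, 11, 15}
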
// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    v128_t a0 = wasm_u16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
    v128_t a1 = wasm_u16x8_shr(a.val, 8);
    v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
    v128_t b1 = wasm_u16x8_shr(b.val, 8);
    return v_uint32x4((
        v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
        v_dotprod(v_int16x8(a1), v_int16x8(b1))).val
    );
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    v128_t a0 = wasm_i16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
    v128_t a1 = wasm_i16x8_shr(a.val, 8);
    v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
    v128_t b1 = wasm_i16x8_shr(b.val, 8);
    return v_int32x4(
        v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
        v_dotprod(v_int16x8(a1), v_int16x8(b1))
    );
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand(a, b) + c; }

// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t a0 = wasm_u32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
    v128_t a1 = wasm_u32x4_shr(a.val, 16);
    v128_t b0 = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
    v128_t b1 = wasm_u32x4_shr(b.val, 16);
    return v_uint64x2((
        v_dotprod(v_int32x4(a0), v_int32x4(b0)) +
        v_dotprod(v_int32x4(a1), v_int32x4(b1))).val
    );
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
    v128_t a1 = wasm_i32x4_shr(a.val, 16);
    v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
    v128_t b1 = wasm_i32x4_shr(b.val, 16);
    return v_int64x2((
        v_dotprod(v_int32x4(a0), v_int32x4(b0)) +
        v_dotprod(v_int32x4(a1), v_int32x4(b1)))
    );
}

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }

// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }

// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod(a, b); }
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b, c); }

// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod(a, b); }
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod(a, b, c); }

// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{ return v_dotprod_expand(a, b); }
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand(a, b, c); }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{ return v_dotprod_expand(a, b); }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand(a, b, c); }

// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{ return v_dotprod_expand(a, b); }
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b, c); }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod_expand(a, b); }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b, c); }

// 32 >> 64f
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod_expand(a, b); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b, c); }

#define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \
OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \
OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \
OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(wasm_v128_not(a.val)); \
}

OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint8x16)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int8x16)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint16x8)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int16x8)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint32x4)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int32x4)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint64x2)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int64x2)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float32x4)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float64x2)

inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    return v_float32x4(wasm_f32x4_sqrt(x.val));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    const v128_t _1_0 = wasm_f32x4_splat(1.0);
    return v_float32x4(wasm_f32x4_div(_1_0, wasm_f32x4_sqrt(x.val)));
}

inline v_float64x2 v_sqrt(const v_float64x2& x)
{
    return v_float64x2(wasm_f64x2_sqrt(x.val));
}

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    const v128_t _1_0 = wasm_f64x2_splat(1.0);
    return v_float64x2(wasm_f64x2_div(_1_0, wasm_f64x2_sqrt(x.val)));
}

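// Usage sketch (editorial illustration): on this backend v_invsqrt is an exact
// 1/sqrt(x) computed via a division, not a fast reciprocal approximation as on
// some other SIMD backends.
//   v_float32x4 x = v_setall_f32(4.f);
//   v_float32x4 r = v_invsqrt(x);   // 0.5f in every lane
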
#define OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(_Tpuvec, _Tpsvec, suffix, zsuffix, shiftWidth) \
inline _Tpuvec v_abs(const _Tpsvec& x) \
{ \
    v128_t s = wasm_##suffix##_shr(x.val, shiftWidth); \
    v128_t f = wasm_##zsuffix##_shr(x.val, shiftWidth); \
    return _Tpuvec(wasm_##zsuffix##_add(wasm_v128_xor(x.val, f), s)); \
}

OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint8x16, v_int8x16, u8x16, i8x16, 7)
OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint16x8, v_int16x8, u16x8, i16x8, 15)
OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint32x4, v_int32x4, u32x4, i32x4, 31)

inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(wasm_f32x4_abs(x.val)); }
inline v_float64x2 v_abs(const v_float64x2& x)
{
    return v_float64x2(wasm_f64x2_abs(x.val));
}

// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_WASM_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_min, wasm_f32x4_min)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_max, wasm_f32x4_max)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_min, wasm_f64x2_min)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_max, wasm_f64x2_max)

#define OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(_Tpvec, suffix) \
inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(wasm_v128_bitselect(b.val, a.val, wasm_##suffix##_gt(a.val, b.val))); \
} \
inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(wasm_v128_bitselect(a.val, b.val, wasm_##suffix##_gt(a.val, b.val))); \
}

OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int8x16, i8x16)
OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int16x8, i16x8)
OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int32x4, i32x4)

#define OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(_Tpvec, suffix, deltaNum) \
inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
{ \
    v128_t delta = wasm_##suffix##_splat(deltaNum); \
    v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \
    return _Tpvec(wasm_v128_bitselect(b.val, a.val, mask)); \
} \
inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
{ \
    v128_t delta = wasm_##suffix##_splat(deltaNum); \
    v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \
    return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask)); \
}

OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint8x16, i8x16, (schar)0x80)
OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000)
OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000)

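// Note (editorial illustration): the unsigned min/max above XOR both operands
// with the sign bit, which maps unsigned order onto signed order, and then use
// the signed greater-than comparison.
//   // 8-bit example: 200^0x80 = 72, 100^0x80 = 0xE4 = -28 (signed);
//   // signed 72 > -28 reproduces unsigned 200 > 100
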
#define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); }

OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int8x16, i8x16, i8x16)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint16x8, u16x8, i16x8)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int16x8, i16x8, i16x8)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint32x4, u32x4, i32x4)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int32x4, i32x4, i32x4)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2)

#define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }

OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)

inline v_float32x4 v_not_nan(const v_float32x4& a)
{
    v128_t z = wasm_i32x4_splat(0x7fffffff);
    v128_t t = wasm_i32x4_splat(0x7f800000);
    return v_float32x4(wasm_u32x4_lt(wasm_v128_and(a.val, z), t));
}
inline v_float64x2 v_not_nan(const v_float64x2& a)
{
    v128_t z = wasm_i64x2_splat(0x7fffffffffffffff);
    v128_t t = wasm_i64x2_splat(0x7ff0000000000000);
    return v_float64x2((__u64x2)(wasm_v128_and(a.val, z)) < (__u64x2)t);
}

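// Usage sketch (editorial illustration): v_not_nan masks off the sign bit and
// tests the magnitude against the infinity bit pattern, yielding an all-ones
// mask for finite lanes and zero for NaN lanes.
//   v_float32x4 x(1.f, std::numeric_limits<float>::quiet_NaN(), 2.f, 3.f);
//   v_float32x4 m = v_not_nan(x);  // lanes: all-ones, 0, all-ones, all-ones
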
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_add_wrap, wasm_i8x16_add)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_add_wrap, wasm_i8x16_add)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_add_wrap, wasm_i16x8_add)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_add_wrap, wasm_i16x8_add)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_sub_wrap, wasm_i8x16_sub)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_sub_wrap, wasm_i8x16_sub)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_sub_wrap, wasm_i16x8_sub)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_sub_wrap, wasm_i16x8_sub)
#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) >= (1039012)
// details: https://github.com/opencv/opencv/issues/18097 ( https://github.com/emscripten-core/emscripten/issues/12018 )
// 1.39.12: https://github.com/emscripten-core/emscripten/commit/cd801d0f110facfd694212a3c8b2ed2ffcd630e2
inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
{
    uchar a_[16], b_[16];
    wasm_v128_store(a_, a.val);
    wasm_v128_store(b_, b.val);
    for (int i = 0; i < 16; i++)
        a_[i] = (uchar)(a_[i] * b_[i]);
    return v_uint8x16(wasm_v128_load(a_));
}
inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
{
    schar a_[16], b_[16];
    wasm_v128_store(a_, a.val);
    wasm_v128_store(b_, b.val);
    for (int i = 0; i < 16; i++)
        a_[i] = (schar)(a_[i] * b_[i]);
    return v_int8x16(wasm_v128_load(a_));
}
#else
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_mul_wrap, wasm_i8x16_mul)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_mul_wrap, wasm_i8x16_mul)
#endif
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_mul_wrap, wasm_i16x8_mul)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul)

inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{ return v_max(a, b) - v_min(a, b); }

inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{
    v_int8x16 d = v_sub_wrap(a, b);
    v_int8x16 m = a < b;
    return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{
    return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
}
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
    v_int32x4 d = a - b;
    v_int32x4 m = a < b;
    return v_reinterpret_as_u32((d ^ m) - m);
}

inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{
    v_int8x16 d = a - b;
    v_int8x16 m = a < b;
    return (d ^ m) - m;
}
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_max(a, b) - v_min(a, b); }

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return a * b + c;
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return a * b + c;
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return a * b + c;
}

inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{
    v128_t absmask_vec = wasm_i32x4_splat(0x7fffffff);
    return v_float32x4(wasm_v128_and(wasm_f32x4_sub(a.val, b.val), absmask_vec));
}
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{
    v128_t absmask_vec = wasm_u64x2_shr(wasm_i32x4_splat(-1), 1);
    return v_float64x2(wasm_v128_and(wasm_f64x2_sub(a.val, b.val), absmask_vec));
}

#define OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(_Tpvec, suffix) \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
    v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
    return _Tpvec(wasm_##suffix##_sqrt(wasm_##suffix##_add(a_Square, b_Square))); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
    v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
    return _Tpvec(wasm_##suffix##_add(a_Square, b_Square)); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
    return _Tpvec(wasm_##suffix##_add(wasm_##suffix##_mul(a.val, b.val), c.val)); \
}

OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4)
OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2)

#define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
} \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
} \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
} \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
{ \
    return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shl(const _Tpsvec& a) \
{ \
    return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shr(const _Tpuvec& a) \
{ \
    return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shr(const _Tpsvec& a) \
{ \
    return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
}

OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint8x16, v_int8x16, i8x16, u8x16)
OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint16x8, v_int16x8, i16x8, u16x8)
OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint32x4, v_int32x4, i32x4, u32x4)
OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint64x2, v_int64x2, i64x2, u64x2)

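// Usage sketch (editorial illustration): << shifts left; >> is arithmetic for
// signed types and logical for unsigned ones; v_shl/v_shr take the shift
// amount as a template argument.
//   v_int32x4 a = v_setall_s32(-8);
//   v_int32x4 r1 = a >> 1;                       // -4 in every lane (arithmetic)
//   v_uint32x4 r2 = v_shr<1>(v_setall_u32(8));   // 4 in every lane (logical)
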
namespace hal_wasm_internal
{
    template <int imm,
        bool is_invalid = ((imm < 0) || (imm > 16)),
        bool is_first = (imm == 0),
        bool is_second = (imm == 16),
        bool is_other = (((imm > 0) && (imm < 16)))>
    class v_wasm_palignr_u8_class;

    template <int imm>
    class v_wasm_palignr_u8_class<imm, true, false, false, false>;

    template <int imm>
    class v_wasm_palignr_u8_class<imm, false, true, false, false>
    {
    public:
        inline v128_t operator()(const v128_t& a, const v128_t&) const
        {
            return a;
        }
    };

    template <int imm>
    class v_wasm_palignr_u8_class<imm, false, false, true, false>
    {
    public:
        inline v128_t operator()(const v128_t&, const v128_t& b) const
        {
            return b;
        }
    };

    template <int imm>
    class v_wasm_palignr_u8_class<imm, false, false, false, true>
    {
    public:
        inline v128_t operator()(const v128_t& a, const v128_t& b) const
        {
            enum { imm2 = (sizeof(v128_t) - imm) };
            return wasm_v8x16_shuffle(a, b,
                                      imm, imm+1, imm+2, imm+3,
                                      imm+4, imm+5, imm+6, imm+7,
                                      imm+8, imm+9, imm+10, imm+11,
                                      imm+12, imm+13, imm+14, imm+15);
        }
    };

    template <int imm>
    inline v128_t v_wasm_palignr_u8(const v128_t& a, const v128_t& b)
    {
        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_wasm_palignr_u8.");
        return v_wasm_palignr_u8_class<imm>()(a, b);
    }
}

1485 template<int imm, typename _Tpvec>
1486 inline _Tpvec v_rotate_right(const _Tpvec &a)
1487{
1488 using namespace hal_wasm_internal;
1489 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1490 v128_t z = wasm_i8x16_splat(0);
1491 return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, z));
1492}
1493
1494 template<int imm, typename _Tpvec>
1495 inline _Tpvec v_rotate_left(const _Tpvec &a)
1496{
1497 using namespace hal_wasm_internal;
1498 enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1499 v128_t z = wasm_i8x16_splat(0);
1500 return _Tpvec(v_wasm_palignr_u8<imm2>(z, a.val));
1501}
1502
1503 template<int imm, typename _Tpvec>
1504 inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
1505{
1506 using namespace hal_wasm_internal;
1507 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1508 return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, b.val));
1509}
1510
1511 template<int imm, typename _Tpvec>
1512 inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
1513{
1514 using namespace hal_wasm_internal;
1515 enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1516 return _Tpvec(v_wasm_palignr_u8<imm2>(b.val, a.val));
1517}
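v_rotate_right<imm> moves lanes toward index 0 and fills the vacated lanes with zeros; the two-vector forms fill them from the second argument instead. For example:

    v_int32x4 a(0, 1, 2, 3), b(4, 5, 6, 7);
    v_int32x4 r1 = v_rotate_right<1>(a);     // 1, 2, 3, 0
    v_int32x4 r2 = v_rotate_right<1>(a, b);  // 1, 2, 3, 4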
1518
1519 #define OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(_Tpvec, _Tp) \
1520 inline _Tpvec v_load(const _Tp* ptr) \
1521 { return _Tpvec(wasm_v128_load(ptr)); } \
1522 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1523 { return _Tpvec(wasm_v128_load(ptr)); } \
1524 inline _Tpvec v_load_low(const _Tp* ptr) \
1525 { \
1526 _Tp tmp[_Tpvec::nlanes] = {0}; \
1527 for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
1528 tmp[i] = ptr[i]; \
1529 } \
1530 return _Tpvec(wasm_v128_load(tmp)); \
1531 } \
1532 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1533 { \
1534 _Tp tmp[_Tpvec::nlanes]; \
1535 for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
1536 tmp[i] = ptr0[i]; \
1537 tmp[i+_Tpvec::nlanes/2] = ptr1[i]; \
1538 } \
1539 return _Tpvec(wasm_v128_load(tmp)); \
1540 } \
1541 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1542 { wasm_v128_store(ptr, a.val); } \
1543 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1544 { wasm_v128_store(ptr, a.val); } \
1545 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1546 { wasm_v128_store(ptr, a.val); } \
1547 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/ ) \
1548 { \
1549 wasm_v128_store(ptr, a.val); \
1550 } \
1551 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1552 { \
1553 _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1554 wasm_v128_store(a_, a.val); \
1555 for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
1556 ptr[i] = a_[i]; \
1557 } \
1558 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1559 { \
1560 _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1561 wasm_v128_store(a_, a.val); \
1562 for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
1563 ptr[i] = a_[i + (_Tpvec::nlanes / 2)]; \
1564 }
1565
1566OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint8x16, uchar)
1567OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int8x16, schar)
1568OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint16x8, ushort)
1569OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int16x8, short)
1570OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint32x4, unsigned)
1571OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int32x4, int)
1572OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint64x2, uint64)
1573OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int64x2, int64)
1574OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float32x4, float)
1575OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float64x2, double)
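Note that wasm_v128_load/store carry no alignment requirement, so the aligned and unaligned variants are identical here. A round-trip sketch:

    short buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    v_int16x8 v  = v_load(buf);      // all eight lanes
    v_int16x8 lo = v_load_low(buf);  // 1..4 in the low half, zeros above
    short out[8];
    v_store(out, v);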
1576
1577
1578
1579 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
1580{ return v_uint8x16(wasm_v8x16_shuffle(a.val, a.val, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); }
1581
1582 inline v_int8x16 v_reverse(const v_int8x16 &a)
1583{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1584
1585 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
1586{ return v_uint16x8(wasm_v8x16_shuffle(a.val, a.val, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); }
1587
1588 inline v_int16x8 v_reverse(const v_int16x8 &a)
1589{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1590
1591 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
1592{ return v_uint32x4(wasm_v8x16_shuffle(a.val, a.val, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); }
1593
1594 inline v_int32x4 v_reverse(const v_int32x4 &a)
1595{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1596
1597 inline v_float32x4 v_reverse(const v_float32x4 &a)
1598{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1599
1600 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
1601{ return v_uint64x2(wasm_v8x16_shuffle(a.val, a.val, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); }
1602
1603 inline v_int64x2 v_reverse(const v_int64x2 &a)
1604{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1605
1606 inline v_float64x2 v_reverse(const v_float64x2 &a)
1607{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1608
1609
1610 #define OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
1611 inline scalartype v_reduce_sum(const _Tpvec& a) \
1612 { \
1613 regtype val = a.val; \
1614 val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \
1615 val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3)); \
1616 return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \
1617 }
1618
1619OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_uint32x4, unsigned, v128_t, i32x4, i32x4)
1620OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_int32x4, int, v128_t, i32x4, i32x4)
1621OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_float32x4, float, v128_t, f32x4, f32x4)
1622
 1623 // TODO: optimize v_reduce_sum with wasm intrinsics.
 1624 // A scalar fallback is used for now, since wasm SIMD has no widening ops yet.
1625
1626 #define OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(_Tpvec, scalartype) \
1627 inline scalartype v_reduce_sum(const _Tpvec& a) \
1628 { \
1629 _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1630 wasm_v128_store(a_, a.val); \
1631 scalartype c = a_[0]; \
1632 for (int i = 1; i < _Tpvec::nlanes; i++) \
1633 c += a_[i]; \
1634 return c; \
1635 }
1636
1637OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint8x16, unsigned)
1638OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int8x16, int)
1639OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint16x8, unsigned)
1640OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int16x8, int)
1641
1642
1643 #define OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
1644 inline scalartype v_reduce_sum(const _Tpvec& a) \
1645 { \
1646 regtype val = a.val; \
1647 val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \
1648 return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \
1649 }
1650OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_uint64x2, uint64, v128_t, i64x2, i64x2)
1651OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_int64x2, int64, v128_t, i64x2, i64x2)
2652OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_float64x2, double, v128_t, f64x2, f64x2)
1653
1654 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1655 const v_float32x4& c, const v_float32x4& d)
1656{
1657 v128_t ac = wasm_f32x4_add(wasm_unpacklo_i32x4(a.val, c.val), wasm_unpackhi_i32x4(a.val, c.val));
1658 v128_t bd = wasm_f32x4_add(wasm_unpacklo_i32x4(b.val, d.val), wasm_unpackhi_i32x4(b.val, d.val));
1659 return v_float32x4(wasm_f32x4_add(wasm_unpacklo_i32x4(ac, bd), wasm_unpackhi_i32x4(ac, bd)));
1660}
1661
1662 #define OPENCV_HAL_IMPL_WASM_REDUCE_OP(_Tpvec, scalartype, func, scalar_func) \
1663 inline scalartype v_reduce_##func(const _Tpvec& a) \
1664 { \
1665 scalartype buf[_Tpvec::nlanes]; \
1666 v_store(buf, a); \
1667 scalartype tmp = buf[0]; \
1668 for (int i=1; i<_Tpvec::nlanes; ++i) { \
1669 tmp = scalar_func(tmp, buf[i]); \
1670 } \
1671 return tmp; \
1672 }
1673
1674OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, max, std::max)
1675OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, min, std::min)
1676OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, max, std::max)
1677OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, min, std::min)
1678OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, max, std::max)
1679OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, min, std::min)
1680OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, max, std::max)
1681OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, min, std::min)
1682OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, max, std::max)
1683OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, min, std::min)
1684OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, max, std::max)
1685OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, min, std::min)
1686OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, max, std::max)
1687OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, min, std::min)
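These reductions spill the register into a scalar buffer and fold it with the scalar min/max; for example:

    v_int32x4 v(3, 9, 1, 7);
    int mx = v_reduce_max(v);  // 9
    int s  = v_reduce_sum(v);  // 20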
1688
1689 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1690{
1691 v_uint16x8 l16, h16;
1692 v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32;
1693 v_expand(v_absdiff(a, b), l16, h16);
1694 v_expand(l16, l16_l32, l16_h32);
1695 v_expand(h16, h16_l32, h16_h32);
1696 return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32);
1697}
1698 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1699{
1700 v_uint16x8 l16, h16;
1701 v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32;
1702 v_expand(v_absdiff(a, b), l16, h16);
1703 v_expand(l16, l16_l32, l16_h32);
1704 v_expand(h16, h16_l32, h16_h32);
1705 return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32);
1706}
1707 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1708{
1709 v_uint32x4 l, h;
1710 v_expand(v_absdiff(a, b), l, h);
1711 return v_reduce_sum(l + h);
1712}
1713 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1714{
1715 v_uint32x4 l, h;
1716 v_expand(v_absdiff(a, b), l, h);
1717 return v_reduce_sum(l + h);
1718}
1719 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1720{
1721 return v_reduce_sum(v_absdiff(a, b));
1722}
1723 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1724{
1725 return v_reduce_sum(v_absdiff(a, b));
1726}
1727 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1728{
1729 return v_reduce_sum(v_absdiff(a, b));
1730}
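A quick check of the SAD reduction (sum of absolute differences across all lanes):

    v_uint8x16 a = v_setall_u8(10), b = v_setall_u8(13);
    unsigned sad = v_reduce_sad(a, b);  // 16 lanes * |10 - 13| = 48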
1731
1732 inline v_uint8x16 v_popcount(const v_uint8x16& a)
1733{
1734 v128_t m1 = wasm_i32x4_splat(0x55555555);
1735 v128_t m2 = wasm_i32x4_splat(0x33333333);
1736 v128_t m4 = wasm_i32x4_splat(0x0f0f0f0f);
1737 v128_t p = a.val;
1738 p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 1), m1), wasm_v128_and(p, m1));
1739 p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 2), m2), wasm_v128_and(p, m2));
1740 p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 4), m4), wasm_v128_and(p, m4));
1741 return v_uint8x16(p);
1742}
1743 inline v_uint16x8 v_popcount(const v_uint16x8& a)
1744{
1745 v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1746 p += v_rotate_right<1>(p);
1747 return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
1748}
1749 inline v_uint32x4 v_popcount(const v_uint32x4& a)
1750{
1751 v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1752 p += v_rotate_right<1>(p);
1753 p += v_rotate_right<2>(p);
1754 return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
1755}
1756 inline v_uint64x2 v_popcount(const v_uint64x2& a)
1757{
1758 uint64 a_[2], b_[2] = { 0 };
1759 wasm_v128_store(a_, a.val);
1760 for (int i = 0; i < 16; i++)
1761 b_[i / 8] += popCountTable[((uint8_t*)a_)[i]];
1762 return v_uint64x2(wasm_v128_load(b_));
1763}
1764 inline v_uint8x16 v_popcount(const v_int8x16& a)
1765{ return v_popcount(v_reinterpret_as_u8(a)); }
1766 inline v_uint16x8 v_popcount(const v_int16x8& a)
1767{ return v_popcount(v_reinterpret_as_u16(a)); }
1768 inline v_uint32x4 v_popcount(const v_int32x4& a)
1769{ return v_popcount(v_reinterpret_as_u32(a)); }
1770 inline v_uint64x2 v_popcount(const v_int64x2& a)
1771{ return v_popcount(v_reinterpret_as_u64(a)); }
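The byte version uses the classic SWAR reduction with the 0x55/0x33/0x0f masks; wider types then sum neighboring byte counts via rotates. For example:

    v_uint8x16 v = v_setall_u8(0xb1);  // 10110001b
    v_uint8x16 c = v_popcount(v);      // every lane holds 4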
1772
1773 #define OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(_Tpvec, suffix, scalarType) \
1774 inline int v_signmask(const _Tpvec& a) \
1775 { \
1776 _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1777 wasm_v128_store(a_, a.val); \
1778 int mask = 0; \
1779 for (int i = 0; i < _Tpvec::nlanes; i++) \
1780 mask |= (reinterpret_int(a_[i]) < 0) << i; \
1781 return mask; \
1782 } \
1783 inline bool v_check_all(const _Tpvec& a) \
1784 { return wasm_i8x16_all_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); } \
1785 inline bool v_check_any(const _Tpvec& a) \
 1786 { return wasm_i8x16_any_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); }
1787
1788OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint8x16, i8x16, schar)
1789OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int8x16, i8x16, schar)
1790OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint16x8, i16x8, short)
1791OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int16x8, i16x8, short)
1792OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint32x4, i32x4, int)
1793OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int32x4, i32x4, int)
1794OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float32x4, i32x4, float)
1795OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float64x2, f64x2, double)
1796
1797 #define OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(_Tpvec, suffix, esuffix) \
1798 inline bool v_check_all(const _Tpvec& a) \
1799 { \
1800 v128_t masked = v_reinterpret_as_##esuffix(a).val; \
1801 masked = wasm_i32x4_replace_lane(masked, 0, 0xffffffff); \
1802 masked = wasm_i32x4_replace_lane(masked, 2, 0xffffffff); \
1803 return wasm_i8x16_all_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
1804 } \
1805 inline bool v_check_any(const _Tpvec& a) \
1806 { \
1807 v128_t masked = v_reinterpret_as_##esuffix(a).val; \
1808 masked = wasm_i32x4_replace_lane(masked, 0, 0x0); \
1809 masked = wasm_i32x4_replace_lane(masked, 2, 0x0); \
1810 return wasm_i8x16_any_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
1811 } \
1812
1813OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_int64x2, i32x4, s32)
1814OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_uint64x2, i32x4, u32)
1815
1816
1817 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1818 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1819 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1820 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1821 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1822 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1823 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1824 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1825 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1826 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1827
1828 #define OPENCV_HAL_IMPL_WASM_SELECT(_Tpvec) \
1829 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1830 { \
1831 return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask.val)); \
1832 }
1833
1834OPENCV_HAL_IMPL_WASM_SELECT(v_uint8x16)
1835OPENCV_HAL_IMPL_WASM_SELECT(v_int8x16)
1836OPENCV_HAL_IMPL_WASM_SELECT(v_uint16x8)
1837OPENCV_HAL_IMPL_WASM_SELECT(v_int16x8)
1838OPENCV_HAL_IMPL_WASM_SELECT(v_uint32x4)
1839OPENCV_HAL_IMPL_WASM_SELECT(v_int32x4)
1840OPENCV_HAL_IMPL_WASM_SELECT(v_uint64x2)
1841OPENCV_HAL_IMPL_WASM_SELECT(v_int64x2)
1842OPENCV_HAL_IMPL_WASM_SELECT(v_float32x4)
1843OPENCV_HAL_IMPL_WASM_SELECT(v_float64x2)
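v_select picks lanes from a where the mask lane is all-ones and from b where it is all-zeros; comparisons produce exactly such masks. For example:

    v_int32x4 m = v_int32x4(1, 2, 3, 4) > v_setall_s32(2);             // 0, 0, ~0, ~0
    v_int32x4 r = v_select(m, v_setall_s32(100), v_setall_s32(-100));  // -100, -100, 100, 100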
1844
1845 #define OPENCV_HAL_IMPL_WASM_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
1846 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1847 { \
1848 b0.val = intrin(a.val); \
1849 b1.val = __CV_CAT(intrin, _high)(a.val); \
1850 } \
1851 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1852 { return _Tpwvec(intrin(a.val)); } \
1853 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1854 { return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \
1855 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1856 { \
1857 v128_t a = wasm_v128_load(ptr); \
1858 return _Tpwvec(intrin(a)); \
1859 }
1860
1861OPENCV_HAL_IMPL_WASM_EXPAND(v_uint8x16, v_uint16x8, uchar, v128_cvtu8x16_i16x8)
1862OPENCV_HAL_IMPL_WASM_EXPAND(v_int8x16, v_int16x8, schar, v128_cvti8x16_i16x8)
1863OPENCV_HAL_IMPL_WASM_EXPAND(v_uint16x8, v_uint32x4, ushort, v128_cvtu16x8_i32x4)
1864OPENCV_HAL_IMPL_WASM_EXPAND(v_int16x8, v_int32x4, short, v128_cvti16x8_i32x4)
1865OPENCV_HAL_IMPL_WASM_EXPAND(v_uint32x4, v_uint64x2, unsigned, v128_cvtu32x4_i64x2)
1866OPENCV_HAL_IMPL_WASM_EXPAND(v_int32x4, v_int64x2, int, v128_cvti32x4_i64x2)
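Expansion widens each lane to the next size without loss, the usual first step before accumulating bytes or shorts. For example:

    v_uint8x16 v = v_setall_u8(200);
    v_uint16x8 lo, hi;
    v_expand(v, lo, hi);  // all sixteen 200s, now 16-bit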
1867
1868 #define OPENCV_HAL_IMPL_WASM_EXPAND_Q(_Tpvec, _Tp, intrin) \
1869 inline _Tpvec v_load_expand_q(const _Tp* ptr) \
1870 { \
1871 v128_t a = wasm_v128_load(ptr); \
1872 return _Tpvec(intrin(a)); \
1873 }
1874
1875OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_uint32x4, uchar, v128_cvtu8x16_i32x4)
1876OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_int32x4, schar, v128_cvti8x16_i32x4)
1877
1878 #define OPENCV_HAL_IMPL_WASM_UNPACKS(_Tpvec, suffix) \
1879 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
1880 { \
1881 b0.val = wasm_unpacklo_##suffix(a0.val, a1.val); \
1882 b1.val = wasm_unpackhi_##suffix(a0.val, a1.val); \
1883 } \
1884 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1885 { \
1886 return _Tpvec(wasm_unpacklo_i64x2(a.val, b.val)); \
1887 } \
1888 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1889 { \
1890 return _Tpvec(wasm_unpackhi_i64x2(a.val, b.val)); \
1891 } \
1892 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1893 { \
1894 c.val = wasm_unpacklo_i64x2(a.val, b.val); \
1895 d.val = wasm_unpackhi_i64x2(a.val, b.val); \
1896 }
1897
1898OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint8x16, i8x16)
1899OPENCV_HAL_IMPL_WASM_UNPACKS(v_int8x16, i8x16)
1900OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint16x8, i16x8)
1901OPENCV_HAL_IMPL_WASM_UNPACKS(v_int16x8, i16x8)
1902OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint32x4, i32x4)
1903OPENCV_HAL_IMPL_WASM_UNPACKS(v_int32x4, i32x4)
1904OPENCV_HAL_IMPL_WASM_UNPACKS(v_float32x4, i32x4)
1905OPENCV_HAL_IMPL_WASM_UNPACKS(v_float64x2, i64x2)
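v_zip interleaves two registers lane by lane, while v_combine_low/high splice 64-bit halves. For example:

    v_int32x4 a(0, 1, 2, 3), b(10, 11, 12, 13), lo, hi;
    v_zip(a, b, lo, hi);  // lo = 0,10,1,11   hi = 2,12,3,13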
1906
1907 template<int s, typename _Tpvec>
1908 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
1909{
1910 return v_rotate_right<s>(a, b);
1911}
1912
1913 inline v_int32x4 v_round(const v_float32x4& a)
1914{
1915 v128_t h = wasm_f32x4_splat(0.5);
1916 return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(wasm_f32x4_add(a.val, h)));
1917}
1918
1919 inline v_int32x4 v_floor(const v_float32x4& a)
1920{
1921 v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
1922 v128_t mask = wasm_f32x4_lt(a.val, wasm_f32x4_convert_i32x4(a1));
1923 return v_int32x4(wasm_i32x4_add(a1, mask));
1924}
1925
1926 inline v_int32x4 v_ceil(const v_float32x4& a)
1927{
1928 v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
1929 v128_t mask = wasm_f32x4_gt(a.val, wasm_f32x4_convert_i32x4(a1));
1930 return v_int32x4(wasm_i32x4_sub(a1, mask));
1931}
1932
1933 inline v_int32x4 v_trunc(const v_float32x4& a)
1934{ return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(a.val)); }
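v_floor and v_ceil exploit the comparison result being all-ones (-1 as an integer) per matching lane: the truncated value is adjusted by adding or subtracting the mask. v_round adds 0.5 and truncates toward zero. For example:

    v_float32x4 v(1.5f, -1.5f, 2.3f, -2.3f);
    v_int32x4 f = v_floor(v);  // 1, -2, 2, -3
    v_int32x4 c = v_ceil(v);   // 2, -1, 3, -2
    v_int32x4 t = v_trunc(v);  // 1, -1, 2, -2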
1935
1936 #define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc) \
1937 inline v_int32x4 func(const v_float64x2& a) \
1938 { \
1939 double a_[2]; \
1940 wasm_v128_store(a_, a.val); \
1941 int c_[4]; \
1942 c_[0] = cfunc(a_[0]); \
1943 c_[1] = cfunc(a_[1]); \
1944 c_[2] = 0; \
1945 c_[3] = 0; \
1946 return v_int32x4(wasm_v128_load(c_)); \
1947 }
1948
1949OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound)
1950OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor)
1951OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil)
1952OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int)
1953
1954 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1955{
1956 double a_[2], b_[2];
1957 wasm_v128_store(a_, a.val);
1958 wasm_v128_store(b_, b.val);
1959 int c_[4];
1960 c_[0] = cvRound(a_[0]);
1961 c_[1] = cvRound(a_[1]);
1962 c_[2] = cvRound(b_[0]);
1963 c_[3] = cvRound(b_[1]);
1964 return v_int32x4(wasm_v128_load(c_));
1965}
1966
1967 #define OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(_Tpvec, suffix) \
1968 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1969 const _Tpvec& a2, const _Tpvec& a3, \
1970 _Tpvec& b0, _Tpvec& b1, \
1971 _Tpvec& b2, _Tpvec& b3) \
1972 { \
1973 v128_t t0 = wasm_unpacklo_##suffix(a0.val, a1.val); \
1974 v128_t t1 = wasm_unpacklo_##suffix(a2.val, a3.val); \
1975 v128_t t2 = wasm_unpackhi_##suffix(a0.val, a1.val); \
1976 v128_t t3 = wasm_unpackhi_##suffix(a2.val, a3.val); \
1977 \
1978 b0.val = wasm_unpacklo_i64x2(t0, t1); \
1979 b1.val = wasm_unpackhi_i64x2(t0, t1); \
1980 b2.val = wasm_unpacklo_i64x2(t2, t3); \
1981 b3.val = wasm_unpackhi_i64x2(t2, t3); \
1982 }
1983
1984OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_uint32x4, i32x4)
1985OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_int32x4, i32x4)
1986OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_float32x4, i32x4)
1987
1988 // load deinterleave
1989 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
1990{
1991 v128_t t00 = wasm_v128_load(ptr);
1992 v128_t t01 = wasm_v128_load(ptr + 16);
1993
1994 a.val = wasm_v8x16_shuffle(t00, t01, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30);
1995 b.val = wasm_v8x16_shuffle(t00, t01, 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31);
1996}
1997
1998 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
1999{
2000 v128_t t00 = wasm_v128_load(ptr);
2001 v128_t t01 = wasm_v128_load(ptr + 16);
2002 v128_t t02 = wasm_v128_load(ptr + 32);
2003
2004 v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,3,6,9,12,15,18,21,24,27,30,1,2,4,5,7);
2005 v128_t t11 = wasm_v8x16_shuffle(t00, t01, 1,4,7,10,13,16,19,22,25,28,31,0,2,3,5,6);
2006 v128_t t12 = wasm_v8x16_shuffle(t00, t01, 2,5,8,11,14,17,20,23,26,29,0,1,3,4,6,7);
2007
2008 a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29);
2009 b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30);
2010 c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31);
2011}
2012
2013 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
2014{
2015 v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
2016 v128_t u1 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ...
2017 v128_t u2 = wasm_v128_load(ptr + 32); // a8 b8 c8 d8 ...
2018 v128_t u3 = wasm_v128_load(ptr + 48); // a12 b12 c12 d12 ...
2019
2020 v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
2021 v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
2022 v128_t v2 = wasm_v8x16_shuffle(u0, u1, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
2023 v128_t v3 = wasm_v8x16_shuffle(u2, u3, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
2024
2025 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2026 b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2027 c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2028 d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2029}
2030
2031 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
2032{
2033 v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1 a2 b2 a3 b3
2034 v128_t v1 = wasm_v128_load(ptr + 8); // a4 b4 a5 b5 a6 b6 a7 b7
2035
2036 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29); // a0 a1 a2 a3 a4 a5 a6 a7
 2037 b.val = wasm_v8x16_shuffle(v0, v1, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31); // b0 b1 b2 b3 b4 b5 b6 b7
2038}
2039
2040 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
2041{
2042 v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1 b1 c1 a2 b2
2043 v128_t t01 = wasm_v128_load(ptr + 8); // c2 a3 b3 c3 a4 b4 c4 a5
2044 v128_t t02 = wasm_v128_load(ptr + 16); // b5 c5 a6 b6 c6 a7 b7 c7
2045
2046 v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,6,7,12,13,18,19,24,25,30,31,2,3,4,5);
2047 v128_t t11 = wasm_v8x16_shuffle(t00, t01, 2,3,8,9,14,15,20,21,26,27,0,1,4,5,6,7);
2048 v128_t t12 = wasm_v8x16_shuffle(t00, t01, 4,5,10,11,16,17,22,23,28,29,0,1,2,3,6,7);
2049
2050 a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,26,27);
2051 b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,16,17,22,23,28,29);
2052 c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,18,19,24,25,30,31);
2053}
2054
2055 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
2056{
2057 v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1
2058 v128_t u1 = wasm_v128_load(ptr + 8); // a2 b2 c2 d2 ...
2059 v128_t u2 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ...
2060 v128_t u3 = wasm_v128_load(ptr + 24); // a6 b6 c6 d6 ...
2061
2062 v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a0 a1 a2 a3 b0 b1 b2 b3
2063 v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a4 a5 a6 a7 b4 b5 b6 b7
2064 v128_t v2 = wasm_v8x16_shuffle(u0, u1, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c0 c1 c2 c3 d0 d1 d2 d3
2065 v128_t v3 = wasm_v8x16_shuffle(u2, u3, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c4 c5 c6 c7 d4 d5 d6 d7
2066
2067 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2068 b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2069 c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2070 d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2071}
2072
2073 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
2074{
2075 v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1
2076 v128_t v1 = wasm_v128_load(ptr + 4); // a2 b2 a3 b3
2077
2078 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3
2079 b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3
2080}
2081
2082 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
2083{
2084 v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1
2085 v128_t t01 = wasm_v128_load(ptr + 4); // b2 c2 a3 b3
2086 v128_t t02 = wasm_v128_load(ptr + 8); // c3 a4 b4 c4
2087
2088 v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
2089 v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
2090 v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
2091
2092 a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
2093 b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
2094 c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
2095}
2096
2097 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
2098{
2099 v_uint32x4 s0(wasm_v128_load(ptr)); // a0 b0 c0 d0
2100 v_uint32x4 s1(wasm_v128_load(ptr + 4)); // a1 b1 c1 d1
2101 v_uint32x4 s2(wasm_v128_load(ptr + 8)); // a2 b2 c2 d2
2102 v_uint32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3
2103
2104 v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2105}
2106
2107 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
2108{
2109 v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1
2110 v128_t v1 = wasm_v128_load((ptr + 4)); // a2 b2 a3 b3
2111
2112 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3
2113 b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3
2114}
2115
2116 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
2117{
2118 v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1
2119 v128_t t01 = wasm_v128_load(ptr + 4); // b2 c2 a3 b3
2120 v128_t t02 = wasm_v128_load(ptr + 8); // c3 a4 b4 c4
2121
2122 v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
2123 v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
2124 v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
2125
2126 a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
2127 b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
2128 c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
2129}
2130
2131 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
2132{
2133 v_float32x4 s0(wasm_v128_load(ptr)); // a0 b0 c0 d0
2134 v_float32x4 s1(wasm_v128_load(ptr + 4)); // a1 b1 c1 d1
2135 v_float32x4 s2(wasm_v128_load(ptr + 8)); // a2 b2 c2 d2
2136 v_float32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3
2137
2138 v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2139}
2140
2141 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
2142{
2143 v128_t t0 = wasm_v128_load(ptr); // a0 b0
2144 v128_t t1 = wasm_v128_load(ptr + 2); // a1 b1
2145
2146 a.val = wasm_unpacklo_i64x2(t0, t1);
2147 b.val = wasm_unpackhi_i64x2(t0, t1);
2148}
2149
2150 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
2151{
2152 v128_t t0 = wasm_v128_load(ptr); // a0, b0
2153 v128_t t1 = wasm_v128_load(ptr + 2); // c0, a1
2154 v128_t t2 = wasm_v128_load(ptr + 4); // b1, c1
2155
2156 a.val = wasm_v8x16_shuffle(t0, t1, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
2157 b.val = wasm_v8x16_shuffle(t0, t2, 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23);
2158 c.val = wasm_v8x16_shuffle(t1, t2, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
2159}
2160
2161 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
2162 v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
2163{
2164 v128_t t0 = wasm_v128_load(ptr); // a0 b0
2165 v128_t t1 = wasm_v128_load(ptr + 2); // c0 d0
2166 v128_t t2 = wasm_v128_load(ptr + 4); // a1 b1
2167 v128_t t3 = wasm_v128_load(ptr + 6); // c1 d1
2168
2169 a.val = wasm_unpacklo_i64x2(t0, t2);
2170 b.val = wasm_unpackhi_i64x2(t0, t2);
2171 c.val = wasm_unpacklo_i64x2(t1, t3);
2172 d.val = wasm_unpackhi_i64x2(t1, t3);
2173}
2174
2175 // store interleave
2176
2177 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2178 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2179{
2180 v128_t v0 = wasm_unpacklo_i8x16(a.val, b.val);
2181 v128_t v1 = wasm_unpackhi_i8x16(a.val, b.val);
2182
2183 wasm_v128_store(ptr, v0);
2184 wasm_v128_store(ptr + 16, v1);
2185}
2186
2187 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2188 const v_uint8x16& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2189{
2190 v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5);
2191 v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 21,0,6,22,0,7,23,0,8,24,0,9,25,0,10,26);
2192 v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0);
2193
2194 v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15);
2195 v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15);
2196 v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31);
2197
2198 wasm_v128_store(ptr, t10);
2199 wasm_v128_store(ptr + 16, t11);
2200 wasm_v128_store(ptr + 32, t12);
2201}
2202
2203 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2204 const v_uint8x16& c, const v_uint8x16& d,
2205 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2206{
2207 // a0 a1 a2 a3 ....
2208 // b0 b1 b2 b3 ....
2209 // c0 c1 c2 c3 ....
2210 // d0 d1 d2 d3 ....
2211 v128_t u0 = wasm_unpacklo_i8x16(a.val, c.val); // a0 c0 a1 c1 ...
2212 v128_t u1 = wasm_unpackhi_i8x16(a.val, c.val); // a8 c8 a9 c9 ...
2213 v128_t u2 = wasm_unpacklo_i8x16(b.val, d.val); // b0 d0 b1 d1 ...
2214 v128_t u3 = wasm_unpackhi_i8x16(b.val, d.val); // b8 d8 b9 d9 ...
2215
2216 v128_t v0 = wasm_unpacklo_i8x16(u0, u2); // a0 b0 c0 d0 ...
2217 v128_t v1 = wasm_unpackhi_i8x16(u0, u2); // a4 b4 c4 d4 ...
2218 v128_t v2 = wasm_unpacklo_i8x16(u1, u3); // a8 b8 c8 d8 ...
2219 v128_t v3 = wasm_unpackhi_i8x16(u1, u3); // a12 b12 c12 d12 ...
2220
2221 wasm_v128_store(ptr, v0);
2222 wasm_v128_store(ptr + 16, v1);
2223 wasm_v128_store(ptr + 32, v2);
2224 wasm_v128_store(ptr + 48, v3);
2225}
2226
2227 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2228 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2229{
2230 v128_t v0 = wasm_unpacklo_i16x8(a.val, b.val);
2231 v128_t v1 = wasm_unpackhi_i16x8(a.val, b.val);
2232
2233 wasm_v128_store(ptr, v0);
2234 wasm_v128_store(ptr + 8, v1);
2235}
2236
2237 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
2238 const v_uint16x8& b, const v_uint16x8& c,
2239 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2240{
2241 v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,16,17,0,0,2,3,18,19,0,0,4,5,20,21);
2242 v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 0,0,6,7,22,23,0,0,8,9,24,25,0,0,10,11);
2243 v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 26,27,0,0,12,13,28,29,0,0,14,15,30,31,0,0);
2244
2245 v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,16,17,6,7,8,9,18,19,12,13,14,15);
2246 v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 20,21,2,3,4,5,22,23,8,9,10,11,24,25,14,15);
2247 v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 0,1,26,27,4,5,6,7,28,29,10,11,12,13,30,31);
2248
2249 wasm_v128_store(ptr, t10);
2250 wasm_v128_store(ptr + 8, t11);
2251 wasm_v128_store(ptr + 16, t12);
2252}
2253
2254 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2255 const v_uint16x8& c, const v_uint16x8& d,
2256 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2257{
2258 // a0 a1 a2 a3 ....
2259 // b0 b1 b2 b3 ....
2260 // c0 c1 c2 c3 ....
2261 // d0 d1 d2 d3 ....
2262 v128_t u0 = wasm_unpacklo_i16x8(a.val, c.val); // a0 c0 a1 c1 ...
2263 v128_t u1 = wasm_unpackhi_i16x8(a.val, c.val); // a4 c4 a5 c5 ...
2264 v128_t u2 = wasm_unpacklo_i16x8(b.val, d.val); // b0 d0 b1 d1 ...
2265 v128_t u3 = wasm_unpackhi_i16x8(b.val, d.val); // b4 d4 b5 d5 ...
2266
2267 v128_t v0 = wasm_unpacklo_i16x8(u0, u2); // a0 b0 c0 d0 ...
2268 v128_t v1 = wasm_unpackhi_i16x8(u0, u2); // a2 b2 c2 d2 ...
2269 v128_t v2 = wasm_unpacklo_i16x8(u1, u3); // a4 b4 c4 d4 ...
2270 v128_t v3 = wasm_unpackhi_i16x8(u1, u3); // a6 b6 c6 d6 ...
2271
2272 wasm_v128_store(ptr, v0);
2273 wasm_v128_store(ptr + 8, v1);
2274 wasm_v128_store(ptr + 16, v2);
2275 wasm_v128_store(ptr + 24, v3);
2276}
2277
2278 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2279 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2280{
2281 v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val);
2282 v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val);
2283
2284 wasm_v128_store(ptr, v0);
2285 wasm_v128_store(ptr + 4, v1);
2286}
2287
2288 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2289 const v_uint32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2290{
2291 v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
2292 v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
2293 v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);
2294
2295 v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
2296 v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
2297 v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);
2298
2299 wasm_v128_store(ptr, t10);
2300 wasm_v128_store(ptr + 4, t11);
2301 wasm_v128_store(ptr + 8, t12);
2302}
2303
2304 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2305 const v_uint32x4& c, const v_uint32x4& d,
2306 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2307{
2308 v_uint32x4 v0, v1, v2, v3;
2309 v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2310
2311 wasm_v128_store(ptr, v0.val);
2312 wasm_v128_store(ptr + 4, v1.val);
2313 wasm_v128_store(ptr + 8, v2.val);
2314 wasm_v128_store(ptr + 12, v3.val);
2315}
2316
2317 // 2-channel, float only
2318 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2319 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2320{
2321 v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val);
2322 v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val);
2323
2324 wasm_v128_store(ptr, v0);
2325 wasm_v128_store(ptr + 4, v1);
2326}
2327
2328 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2329 const v_float32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2330{
2331 v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
2332 v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
2333 v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);
2334
2335 v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
2336 v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
2337 v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);
2338
2339 wasm_v128_store(ptr, t10);
2340 wasm_v128_store(ptr + 4, t11);
2341 wasm_v128_store(ptr + 8, t12);
2342}
2343
2344 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2345 const v_float32x4& c, const v_float32x4& d,
2346 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2347{
2348 v_float32x4 v0, v1, v2, v3;
2349 v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2350
2351 wasm_v128_store(ptr, v0.val);
2352 wasm_v128_store(ptr + 4, v1.val);
2353 wasm_v128_store(ptr + 8, v2.val);
2354 wasm_v128_store(ptr + 12, v3.val);
2355}
2356
2357 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2358 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2359{
2360 v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val);
2361 v128_t v1 = wasm_unpackhi_i64x2(a.val, b.val);
2362
2363 wasm_v128_store(ptr, v0);
2364 wasm_v128_store(ptr + 2, v1);
2365}
2366
2367 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2368 const v_uint64x2& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2369{
2370 v128_t v0 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2371 v128_t v1 = wasm_v8x16_shuffle(a.val, c.val, 16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15);
2372 v128_t v2 = wasm_v8x16_shuffle(b.val, c.val, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2373
2374 wasm_v128_store(ptr, v0);
2375 wasm_v128_store(ptr + 2, v1);
2376 wasm_v128_store(ptr + 4, v2);
2377}
2378
2379 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2380 const v_uint64x2& c, const v_uint64x2& d,
2381 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2382{
2383 v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val);
2384 v128_t v1 = wasm_unpacklo_i64x2(c.val, d.val);
2385 v128_t v2 = wasm_unpackhi_i64x2(a.val, b.val);
2386 v128_t v3 = wasm_unpackhi_i64x2(c.val, d.val);
2387
2388 wasm_v128_store(ptr, v0);
2389 wasm_v128_store(ptr + 2, v1);
2390 wasm_v128_store(ptr + 4, v2);
2391 wasm_v128_store(ptr + 6, v3);
2392}
2393
2394 #define OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2395 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2396 { \
2397 _Tpvec1 a1, b1; \
2398 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2399 a0 = v_reinterpret_as_##suffix0(a1); \
2400 b0 = v_reinterpret_as_##suffix0(b1); \
2401 } \
2402 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2403 { \
2404 _Tpvec1 a1, b1, c1; \
2405 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2406 a0 = v_reinterpret_as_##suffix0(a1); \
2407 b0 = v_reinterpret_as_##suffix0(b1); \
2408 c0 = v_reinterpret_as_##suffix0(c1); \
2409 } \
2410 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2411 { \
2412 _Tpvec1 a1, b1, c1, d1; \
2413 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2414 a0 = v_reinterpret_as_##suffix0(a1); \
2415 b0 = v_reinterpret_as_##suffix0(b1); \
2416 c0 = v_reinterpret_as_##suffix0(c1); \
2417 d0 = v_reinterpret_as_##suffix0(d1); \
2418 } \
2419 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2420 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2421 { \
2422 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2423 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2424 v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
2425 } \
2426 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2427 const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2428 { \
2429 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2430 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2431 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2432 v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
2433 } \
2434 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2435 const _Tpvec0& c0, const _Tpvec0& d0, \
2436 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2437 { \
2438 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2439 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2440 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2441 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2442 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
2443 }
2444
2445OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
2446OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
2447OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
2448OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
2449OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
2450
2451 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2452{
2453 return v_float32x4(wasm_f32x4_convert_i32x4(a.val));
2454}
2455
2456 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2457{
2458 double a_[2];
2459 wasm_v128_store(a_, a.val);
2460 float c_[4];
2461 c_[0] = (float)(a_[0]);
2462 c_[1] = (float)(a_[1]);
2463 c_[2] = 0;
2464 c_[3] = 0;
2465 return v_float32x4(wasm_v128_load(c_));
2466}
2467
2468 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2469{
2470 double a_[2], b_[2];
2471 wasm_v128_store(a_, a.val);
2472 wasm_v128_store(b_, b.val);
2473 float c_[4];
2474 c_[0] = (float)(a_[0]);
2475 c_[1] = (float)(a_[1]);
2476 c_[2] = (float)(b_[0]);
2477 c_[3] = (float)(b_[1]);
2478 return v_float32x4(wasm_v128_load(c_));
2479}
2480
2481 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
2482{
2483 #ifdef __wasm_unimplemented_simd128__
2484 v128_t p = v128_cvti32x4_i64x2(a.val);
2485 return v_float64x2(wasm_f64x2_convert_i64x2(p));
2486 #else
2487 int a_[4];
2488 wasm_v128_store(a_, a.val);
2489 double c_[2];
2490 c_[0] = (double)(a_[0]);
2491 c_[1] = (double)(a_[1]);
2492 return v_float64x2(wasm_v128_load(c_));
2493 #endif
2494}
2495
2496 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
2497{
2498 #ifdef __wasm_unimplemented_simd128__
2499 v128_t p = v128_cvti32x4_i64x2_high(a.val);
2500 return v_float64x2(wasm_f64x2_convert_i64x2(p));
2501 #else
2502 int a_[4];
2503 wasm_v128_store(a_, a.val);
2504 double c_[2];
2505 c_[0] = (double)(a_[2]);
2506 c_[1] = (double)(a_[3]);
2507 return v_float64x2(wasm_v128_load(c_));
2508 #endif
2509}
2510
2511 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
2512{
2513 float a_[4];
2514 wasm_v128_store(a_, a.val);
2515 double c_[2];
2516 c_[0] = (double)(a_[0]);
2517 c_[1] = (double)(a_[1]);
2518 return v_float64x2(wasm_v128_load(c_));
2519}
2520
2521 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
2522{
2523 float a_[4];
2524 wasm_v128_store(a_, a.val);
2525 double c_[2];
2526 c_[0] = (double)(a_[2]);
2527 c_[1] = (double)(a_[3]);
2528 return v_float64x2(wasm_v128_load(c_));
2529}
2530
2531 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
2532{
2533 #ifdef __wasm_unimplemented_simd128__
2534 return v_float64x2(wasm_f64x2_convert_i64x2(a.val));
2535 #else
2536 int64 a_[2];
2537 wasm_v128_store(a_, a.val);
2538 double c_[2];
2539 c_[0] = (double)(a_[0]);
2540 c_[1] = (double)(a_[1]);
2541 return v_float64x2(wasm_v128_load(c_));
2542 #endif
2543}
2544
2545 ////////////// Lookup table access ////////////////////
2546
2547 inline v_int8x16 v_lut(const schar* tab, const int* idx)
2548{
2549 return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
2550 tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
2551}
2552 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
2553{
2554 return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1], tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1],
2555 tab[idx[4]], tab[idx[4]+1], tab[idx[5]], tab[idx[5]+1], tab[idx[6]], tab[idx[6]+1], tab[idx[7]], tab[idx[7]+1]);
2556}
2557 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
2558{
2559 return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3], tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3],
2560 tab[idx[2]], tab[idx[2]+1], tab[idx[2]+2], tab[idx[2]+3], tab[idx[3]], tab[idx[3]+1], tab[idx[3]+2], tab[idx[3]+3]);
2561}
2562 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); }
2563 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
2564 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }
2565
2566 inline v_int16x8 v_lut(const short* tab, const int* idx)
2567{
2568 return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
2569 tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
2570}
2571 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
2572{
2573 return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1],
2574 tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1]);
2575}
2576 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
2577{
2578 return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3],
2579 tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3]);
2580}
2581 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
2582 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
2583 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
2584
2585 inline v_int32x4 v_lut(const int* tab, const int* idx)
2586{
2587 return v_int32x4(tab[idx[0]], tab[idx[1]],
2588 tab[idx[2]], tab[idx[3]]);
2589}
2590 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
2591{
2592 return v_int32x4(tab[idx[0]], tab[idx[0]+1],
2593 tab[idx[1]], tab[idx[1]+1]);
2594}
2595 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
2596{
2597 return v_int32x4(wasm_v128_load(tab + idx[0]));
2598}
2599 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
2600 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
2601 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
2602
2603 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
2604{
2605 return v_int64x2(tab[idx[0]], tab[idx[1]]);
2606}
2607 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
2608{
2609 return v_int64x2(wasm_v128_load(tab + idx[0]));
2610}
2611 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
2612 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
2613
2614 inline v_float32x4 v_lut(const float* tab, const int* idx)
2615{
2616 return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
2617}
2618 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); }
2619 inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }
2620
2621 inline v_float64x2 v_lut(const double* tab, const int* idx)
2622{
2623 return v_float64x2(tab[idx[0]], tab[idx[1]]);
2624}
2625 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
2626{
2627 return v_float64x2(wasm_v128_load(tab + idx[0]));
2628}
2629
2630 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
2631{
2632 return v_int32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2633 tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
2634 tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
2635 tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
2636}
2637
2638 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
2639{
2640 return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
2641}
2642
2643 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
2644{
2645 return v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2646 tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
2647 tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
2648 tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
2649}
2650
2651 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
2652{
2653 return v_float64x2(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2654 tab[wasm_i32x4_extract_lane(idxvec.val, 1)]);
2655}
2656
2657 // loads pairs from the table and deinterleaves them, e.g. returns:
 2658 // x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
 2659 // y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
 2660 // note that the indices are indices of individual floats, not of float pairs.
2661 // in theory, this function can be used to implement bilinear interpolation,
2662 // when idxvec are the offsets within the image.
2663 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
2664{
2665 x = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2666 tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
2667 tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
2668 tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
2669 y = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)+1],
2670 tab[wasm_i32x4_extract_lane(idxvec.val, 1)+1],
2671 tab[wasm_i32x4_extract_lane(idxvec.val, 2)+1],
2672 tab[wasm_i32x4_extract_lane(idxvec.val, 3)+1]);
2673}
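A sketch of the gather described above, with an interleaved (x, y) table:

    float tab[8] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};  // x0,y0, x1,y1, ...
    v_int32x4 idx(0, 2, 4, 6);  // indices of the x components
    v_float32x4 x, y;
    v_lut_deinterleave(tab, idx, x, y);  // x = 0,2,4,6   y = 1,3,5,7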
2674
2675 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
2676{
2677 v128_t xy0 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 0));
2678 v128_t xy1 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 1));
2679 x.val = wasm_unpacklo_i64x2(xy0, xy1);
 2680 y.val = wasm_unpackhi_i64x2(xy0, xy1);
2681}
2682
2683 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
2684{
2685 return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15));
2686}
2687 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
2688 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
2689{
2690 return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15));
2691}
2692 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
2693
2694 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
2695{
2696 return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15));
2697}
2698 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
2699 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
2700{
2701 return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15));
2702}
2703 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
2704
2705 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
2706{
2707 return v_int32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
2708}
2709 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2710 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
2711{
2712 return v_float32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
2713}
2714
2715 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
2716{
2717 return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,4,5,6,8,9,10,12,13,14,16,16,16,16));
2718}
2719 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
2720
2721 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
2722{
2723 return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,4,5,8,9,10,11,12,13,14,15,6,7));
2724}
2725 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
2726
2727 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
2728 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
2729 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
2730
2731 template<int i, typename _Tp>
2732 inline typename _Tp::lane_type v_extract_n(const _Tp& a)
2733{
2734 return v_rotate_right<i>(a).get0();
2735}
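Since the lane index is a template parameter, v_extract_n compiles down to a rotate plus a lane-0 read. For example:

    v_int32x4 v(10, 20, 30, 40);
    int third = v_extract_n<2>(v);  // 30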
2736
 2737 template<int i>
 2738 inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
 2739{
2740 return v_setall_u32(v_extract_n<i>(a));
2741}
2742 template<int i>
2743 inline v_int32x4 v_broadcast_element(const v_int32x4& a)
2744{
2745 return v_setall_s32(v_extract_n<i>(a));
2746}
 2747 template<int i>
 2748 inline v_float32x4 v_broadcast_element(const v_float32x4& a)
 2749{
2750 return v_setall_f32(v_extract_n<i>(a));
2751}
2752
2753
2754 ////////////// FP16 support ///////////////
2755
2756 inline v_float32x4 v_load_expand(const float16_t* ptr)
2757{
2758 float a[4];
2759 for (int i = 0; i < 4; i++)
2760 a[i] = ptr[i];
2761 return v_float32x4(wasm_v128_load(a));
2762}
2763
2764 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
2765{
 2766 float v_[4]; // f32x4 lanes; storing into double[4] would read garbage
2767 wasm_v128_store(v_, v.val);
2768 ptr[0] = float16_t(v_[0]);
2769 ptr[1] = float16_t(v_[1]);
2770 ptr[2] = float16_t(v_[2]);
2771 ptr[3] = float16_t(v_[3]);
2772}
2773
2774 inline void v_cleanup() {}
2775
2776CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2777
2778 //! @endcond
2779
2780}
2781
2782 #endif
Definition: intrin_cpp.hpp:2416
CV_INLINE v_reg< double, n/2 > v_cvt_f64(const v_reg< int, n > &a)
Convert lower half to double
Definition: intrin_cpp.hpp:2576
void v_expand(const v_reg< _Tp, n > &a, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &b0, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &b1)
Expand values to the wider pack type
Definition: intrin_cpp.hpp:1477
v_reg< uchar, 2 *n > v_pack_b(const v_reg< ushort, n > &a, const v_reg< ushort, n > &b)
! For 16-bit boolean values
Definition: intrin_cpp.hpp:3114
v_reg< _Tp, n > v_fma(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, const v_reg< _Tp, n > &c)
Multiply and add
Definition: intrin_cpp.hpp:1049
void v_store_interleave(_Tp *ptr, const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, hal::StoreMode=hal::STORE_UNALIGNED)
Interleave and store (2 channels)
Definition: intrin_cpp.hpp:2118
void v_transpose4x4(v_reg< _Tp, n > &a0, const v_reg< _Tp, n > &a1, const v_reg< _Tp, n > &a2, const v_reg< _Tp, n > &a3, v_reg< _Tp, n > &b0, v_reg< _Tp, n > &b1, v_reg< _Tp, n > &b2, v_reg< _Tp, n > &b3)
Transpose 4x4 matrix
Definition: intrin_cpp.hpp:2764
v_reg< _Tp, n > v_absdiffs(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Saturating absolute difference
Definition: intrin_cpp.hpp:997
v_reg< uint64, 2 > v_uint64x2
Two 64-bit unsigned integer values
Definition: intrin_cpp.hpp:504
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_dotprod_fast(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Fast Dot product of elements
Definition: intrin_cpp.hpp:1119
v_reg< _Tp, n > v_mul_hi(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Multiply and extract high part
Definition: intrin_cpp.hpp:1236
v_reg< float, 4 > v_float32x4
Four 32-bit floating point values (single precision)
Definition: intrin_cpp.hpp:500
v_reg< float, n > v_cvt_f32(const v_reg< int, n > &a)
Convert to float
Definition: intrin_cpp.hpp:2537
v_reg< float, n > v_matmuladd(const v_reg< float, n > &v, const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Matrix multiplication and add
Definition: intrin_cpp.hpp:3226
_Tp v_extract_n(const v_reg< _Tp, n > &v)
Vector extract
Definition: intrin_cpp.hpp:2400
v_reg< float, n > v_not_nan(const v_reg< float, n > &a)
Less-than comparison
Definition: intrin_cpp.hpp:893
v_reg< typename V_TypeTraits< _Tp >::abs_type, n > v_popcount(const v_reg< _Tp, n > &a)
Count the 1 bits in the vector lanes and return result as corresponding unsigned type
Definition: intrin_cpp.hpp:827
v_reg< short, 8 > v_int16x8
Eight 16-bit signed integer values
Definition: intrin_cpp.hpp:494
v_reg< double, 2 > v_float64x2
Two 64-bit floating point values (double precision)
Definition: intrin_cpp.hpp:502
v_reg< _Tp, n > v_extract(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Vector extract
Definition: intrin_cpp.hpp:2374
void v_load_deinterleave(const _Tp *ptr, v_reg< _Tp, n > &a, v_reg< _Tp, n > &b)
Load and deinterleave (2 channels)
Definition: intrin_cpp.hpp:2046
CV_INLINE int cvRound(double value)
Rounds floating-point number to the nearest integer
Definition: fast_math.hpp:200
CV_INLINE int cvCeil(double value)
Rounds floating-point number to the nearest integer not smaller than the original.
Definition: fast_math.hpp:254
CV_INLINE int cvFloor(double value)
Rounds floating-point number to the nearest integer not larger than the original.
Definition: fast_math.hpp:234
cv
"black box" representation of the file storage associated with a file on disk.
Definition: aruco.hpp:75