OpenCV 4.5.3 (Japanese machine translation)
intrin_neon.hpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
24 //
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
28 //
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44
45 #ifndef OPENCV_HAL_INTRIN_NEON_HPP
46 #define OPENCV_HAL_INTRIN_NEON_HPP
47
48 #include <algorithm>
49 #include "opencv2/core/utility.hpp"
50
51 namespace cv
52{
53
55
56CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
57
58 #define CV_SIMD128 1
59 #if defined(__aarch64__) || defined(_M_ARM64)
60 #define CV_SIMD128_64F 1
61 #else
62 #define CV_SIMD128_64F 0
63 #endif
64
65 // The following macro checks if the code is being compiled for the
66 // AArch64 execution state of Armv8, to enable the 128-bit
67 // intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
68 // the Arm C Language Extension (ACLE) specifications [1] to check the
69 // availability of 128-bit intrinsics, and it is supported by clang
70 // and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
71 // Visual Studio [2] .
72 //
73 // [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
74 // [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
75 #if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
76 #define CV_NEON_AARCH64 1
77 #else
78 #define CV_NEON_AARCH64 0
79 #endif
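The same compile-time switch selects between AArch64-only across-vector intrinsics and pairwise fallbacks throughout this header. As a minimal sketch (not part of the header; the function name is illustrative), a horizontal add of a uint32x4_t written against CV_NEON_AARCH64 looks like this, mirroring the reductions further down in this file (e.g. v_reduce_sum, v_reduce_sad):

    #include <arm_neon.h>

    // Assumes CV_NEON_AARCH64 has been defined as above.
    static inline unsigned horizontal_add_u32(uint32x4_t v)
    {
    #if CV_NEON_AARCH64
        return vaddvq_u32(v);                         // across-vector add, A64 only
    #else
        uint32x2_t t = vpadd_u32(vget_low_u32(v), vget_high_u32(v)); // pairwise add
        return vget_lane_u32(vpadd_u32(t, t), 0);     // add remaining pair, read lane 0
    #endif
    }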
80
81 // TODO
82 #define CV_NEON_DOT 0
83
85
86 #if CV_SIMD128_64F
87 #define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
88 inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
89 { c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); }
90 #define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
91 inline void _v128_unzip(const _Tpv&a, const _Tpv&b, _Tpv& c, _Tpv& d) \
92 { c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); }
93 #else
94 #define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
95 inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
96 { _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
97 #define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
98 inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
99 { _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
100 #endif
101
102 #if CV_SIMD128_64F
103 #define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
104 template <typename T> static inline \
105 _Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
106 template <typename T> static inline \
107 float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
108 #else
109 #define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix)
110 #endif
111
112 #define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) \
113 OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \
114 OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \
115 OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
116
117 #define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) \
118 OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
119
120 #define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) \
121 OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix)
122
123OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8, u8)
124OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16, int8x8, s8)
125OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
126OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16)
127OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
128OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32)
129OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
130OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
131OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64)
132 #if CV_SIMD128_64F
133OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64)
134 #endif
135
137
138 struct v_uint8x16
139{
140 typedef uchar lane_type;
141 enum { nlanes = 16 };
142
143 v_uint8x16() {}
144 explicit v_uint8x16(uint8x16_t v) : val(v) {}
145 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
146 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
147 {
148 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
149 val = vld1q_u8(v);
150 }
151 uchar get0() const
152 {
153 return vgetq_lane_u8(val, 0);
154 }
155
156 uint8x16_t val;
157};
158
159 struct v_int8x16
160{
161 typedef schar lane_type;
162 enum { nlanes = 16 };
163
164 v_int8x16() {}
165 explicit v_int8x16(int8x16_t v) : val(v) {}
166 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
167 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
168 {
169 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
170 val = vld1q_s8(v);
171 }
172 schar get0() const
173 {
174 return vgetq_lane_s8(val, 0);
175 }
176
177 int8x16_t val;
178};
179
180 struct v_uint16x8
181{
182 typedef ushort lane_type;
183 enum { nlanes = 8 };
184
185 v_uint16x8() {}
186 explicit v_uint16x8(uint16x8_t v) : val(v) {}
187 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
188 {
189 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
190 val = vld1q_u16(v);
191 }
192 ushort get0() const
193 {
194 return vgetq_lane_u16(val, 0);
195 }
196
197 uint16x8_t val;
198};
199
200 struct v_int16x8
201{
202 typedef short lane_type;
203 enum { nlanes = 8 };
204
205 v_int16x8() {}
206 explicit v_int16x8(int16x8_t v) : val(v) {}
207 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
208 {
209 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
210 val = vld1q_s16(v);
211 }
212 short get0() const
213 {
214 return vgetq_lane_s16(val, 0);
215 }
216
217 int16x8_t val;
218};
219
220 struct v_uint32x4
221{
222 typedef unsigned lane_type;
223 enum { nlanes = 4 };
224
225 v_uint32x4() {}
226 explicit v_uint32x4(uint32x4_t v) : val(v) {}
227 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
228 {
229 unsigned v[] = {v0, v1, v2, v3};
230 val = vld1q_u32(v);
231 }
232 unsigned get0() const
233 {
234 return vgetq_lane_u32(val, 0);
235 }
236
237 uint32x4_t val;
238};
239
240 struct v_int32x4
241{
242 typedef int lane_type;
243 enum { nlanes = 4 };
244
245 v_int32x4() {}
246 explicit v_int32x4(int32x4_t v) : val(v) {}
247 v_int32x4(int v0, int v1, int v2, int v3)
248 {
249 int v[] = {v0, v1, v2, v3};
250 val = vld1q_s32(v);
251 }
252 int get0() const
253 {
254 return vgetq_lane_s32(val, 0);
255 }
256 int32x4_t val;
257};
258
259 struct v_float32x4
260{
261 typedef float lane_type;
262 enum { nlanes = 4 };
263
264 v_float32x4() {}
265 explicit v_float32x4(float32x4_t v) : val(v) {}
266 v_float32x4(float v0, float v1, float v2, float v3)
267 {
268 float v[] = {v0, v1, v2, v3};
269 val = vld1q_f32(v);
270 }
271 float get0() const
272 {
273 return vgetq_lane_f32(val, 0);
274 }
275 float32x4_t val;
276};
277
278 struct v_uint64x2
279{
280 typedef uint64 lane_type;
281 enum { nlanes = 2 };
282
283 v_uint64x2() {}
284 explicit v_uint64x2(uint64x2_t v) : val(v) {}
285 v_uint64x2(uint64 v0, uint64 v1)
286 {
287 uint64 v[] = {v0, v1};
288 val = vld1q_u64(v);
289 }
290 uint64 get0() const
291 {
292 return vgetq_lane_u64(val, 0);
293 }
294 uint64x2_t val;
295};
296
297 struct v_int64x2
298{
299 typedef int64 lane_type;
300 enum { nlanes = 2 };
301
302 v_int64x2() {}
303 explicit v_int64x2(int64x2_t v) : val(v) {}
304 v_int64x2(int64 v0, int64 v1)
305 {
306 int64 v[] = {v0, v1};
307 val = vld1q_s64(v);
308 }
309 int64 get0() const
310 {
311 return vgetq_lane_s64(val, 0);
312 }
313 int64x2_t val;
314};
315
316 #if CV_SIMD128_64F
317 struct v_float64x2
318{
319 typedef double lane_type;
320 enum { nlanes = 2 };
321
322 v_float64x2() {}
323 explicit v_float64x2(float64x2_t v) : val(v) {}
324 v_float64x2(double v0, double v1)
325 {
326 double v[] = {v0, v1};
327 val = vld1q_f64(v);
328 }
329 double get0() const
330 {
331 return vgetq_lane_f64(val, 0);
332 }
333 float64x2_t val;
334};
335 #endif
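A minimal usage sketch for the register wrappers above (assuming this header is reached through opencv2/core/hal/intrin.hpp with CV_SIMD128 enabled; the function name is illustrative): the lane-wise constructors copy the arguments into a temporary array and load it with vld1q_*, and get0() reads lane 0.

    #include <opencv2/core/hal/intrin.hpp>

    static float first_lane_demo()
    {
        cv::v_float32x4 a(1.f, 2.f, 3.f, 4.f);  // vld1q_f32 on {1, 2, 3, 4}
        return a.get0();                        // vgetq_lane_f32(a.val, 0) -> 1.f
    }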
336
337 #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
338 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
339 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
340 inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
341 inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
342 inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
343 inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
344 inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
345 inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
346 inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
347 inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
348 inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
349 inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
350
351OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
352OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
353OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
354OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
355OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
356OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
357OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
358OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
359OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
360 #if CV_SIMD128_64F
361 #define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
362 inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
363OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
364OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
365OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
366OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
367OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
368OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
369OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
370OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
371OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
372OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
373OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
374 #endif
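A brief sketch of the initializers and reinterpret casts generated above (same assumptions as the previous sketch): v_setzero_*/v_setall_* broadcast a scalar with vdupq_n_*, and v_reinterpret_as_* changes only the type of the 128-bit register, not its bits.

    static void init_demo()
    {
        cv::v_int32x4   ones = cv::v_setall_s32(1);             // vdupq_n_s32(1)
        cv::v_uint8x16  zero = cv::v_setzero_u8();              // vdupq_n_u8(0)
        cv::v_float32x4 bits = cv::v_reinterpret_as_f32(ones);  // same 128 bits, viewed as f32
        (void)zero; (void)bits;
    }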
375
376 #define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
377 inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
378 { \
379 hreg a1 = mov(a.val), b1 = mov(b.val); \
380 return _Tpvec(vcombine_##suffix(a1, b1)); \
381 } \
382 inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
383 { \
384 hreg a1 = mov(a.val); \
385 vst1_##suffix(ptr, a1); \
386 } \
387 template<int n> inline \
388 _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
389 { \
390 hreg a1 = rshr(a.val, n); \
391 hreg b1 = rshr(b.val, n); \
392 return _Tpvec(vcombine_##suffix(a1, b1)); \
393 } \
394 template<int n> inline \
395 void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
396 { \
397 hreg a1 = rshr(a.val, n); \
398 vst1_##suffix(ptr, a1); \
399 }
400
401OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16)
402OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16)
403OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32)
404OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
405OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64)
406OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)
407
408OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
409OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
410
411 // pack boolean
412 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
413{
414 uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val));
415 return v_uint8x16(ab);
416}
417
418 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
419 const v_uint32x4& c, const v_uint32x4& d)
420{
421 uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val));
422 uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val));
423 return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd)));
424}
425
426 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
427 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
428 const v_uint64x2& g, const v_uint64x2& h)
429{
430 uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val));
431 uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val));
432 uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val));
433 uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val));
434
435 uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd));
436 uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh));
437 return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh)));
438}
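A short sketch of the pack family generated above (illustrative function): v_pack narrows two wide vectors into one, saturating for the 8/16-bit variants; v_rshr_pack<n> applies a rounding right shift by n before narrowing; and the v_pack_b overloads collapse wide comparison masks into a single byte mask.

    static cv::v_uint8x16 pack_demo(const cv::v_uint16x8& lo, const cv::v_uint16x8& hi)
    {
        cv::v_uint8x16 packed = cv::v_pack(lo, hi);         // vqmovn_u16: clamps each lane to [0,255]
        cv::v_uint8x16 scaled = cv::v_rshr_pack<4>(lo, hi); // vqrshrn_n_u16: >>4 with rounding, then clamp
        (void)scaled;
        return packed;
    }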
439
440 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
441 const v_float32x4& m1, const v_float32x4& m2,
442 const v_float32x4& m3)
443{
444 float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
445 float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
446 res = vmlaq_lane_f32(res, m1.val, vl, 1);
447 res = vmlaq_lane_f32(res, m2.val, vh, 0);
448 res = vmlaq_lane_f32(res, m3.val, vh, 1);
449 return v_float32x4(res);
450}
451
452 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
453 const v_float32x4& m1, const v_float32x4& m2,
454 const v_float32x4& a)
455{
456 float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
457 float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
458 res = vmlaq_lane_f32(res, m1.val, vl, 1);
459 res = vmlaq_lane_f32(res, m2.val, vh, 0);
460 res = vaddq_f32(res, a.val);
461 return v_float32x4(res);
462}
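For reference (not part of the header), the two routines above compute, per output lane i:

    v_matmul:    res[i] = v[0]*m0[i] + v[1]*m1[i] + v[2]*m2[i] + v[3]*m3[i]
    v_matmuladd: res[i] = v[0]*m0[i] + v[1]*m1[i] + v[2]*m2[i] + a[i]

that is, m0..m3 act as the columns of a 4x4 matrix applied to v, and vmulq_lane_f32 / vmlaq_lane_f32 broadcast one lane of v per multiply-accumulate.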
463
464 #define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
465 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
466 { \
467 return _Tpvec(intrin(a.val, b.val)); \
468 } \
469 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
470 { \
471 a.val = intrin(a.val, b.val); \
472 return a; \
473 }
474
475OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
476OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
477OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
478OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
479OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
480OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
481OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
482OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
483OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
484OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
485OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
486OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
487OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
488OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
489OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
490OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
491OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
492OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
493OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
494OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
495OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
496 #if CV_SIMD128_64F
497OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
498OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
499OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
500OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
501OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
502 #else
503 inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
504{
505 float32x4_t reciprocal = vrecpeq_f32(b.val);
506 reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
507 reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
508 return v_float32x4(vmulq_f32(a.val, reciprocal));
509}
510 inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
511{
512 float32x4_t reciprocal = vrecpeq_f32(b.val);
513 reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
514 reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
515 a.val = vmulq_f32(a.val, reciprocal);
516 return a;
517}
518 #endif
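On 32-bit Arm there is no vdivq_f32, so the fallback operators above start from the coarse reciprocal estimate vrecpeq_f32 and refine it with two Newton-Raphson steps: vrecpsq_f32(b, r) returns (2 - b*r), so each step computes r' = r*(2 - b*r). A scalar model of the refinement (illustration only; not part of the header):

    static float reciprocal_model(float b)
    {
        float r = 1.0f / b;          // stand-in for the vrecpeq_f32 estimate
        r = r * (2.0f - b * r);      // first refinement step: r * vrecps(b, r)
        r = r * (2.0f - b * r);      // second step; accuracy roughly doubles each time
        return r;                    // a / b is then evaluated as a * r
    }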
519
520 // saturating multiply 8-bit, 16-bit
521 #define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \
522 inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
523 { \
524 _Tpwvec c, d; \
525 v_mul_expand(a, b, c, d); \
526 return v_pack(c, d); \
527 } \
528 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
529 { a = a * b; return a; }
530
531OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8)
532OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
533OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8, v_int32x4)
534OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4)
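As the macro shows, there is no saturating NEON multiply for 8- and 16-bit lanes, so operator* widens with v_mul_expand (defined just below) and narrows back with the saturating v_pack. A quick sketch of the visible effect (illustrative function):

    static cv::v_uint8x16 saturating_mul_demo()
    {
        cv::v_uint8x16 a = cv::v_setall_u8(100);
        cv::v_uint8x16 b = cv::v_setall_u8(3);
        return a * b;   // 300 does not fit in uchar, so every lane saturates to 255
    }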
535
536 // Multiply and expand
537 inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
538 v_int16x8& c, v_int16x8& d)
539{
540 c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
541 #if CV_NEON_AARCH64
542 d.val = vmull_high_s8(a.val, b.val);
543 #else // #if CV_NEON_AARCH64
544 d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
545 #endif // #if CV_NEON_AARCH64
546}
547
548 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
549 v_uint16x8& c, v_uint16x8& d)
550{
551 c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
552 #if CV_NEON_AARCH64
553 d.val = vmull_high_u8(a.val, b.val);
554 #else // #if CV_NEON_AARCH64
555 d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
556 #endif // #if CV_NEON_AARCH64
557}
558
559 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
560 v_int32x4& c, v_int32x4& d)
561{
562 c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
563 #if CV_NEON_AARCH64
564 d.val = vmull_high_s16(a.val, b.val);
565 #else // #if CV_NEON_AARCH64
566 d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
567 #endif // #if CV_NEON_AARCH64
568}
569
570 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
571 v_uint32x4& c, v_uint32x4& d)
572{
573 c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
574 #if CV_NEON_AARCH64
575 d.val = vmull_high_u16(a.val, b.val);
576 #else // #if CV_NEON_AARCH64
577 d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
578 #endif // #if CV_NEON_AARCH64
579}
580
581 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
582 v_uint64x2& c, v_uint64x2& d)
583{
584 c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
585 #if CV_NEON_AARCH64
586 d.val = vmull_high_u32(a.val, b.val);
587 #else // #if CV_NEON_AARCH64
588 d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
589 #endif // #if CV_NEON_AARCH64
590}
591
592 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
593{
594 return v_int16x8(vcombine_s16(
595 vshrn_n_s32(vmull_s16( vget_low_s16(a.val), vget_low_s16(b.val)), 16),
596 vshrn_n_s32(
597#if CV_NEON_AARCH64
598 vmull_high_s16(a.val, b.val)
599#else // #if CV_NEON_AARCH64
600 vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val))
601#endif // #if CV_NEON_AARCH64
602 , 16)
603 ));
604}
605 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
606{
607 return v_uint16x8(vcombine_u16(
608 vshrn_n_u32(vmull_u16( vget_low_u16(a.val), vget_low_u16(b.val)), 16),
609 vshrn_n_u32(
610#if CV_NEON_AARCH64
611 vmull_high_u16(a.val, b.val)
612#else // #if CV_NEON_AARCH64
613 vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val))
614#endif // #if CV_NEON_AARCH64
615 , 16)
616 ));
617}
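v_mul_hi above keeps only the high 16 bits of each 32-bit product: both halves are widened with vmull_*16, shifted right by 16 and narrowed with vshrn_n_*32, then recombined. Scalar reference for one signed lane (illustration only):

    static short mul_hi_scalar(short a, short b)
    {
        return (short)(((int)a * (int)b) >> 16);   // the value v_mul_hi produces per lane
    }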
618
620
621 // 16 >> 32
622 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
623{
624 int16x8_t uzp1, uzp2;
625 _v128_unzip(a.val, b.val, uzp1, uzp2);
626 int16x4_t a0 = vget_low_s16(uzp1);
627 int16x4_t b0 = vget_high_s16(uzp1);
628 int16x4_t a1 = vget_low_s16(uzp2);
629 int16x4_t b1 = vget_high_s16(uzp2);
630 int32x4_t p = vmull_s16(a0, b0);
631 return v_int32x4(vmlal_s16(p, a1, b1));
632}
633 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
634{
635 int16x8_t uzp1, uzp2;
636 _v128_unzip(a.val, b.val, uzp1, uzp2);
637 int16x4_t a0 = vget_low_s16(uzp1);
638 int16x4_t b0 = vget_high_s16(uzp1);
639 int16x4_t a1 = vget_low_s16(uzp2);
640 int16x4_t b1 = vget_high_s16(uzp2);
641 int32x4_t p = vmlal_s16(c.val, a0, b0);
642 return v_int32x4(vmlal_s16(p, a1, b1));
643}
644
645 // 32 >> 64
646 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
647{
648 int32x4_t uzp1, uzp2;
649 _v128_unzip(a.val, b.val, uzp1, uzp2);
650 int32x2_t a0 = vget_low_s32(uzp1);
651 int32x2_t b0 = vget_high_s32(uzp1);
652 int32x2_t a1 = vget_low_s32(uzp2);
653 int32x2_t b1 = vget_high_s32(uzp2);
654 int64x2_t p = vmull_s32(a0, b0);
655 return v_int64x2(vmlal_s32(p, a1, b1));
656}
657 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
658{
659 int32x4_t uzp1, uzp2;
660 _v128_unzip(a.val, b.val, uzp1, uzp2);
661 int32x2_t a0 = vget_low_s32(uzp1);
662 int32x2_t b0 = vget_high_s32(uzp1);
663 int32x2_t a1 = vget_low_s32(uzp2);
664 int32x2_t b1 = vget_high_s32(uzp2);
665 int64x2_t p = vmlal_s32(c.val, a0, b0);
666 return v_int64x2(vmlal_s32(p, a1, b1));
667}
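Both v_dotprod overloads above pair adjacent lanes: after the vuzp de-interleave, lane i of the result is a[2i]*b[2i] + a[2i+1]*b[2i+1], computed at double width, plus c[i] in the accumulating form. Scalar reference for the 16 -> 32 case (illustration only):

    static void dotprod_s16_scalar(const short a[8], const short b[8], int res[4])
    {
        for (int i = 0; i < 4; i++)
            res[i] = (int)a[2*i] * b[2*i] + (int)a[2*i + 1] * b[2*i + 1];
    }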
668
669 // 8 >> 32
670 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
671{
672 #if CV_NEON_DOT
673 return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
674 #else
675 const uint8x16_t zero = vreinterpretq_u8_u32(vdupq_n_u32(0));
676 const uint8x16_t mask = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
677 const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
678 const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));
679
680 uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)),
681 vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero)));
682 uint16x8_t odd = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8),
683 vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8));
684
685 uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)),
686 vreinterpretq_u32_u16(vbslq_u16(mask32, odd, zero32)));
687 uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
688 vshrq_n_u32(vreinterpretq_u32_u16(odd), 16));
689 return v_uint32x4(vaddq_u32(s0, s1));
690 #endif
691}
692 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
693 const v_uint32x4& c)
694{
695 #if CV_NEON_DOT
696 return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
697 #else
698 return v_dotprod_expand(a, b) + c;
699 #endif
700}
701
702 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
703{
704 #if CV_NEON_DOT
705 return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
706 #else
707 int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
708 int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
709 int16x8_t uzp1, uzp2;
710 _v128_unzip(p0, p1, uzp1, uzp2);
711 int16x8_t sum = vaddq_s16(uzp1, uzp2);
712 int16x4_t uzpl1, uzpl2;
713 _v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
714 return v_int32x4(vaddl_s16(uzpl1, uzpl2));
715 #endif
716}
717 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
718 const v_int32x4& c)
719{
720 #if CV_NEON_DOT
721 return v_int32x4(vdotq_s32(c.val, a.val, b.val));
722 #else
723 return v_dotprod_expand(a, b) + c;
724 #endif
725}
726
727 // 16 >> 64
728 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
729{
730 const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0));
731 const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));
732
733 uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)),
734 vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero)));
735 uint32x4_t odd = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16),
736 vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16));
737 uint32x4_t uzp1, uzp2;
738 _v128_unzip(even, odd, uzp1, uzp2);
739 uint64x2_t s0 = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1));
740 uint64x2_t s1 = vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2));
741 return v_uint64x2(vaddq_u64(s0, s1));
742}
743 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
744{ return v_dotprod_expand(a, b) + c; }
745
746 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
747{
748 int32x4_t p0 = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
749 int32x4_t p1 = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
750
751 int32x4_t uzp1, uzp2;
752 _v128_unzip(p0, p1, uzp1, uzp2);
753 int32x4_t sum = vaddq_s32(uzp1, uzp2);
754
755 int32x2_t uzpl1, uzpl2;
756 _v128_unzip(vget_low_s32(sum), vget_high_s32(sum), uzpl1, uzpl2);
757 return v_int64x2(vaddl_s32(uzpl1, uzpl2));
758}
759 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
760 const v_int64x2& c)
761{ return v_dotprod_expand(a, b) + c; }
762
763 // 32 >> 64f
764 #if CV_SIMD128_64F
765 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
766{ return v_cvt_f64(v_dotprod(a, b)); }
767 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
768 const v_float64x2& c)
769{ return v_dotprod_expand(a, b) + c; }
770 #endif
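The v_dotprod_expand family widens by a factor of four: in the 8 -> 32 forms, lane i of the result is the sum of four adjacent products a[4i]*b[4i] + ... + a[4i+3]*b[4i+3], which is exactly what the optional vdotq_* instruction computes when CV_NEON_DOT is enabled; the 16 -> 64 forms accumulate four 16-bit products per 64-bit lane in the same way. Scalar reference for the unsigned 8 -> 32 case (illustration only):

    static void dotprod_expand_u8_scalar(const unsigned char a[16], const unsigned char b[16],
                                         unsigned res[4])
    {
        for (int i = 0; i < 4; i++)
            res[i] = (unsigned)a[4*i + 0] * b[4*i + 0] + (unsigned)a[4*i + 1] * b[4*i + 1]
                   + (unsigned)a[4*i + 2] * b[4*i + 2] + (unsigned)a[4*i + 3] * b[4*i + 3];
    }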
771
773
774 // 16 >> 32
775 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
776{
777 #if CV_NEON_AARCH64
778 int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
779 return v_int32x4(vmlal_high_s16(p, a.val, b.val));
780 #else
781 int16x4_t a0 = vget_low_s16(a.val);
782 int16x4_t a1 = vget_high_s16(a.val);
783 int16x4_t b0 = vget_low_s16(b.val);
784 int16x4_t b1 = vget_high_s16(b.val);
785 int32x4_t p = vmull_s16(a0, b0);
786 return v_int32x4(vmlal_s16(p, a1, b1));
787 #endif
788}
789 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
790{
791 #if CV_NEON_AARCH64
792 int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
793 return v_int32x4(vmlal_high_s16(p, a.val, b.val));
794 #else
795 int16x4_t a0 = vget_low_s16(a.val);
796 int16x4_t a1 = vget_high_s16(a.val);
797 int16x4_t b0 = vget_low_s16(b.val);
798 int16x4_t b1 = vget_high_s16(b.val);
799 int32x4_t p = vmlal_s16(c.val, a0, b0);
800 return v_int32x4(vmlal_s16(p, a1, b1));
801 #endif
802}
803
804 // 32 >> 64
805 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
806{
807 #if CV_NEON_AARCH64
808 int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
809 return v_int64x2(vmlal_high_s32(p, a.val, b.val));
810 #else
811 int32x2_t a0 = vget_low_s32(a.val);
812 int32x2_t a1 = vget_high_s32(a.val);
813 int32x2_t b0 = vget_low_s32(b.val);
814 int32x2_t b1 = vget_high_s32(b.val);
815 int64x2_t p = vmull_s32(a0, b0);
816 return v_int64x2(vmlal_s32(p, a1, b1));
817 #endif
818}
819 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
820{
821 #if CV_NEON_AARCH64
822 int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
823 return v_int64x2(vmlal_high_s32(p, a.val, b.val));
824 #else
825 int32x2_t a0 = vget_low_s32(a.val);
826 int32x2_t a1 = vget_high_s32(a.val);
827 int32x2_t b0 = vget_low_s32(b.val);
828 int32x2_t b1 = vget_high_s32(b.val);
829 int64x2_t p = vmlal_s32(c.val, a0, b0);
830 return v_int64x2(vmlal_s32(p, a1, b1));
831 #endif
832}
833
834 // 8 >> 32
835 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
836{
837 #if CV_NEON_DOT
838 return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
839 #else
840 uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
841 uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
842 uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
843 uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
844 return v_uint32x4(vaddq_u32(s0, s1));
845 #endif
846}
847 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
848{
849 #if CV_NEON_DOT
850 return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
851 #else
852 return v_dotprod_expand_fast(a, b) + c;
853 #endif
854}
855
856 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
857{
858 #if CV_NEON_DOT
859 return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
860 #else
861 int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
862 prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
863 return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
864 #endif
865}
866 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
867{
868 #if CV_NEON_DOT
869 return v_int32x4(vdotq_s32(c.val, a.val, b.val));
870 #else
871 return v_dotprod_expand_fast(a, b) + c;
872 #endif
873}
874
875 // 16 >> 64
876 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
877{
878 uint32x4_t p0 = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
879 uint32x4_t p1 = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
880 uint64x2_t s0 = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0));
881 uint64x2_t s1 = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1));
882 return v_uint64x2(vaddq_u64(s0, s1));
883}
884 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
885{ return v_dotprod_expand_fast(a, b) + c; }
886
887 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
888{
889 int32x4_t prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
890 prod = vmlal_s16(prod, vget_high_s16(a.val), vget_high_s16(b.val));
891 return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
892}
893 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
894{ return v_dotprod_expand_fast(a, b) + c; }
895
896 // 32 >> 64f
897 #if CV_SIMD128_64F
898 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
899{ return v_cvt_f64(v_dotprod_fast(a, b)); }
900 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
901{ return v_dotprod_expand_fast(a, b) + c; }
902 #endif
903
904
905 #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
906 OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
907 OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
908 OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
909 inline _Tpvec operator ~ (const _Tpvec& a) \
910 { \
911 return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
912 }
913
914OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
915OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
916OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
917OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
918OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
919OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
920OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
921OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
922
923 #define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
924 inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
925 { \
926 return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
927 } \
928 inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
929 { \
930 a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
931 return a; \
932 }
933
934OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
935OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
936OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
937
938 inline v_float32x4 operator ~ (const v_float32x4& a)
939{
940 return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
941}
942
943 #if CV_SIMD128_64F
944 inline v_float32x4 v_sqrt(const v_float32x4& x)
945{
946 return v_float32x4(vsqrtq_f32(x.val));
947}
948
949 inline v_float32x4 v_invsqrt(const v_float32x4& x)
950{
951 v_float32x4 one = v_setall_f32(1.0f);
952 return one / v_sqrt(x);
953}
954 #else
955 inline v_float32x4 v_sqrt(const v_float32x4& x)
956{
957 float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
958 float32x4_t e = vrsqrteq_f32(x1);
959 e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
960 e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
961 return v_float32x4(vmulq_f32(x.val, e));
962}
963
964 inline v_float32x4 v_invsqrt(const v_float32x4& x)
965{
966 float32x4_t e = vrsqrteq_f32(x.val);
967 e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
968 e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
969 return v_float32x4(e);
970}
971 #endif
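Without vsqrtq_f32 (32-bit Arm), the fallback v_sqrt/v_invsqrt above start from the vrsqrteq_f32 estimate of 1/sqrt(x) and refine it twice; vrsqrtsq_f32(a, b) returns (3 - a*b)/2, so each step is e' = e*(3 - x*e*e)/2, and v_sqrt finally multiplies by x (the vmaxq_f32 with FLT_MIN keeps the x == 0 case from producing NaN). A scalar model of the refinement (illustration only):

    #include <cmath>

    static float invsqrt_model(float x)
    {
        float e = 1.0f / std::sqrt(x);        // stand-in for the vrsqrteq_f32 estimate
        e = e * (3.0f - x * e * e) * 0.5f;    // vrsqrtsq_f32(x*e, e) == (3 - (x*e)*e) / 2
        e = e * (3.0f - x * e * e) * 0.5f;    // second refinement step
        return e;                             // v_sqrt then returns x * e
    }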
972
973 #define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
974 inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }
975
976OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
977OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
978OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)
979
980 inline v_float32x4 v_abs(v_float32x4 x)
981{ return v_float32x4(vabsq_f32(x.val)); }
982
983 #if CV_SIMD128_64F
984 #define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
985 inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
986 { \
987 return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
988 } \
989 inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
990 { \
991 a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
992 return a; \
993 }
994
995OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
996OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
997OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)
998
999 inline v_float64x2 operator ~ (const v_float64x2& a)
1000{
1001 return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
1002}
1003
1004 inline v_float64x2 v_sqrt(const v_float64x2& x)
1005{
1006 return v_float64x2(vsqrtq_f64(x.val));
1007}
1008
1009 inline v_float64x2 v_invsqrt(const v_float64x2& x)
1010{
1011 v_float64x2 one = v_setall_f64(1.0f);
1012 return one / v_sqrt(x);
1013}
1014
1015 inline v_float64x2 v_abs(v_float64x2 x)
1016{ return v_float64x2(vabsq_f64(x.val)); }
1017 #endif
1018
1019 // TODO: exp, log, sin, cos
1020
1021 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
1022 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
1023 { \
1024 return _Tpvec(intrin(a.val, b.val)); \
1025 }
1026
1027OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
1028OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
1029OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
1030OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
1031OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
1032OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
1033OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
1034OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
1035OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
1036OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
1037OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
1038OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
1039OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
1040OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
1041 #if CV_SIMD128_64F
1042OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
1043OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
1044 #endif
1045
1046 #if CV_SIMD128_64F
1047 inline int64x2_t vmvnq_s64(int64x2_t a)
1048{
1049 int64x2_t vx = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF));
1050 return veorq_s64(a, vx);
1051}
1052 inline uint64x2_t vmvnq_u64(uint64x2_t a)
1053{
1054 uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
1055 return veorq_u64(a, vx);
1056}
1057 #endif
1058 #define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
1059 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1060 { return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
1061 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1062 { return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
1063 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
1064 { return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
1065 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
1066 { return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
1067 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
1068 { return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
1069 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
1070 { return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
1071
1072OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
1073OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
1074OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
1075OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
1076OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
1077OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
1078OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
1079 #if CV_SIMD128_64F
1080OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
1081OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
1082OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
1083 #endif
1084
1085 inline v_float32x4 v_not_nan(const v_float32x4& a)
1086{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
1087 #if CV_SIMD128_64F
1088 inline v_float64x2 v_not_nan(const v_float64x2& a)
1089{ return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }
1090 #endif
1091
1092OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
1093OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
1094OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
1095OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
1096OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
1097OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
1098OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
1099OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
1100OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
1101OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
1102OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
1103OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)
1104
1105OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
1106OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
1107OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
1108OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
1109 #if CV_SIMD128_64F
1110OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
1111 #endif
1112
1114 inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1115{ return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); }
1116 inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1117{ return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); }
1118
1119 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
1120 inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
1121 { \
1122 return _Tpvec2(cast(intrin(a.val, b.val))); \
1123 }
1124
1125OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
1126OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
1127OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
1128
1129 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
1130{
1131 v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
1132 return v_sqrt(x);
1133}
1134
1135 inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
1136{
1137 return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
1138}
1139
1140 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1141{
1142 #if CV_SIMD128_64F
1143 // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
1144 // also adds FMA support both for single- and double-precision floating-point vectors
1145 return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
1146 #else
1147 return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
1148 #endif
1149}
1150
1151 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1152{
1153 return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
1154}
1155
1156 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1157{
1158 return v_fma(a, b, c);
1159}
1160
1161 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1162{
1163 return v_fma(a, b, c);
1164}
1165
1166 #if CV_SIMD128_64F
1167 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
1168{
1169 v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
1170 return v_sqrt(x);
1171}
1172
1173 inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
1174{
1175 return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
1176}
1177
1178 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1179{
1180 return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
1181}
1182
1183 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1184{
1185 return v_fma(a, b, c);
1186}
1187 #endif
1188
1189 // trade efficiency for convenience
1190 #define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
1191 inline _Tpvec operator << (const _Tpvec& a, int n) \
1192 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
1193 inline _Tpvec operator >> (const _Tpvec& a, int n) \
1194 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
1195 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1196 { return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
1197 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1198 { return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
1199 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
1200 { return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
1201
1202OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
1203OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
1204OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
1205OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
1206OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
1207OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
1208OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
1209OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
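Note the two flavours generated above: operator<< / operator>> take a runtime count and go through vshlq_* with a broadcast shift (negated for right shifts, since NEON only has a variable left shift), while v_shl<n> / v_shr<n> / v_rshr<n> require a compile-time immediate and map to the _n_ forms, with v_rshr rounding before it truncates. A quick sketch (illustrative function):

    static cv::v_uint16x8 shift_demo(const cv::v_uint16x8& a, int n)
    {
        cv::v_uint16x8 r1 = a >> n;            // vshlq_u16(a.val, vdupq_n_s16(-n))
        cv::v_uint16x8 r2 = cv::v_rshr<2>(a);  // vrshrq_n_u16: immediate shift with rounding
        return r1 + r2;                        // operator+ here is the saturating vqaddq_u16
    }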
1210
1211 #define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
1212 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1213 { return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
1214 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1215 { return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
1216 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
1217 { return a; } \
1218 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1219 { return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
1220 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1221 { return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
1222 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
1223 { CV_UNUSED(b); return a; }
1224
1225OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
1226OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
1227OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
1228OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
1229OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
1230OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
1231OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
1232OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
1233OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
1234 #if CV_SIMD128_64F
1235OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
1236 #endif
1237
1238 #if defined(__clang__) && defined(__aarch64__)
1239 // avoid LD2 instruction. details: https://github.com/opencv/opencv/issues/14863
1240 #define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
1241 inline _Tpvec v_load_low(const _Tp* ptr) \
1242 { \
1243 typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
1244 uint64 v = *(unaligned_uint64*)ptr; \
1245 return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \
1246 }
1247 #else
1248 #define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
1249 inline _Tpvec v_load_low(const _Tp* ptr) \
1250 { return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
1251 #endif
1252
1253 #define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
1254 inline _Tpvec v_load(const _Tp* ptr) \
1255 { return _Tpvec(vld1q_##suffix(ptr)); } \
1256 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1257 { return _Tpvec(vld1q_##suffix(ptr)); } \
1258 OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
1259 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1260 { return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
1261 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1262 { vst1q_##suffix(ptr, a.val); } \
1263 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1264 { vst1q_##suffix(ptr, a.val); } \
1265 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1266 { vst1q_##suffix(ptr, a.val); } \
1267 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/ ) \
1268 { vst1q_##suffix(ptr, a.val); } \
1269 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1270 { vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
1271 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1272 { vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
1273
1274OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
1275OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
1276OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
1277OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
1278OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
1279OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
1280OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
1281OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
1282OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
1283 #if CV_SIMD128_64F
1284OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
1285 #endif
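A minimal load/store sketch for the macros above (illustrative function): on NEON, v_load and v_load_aligned both map to plain vld1q_*, since there is no stricter aligned-only form to exploit, and all the store variants likewise reduce to vst1q_* / vst1_*.

    static void scale_by_two(const float* src, float* dst)   // assumes at least 4 readable/writable floats
    {
        cv::v_float32x4 v = cv::v_load(src);   // vld1q_f32, no alignment requirement
        cv::v_store(dst, v + v);               // vst1q_f32
    }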
1286
1287 inline unsigned v_reduce_sum(const v_uint8x16& a)
1288{
1289 #if CV_NEON_AARCH64
1290 uint16_t t0 = vaddlvq_u8(a.val);
1291 return t0;
1292 #else // #if CV_NEON_AARCH64
1293 uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
1294 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1295 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1296 #endif // #if CV_NEON_AARCH64
1297}
1298 inline int v_reduce_sum(const v_int8x16& a)
1299{
1300 #if CV_NEON_AARCH64
1301 int16_t t0 = vaddlvq_s8(a.val);
1302 return t0;
1303 #else // #if CV_NEON_AARCH64
1304 int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
1305 int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
1306 return vget_lane_s32(vpadd_s32(t1, t1), 0);
1307 #endif // #if CV_NEON_AARCH64
1308}
1309 inline unsigned v_reduce_sum(const v_uint16x8& a)
1310{
1311 #if CV_NEON_AARCH64
1312 uint32_t t0 = vaddlvq_u16(a.val);
1313 return t0;
1314 #else // #if CV_NEON_AARCH64
1315 uint32x4_t t0 = vpaddlq_u16(a.val);
1316 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1317 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1318 #endif // #if CV_NEON_AARCH64
1319}
1320 inline int v_reduce_sum(const v_int16x8& a)
1321{
1322 #if CV_NEON_AARCH64
1323 int32_t t0 = vaddlvq_s16(a.val);
1324 return t0;
1325 #else // #if CV_NEON_AARCH64
1326 int32x4_t t0 = vpaddlq_s16(a.val);
1327 int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
1328 return vget_lane_s32(vpadd_s32(t1, t1), 0);
1329 #endif // #if CV_NEON_AARCH64
1330}
1331
1332 #if CV_NEON_AARCH64
1333 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1334 inline scalartype v_reduce_##func(const _Tpvec& a) \
1335 { \
1336 return v##vectorfunc##vq_##suffix(a.val); \
1337 }
1338 #else // #if CV_NEON_AARCH64
1339 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1340 inline scalartype v_reduce_##func(const _Tpvec& a) \
1341 { \
1342 _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
1343 a0 = vp##vectorfunc##_##suffix(a0, a0); \
1344 a0 = vp##vectorfunc##_##suffix(a0, a0); \
1345 return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
1346 }
1347 #endif // #if CV_NEON_AARCH64
1348
1349OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, max, max, u8)
1350OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, min, min, u8)
1351OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, max, max, s8)
1352OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8)
1353
1354 #if CV_NEON_AARCH64
1355 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1356 inline scalartype v_reduce_##func(const _Tpvec& a) \
1357 { \
1358 return v##vectorfunc##vq_##suffix(a.val); \
1359 }
1360 #else // #if CV_NEON_AARCH64
1361 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1362 inline scalartype v_reduce_##func(const _Tpvec& a) \
1363 { \
1364 _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
1365 a0 = vp##vectorfunc##_##suffix(a0, a0); \
1366 return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
1367 }
1368 #endif // #if CV_NEON_AARCH64
1369
1370OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16)
1371OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
1372OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
1373OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
1374
1375 #if CV_NEON_AARCH64
1376 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1377 inline scalartype v_reduce_##func(const _Tpvec& a) \
1378 { \
1379 return v##vectorfunc##vq_##suffix(a.val); \
1380 }
1381 #else // #if CV_NEON_AARCH64
1382 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1383 inline scalartype v_reduce_##func(const _Tpvec& a) \
1384 { \
1385 _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
1386 return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
1387 }
1388 #endif // #if CV_NEON_AARCH64
1389
1390OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
1391OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
1392OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
1393OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
1394OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
1395OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
1396OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
1397OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
1398OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
1399
1400 inline uint64 v_reduce_sum(const v_uint64x2& a)
1401{
1402 #if CV_NEON_AARCH64
1403 return vaddvq_u64(a.val);
1404 #else // #if CV_NEON_AARCH64
1405 return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0);
1406 #endif // #if CV_NEON_AARCH64
1407}
1408 inline int64 v_reduce_sum(const v_int64x2& a)
1409{
1410 #if CV_NEON_AARCH64
1411 return vaddvq_s64(a.val);
1412 #else // #if CV_NEON_AARCH64
1413 return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0);
1414 #endif // #if CV_NEON_AARCH64
1415}
1416 #if CV_SIMD128_64F
1417 inline double v_reduce_sum(const v_float64x2& a)
1418{
1419 return vaddvq_f64(a.val);
1420}
1421 #endif
1422
1423 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1424 const v_float32x4& c, const v_float32x4& d)
1425{
1426 #if CV_NEON_AARCH64
1427 float32x4_t ab = vpaddq_f32(a.val, b.val); // a0+a1 a2+a3 b0+b1 b2+b3
1428 float32x4_t cd = vpaddq_f32(c.val, d.val); // c0+c1 d0+d1 c2+c3 d2+d3
1429 return v_float32x4(vpaddq_f32(ab, cd)); // sumA sumB sumC sumD
1430 #else // #if CV_NEON_AARCH64
1431 float32x4x2_t ab = vtrnq_f32(a.val, b.val);
1432 float32x4x2_t cd = vtrnq_f32(c.val, d.val);
1433
1434 float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
1435 float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3
1436
1437 float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1));
1438 float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));
1439
1440 return v_float32x4(vaddq_f32(v0, v1));
1441 #endif // #if CV_NEON_AARCH64
1442}
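v_reduce_sum4 finishes four independent accumulations in a single call, the typical tail of a small matrix-vector kernel. A sketch under the assumption that len is a multiple of 4; the helper name dot4 is illustrative:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: dot products of x against four rows of length len (len % 4 == 0).
static cv::v_float32x4 dot4(const float* r0, const float* r1, const float* r2,
                            const float* r3, const float* x, int len)
{
    cv::v_float32x4 s0 = cv::v_setzero_f32(), s1 = s0, s2 = s0, s3 = s0;
    for (int i = 0; i < len; i += cv::v_float32x4::nlanes)
    {
        cv::v_float32x4 vx = cv::v_load(x + i);
        s0 = cv::v_fma(cv::v_load(r0 + i), vx, s0);
        s1 = cv::v_fma(cv::v_load(r1 + i), vx, s1);
        s2 = cv::v_fma(cv::v_load(r2 + i), vx, s2);
        s3 = cv::v_fma(cv::v_load(r3 + i), vx, s3);
    }
    return cv::v_reduce_sum4(s0, s1, s2, s3); // {dot(r0,x), dot(r1,x), dot(r2,x), dot(r3,x)}
}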
1443
1444 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1445{
1446 #if CV_NEON_AARCH64
1447 uint8x16_t t0 = vabdq_u8(a.val, b.val);
1448 uint16_t t1 = vaddlvq_u8(t0);
1449 return t1;
1450 #else // #if CV_NEON_AARCH64
1451 uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
1452 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1453 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1454 #endif // #if CV_NEON_AARCH64
1455}
1456 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1457{
1458 #if CV_NEON_AARCH64
1459 uint8x16_t t0 = vreinterpretq_u8_s8(vabdq_s8(a.val, b.val));
1460 uint16_t t1 = vaddlvq_u8(t0);
1461 return t1;
1462 #else // #if CV_NEON_AARCH64
1463 uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
1464 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1465 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1466 #endif // #if CV_NEON_AARCH64
1467}
1468 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1469{
1470 #if CV_NEON_AARCH64
1471 uint16x8_t t0 = vabdq_u16(a.val, b.val);
1472 uint32_t t1 = vaddlvq_u16(t0);
1473 return t1;
1474 #else // #if CV_NEON_AARCH64
1475 uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
1476 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1477 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1478 #endif // #if CV_NEON_AARCH64
1479}
1480 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1481{
1482 #if CV_NEON_AARCH64
1483 uint16x8_t t0 = vreinterpretq_u16_s16(vabdq_s16(a.val, b.val));
1484 uint32_t t1 = vaddlvq_u16(t0);
1485 return t1;
1486 #else // #if CV_NEON_AARCH64
1487 uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
1488 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1489 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1490 #endif // #if CV_NEON_AARCH64
1491}
1492 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1493{
1494 #if CV_NEON_AARCH64
1495 uint32x4_t t0 = vabdq_u32(a.val, b.val);
1496 uint32_t t1 = vaddvq_u32(t0);
1497 return t1;
1498 #else // #if CV_NEON_AARCH64
1499 uint32x4_t t0 = vabdq_u32(a.val, b.val);
1500 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1501 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1502 #endif // #if CV_NEON_AARCH64
1503}
1504 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1505{
1506 #if CV_NEON_AARCH64
1507 uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
1508 uint32_t t1 = vaddvq_u32(t0);
1509 return t1;
1510 #else // #if CV_NEON_AARCH64
1511 uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
1512 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1513 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1514 #endif // #if CV_NEON_AARCH64
1515}
1516 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1517{
1518 #if CV_NEON_AARCH64
1519 float32x4_t t0 = vabdq_f32(a.val, b.val);
1520 return vaddvq_f32(t0);
1521 #else // #if CV_NEON_AARCH64
1522 float32x4_t t0 = vabdq_f32(a.val, b.val);
1523 float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
1524 return vget_lane_f32(vpadd_f32(t1, t1), 0);
1525 #endif // #if CV_NEON_AARCH64
1526}
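The v_reduce_sad overloads compute the sum of absolute differences of one register pair. A sketch assuming len is a multiple of 16; the helper name is illustrative:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: SAD of two 8-bit rows, len % 16 == 0.
static unsigned row_sad(const uchar* a, const uchar* b, int len)
{
    unsigned sad = 0;
    for (int i = 0; i < len; i += cv::v_uint8x16::nlanes)
        sad += cv::v_reduce_sad(cv::v_load(a + i), cv::v_load(b + i));
    return sad;
}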
1527
1528 inline v_uint8x16 v_popcount(const v_uint8x16& a)
1529{ return v_uint8x16(vcntq_u8(a.val)); }
1530 inline v_uint8x16 v_popcount(const v_int8x16& a)
1531{ return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
1532 inline v_uint16x8 v_popcount(const v_uint16x8& a)
1533{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
1534 inline v_uint16x8 v_popcount(const v_int16x8& a)
1535{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
1536 inline v_uint32x4 v_popcount(const v_uint32x4& a)
1537{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
1538 inline v_uint32x4 v_popcount(const v_int32x4& a)
1539{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
1540 inline v_uint64x2 v_popcount(const v_uint64x2& a)
1541{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
1542 inline v_uint64x2 v_popcount(const v_int64x2& a)
1543{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }
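v_popcount returns per-lane bit counts in the matching unsigned type. A sketch that counts all set bits of a byte buffer by reinterpreting to 64-bit lanes so the 64-bit reduction above can be reused; assumes len is a multiple of 16 and the helper name is illustrative:

#include <opencv2/core/hal/intrin.hpp>
#include <cstdint>

// Illustrative helper: number of set bits in data[0..len), len % 16 == 0.
static uint64_t count_bits(const uchar* data, int len)
{
    uint64_t total = 0;
    for (int i = 0; i < len; i += cv::v_uint8x16::nlanes)
    {
        cv::v_uint64x2 pc = cv::v_popcount(cv::v_reinterpret_as_u64(cv::v_load(data + i)));
        total += cv::v_reduce_sum(pc);  // add the two per-64-bit-lane counts
    }
    return total;
}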
1544
1545 inline int v_signmask(const v_uint8x16& a)
1546{
1547 #if CV_NEON_AARCH64
1548 const int8x16_t signPosition = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7};
1549 const uint8x16_t byteOrder = {0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15};
1550 uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), signPosition);
1551 uint8x16_t v1 = vqtbl1q_u8(v0, byteOrder);
1552 uint32_t t0 = vaddlvq_u16(vreinterpretq_u16_u8(v1));
1553 return t0;
1554 #else // #if CV_NEON_AARCH64
1555 int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
1556 uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
1557 uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
1558 return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
1559 #endif // #if CV_NEON_AARCH64
1560}
1561
1562 inline int v_signmask(const v_int8x16& a)
1563{ return v_signmask(v_reinterpret_as_u8(a)); }
1564
1565 inline int v_signmask(const v_uint16x8& a)
1566{
1567 #if CV_NEON_AARCH64
1568 const int16x8_t signPosition = {0,1,2,3,4,5,6,7};
1569 uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), signPosition);
1570 uint32_t t0 = vaddlvq_u16(v0);
1571 return t0;
1572 #else // #if CV_NEON_AARCH64
1573 int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
1574 uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
1575 uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
1576 return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
1577 #endif // #if CV_NEON_AARCH64
1578}
1579 inline int v_signmask(const v_int16x8& a)
1580{ return v_signmask(v_reinterpret_as_u16(a)); }
1581
1582 inline int v_signmask(const v_uint32x4& a)
1583{
1584 #if CV_NEON_AARCH64
1585 const int32x4_t signPosition = {0,1,2,3};
1586 uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), signPosition);
1587 uint32_t t0 = vaddvq_u32(v0);
1588 return t0;
1589 #else // #if CV_NEON_AARCH64
1590 int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
1591 uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
1592 uint64x2_t v1 = vpaddlq_u32(v0);
1593 return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
1594 #endif // #if CV_NEON_AARCH64
1595}
1596 inline int v_signmask(const v_int32x4& a)
1597{ return v_signmask(v_reinterpret_as_u32(a)); }
1598 inline int v_signmask(const v_float32x4& a)
1599{ return v_signmask(v_reinterpret_as_u32(a)); }
1600 inline int v_signmask(const v_uint64x2& a)
1601{
1602 #if CV_NEON_AARCH64
1603 const int64x2_t signPosition = {0,1};
1604 uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition);
1605 uint64_t t0 = vaddvq_u64(v0);
1606 return t0;
1607 #else // #if CV_NEON_AARCH64
1608 int64x1_t m0 = vdup_n_s64(0);
1609 uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
1610 return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
1611 #endif // #if CV_NEON_AARCH64
1612}
1613 inline int v_signmask(const v_int64x2& a)
1614{ return v_signmask(v_reinterpret_as_u64(a)); }
1615 #if CV_SIMD128_64F
1616 inline int v_signmask(const v_float64x2& a)
1617{ return v_signmask(v_reinterpret_as_u64(a)); }
1618 #endif
1619
1620 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
1621 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
1622 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
1623 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
1624 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
1625 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
1626 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
1627 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
1628 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
1629 #if CV_SIMD128_64F
1630 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
1631 #endif
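v_signmask packs the sign bit of every lane into an integer bitmask, and v_scan_forward returns the index of the first set lane, so a comparison mask can be turned into an element index. A sketch with an illustrative helper name; the scalar tail handles n not divisible by 4:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: index of the first element > thresh, or -1 if none.
static int find_first_above(const float* data, int n, float thresh)
{
    cv::v_float32x4 vt = cv::v_setall_f32(thresh);
    int i = 0;
    for (; i <= n - cv::v_float32x4::nlanes; i += cv::v_float32x4::nlanes)
    {
        cv::v_float32x4 mask = cv::v_load(data + i) > vt; // all-ones lanes where data > thresh
        if (cv::v_signmask(mask) != 0)                    // any lane matched in this block?
            return i + cv::v_scan_forward(mask);          // index of the first matching lane
    }
    for (; i < n; i++)
        if (data[i] > thresh) return i;
    return -1;
}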
1632
1633 #if CV_NEON_AARCH64
1634 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
1635 inline bool v_check_all(const v_##_Tpvec& a) \
1636 { \
1637 return (vminvq_##suffix(a.val) >> shift) != 0; \
1638 } \
1639 inline bool v_check_any(const v_##_Tpvec& a) \
1640 { \
1641 return (vmaxvq_##suffix(a.val) >> shift) != 0; \
1642 }
1643 #else // #if CV_NEON_AARCH64
1644 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
1645 inline bool v_check_all(const v_##_Tpvec& a) \
1646 { \
1647 _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
1648 uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1649 return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
1650 } \
1651 inline bool v_check_any(const v_##_Tpvec& a) \
1652 { \
1653 _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
1654 uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1655 return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
1656 }
1657 #endif // #if CV_NEON_AARCH64
1658
1659OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
1660OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
1661OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
1662
1663inline bool v_check_all(const v_uint64x2& a)
1664{
1665 uint64x2_t v0 = vshrq_n_u64(a.val, 63);
1666 return (vgetq_lane_u64(v0, 0) & vgetq_lane_u64(v0, 1)) == 1;
1667}
1668 inline bool v_check_any(const v_uint64x2& a)
1669{
1670 uint64x2_t v0 = vshrq_n_u64(a.val, 63);
1671 return (vgetq_lane_u64(v0, 0) | vgetq_lane_u64(v0, 1)) != 0;
1672}
1673
1674 inline bool v_check_all(const v_int8x16& a)
1675{ return v_check_all(v_reinterpret_as_u8(a)); }
1676 inline bool v_check_all(const v_int16x8& a)
1677{ return v_check_all(v_reinterpret_as_u16(a)); }
1678 inline bool v_check_all(const v_int32x4& a)
1679{ return v_check_all(v_reinterpret_as_u32(a)); }
1680 inline bool v_check_all(const v_float32x4& a)
1681{ return v_check_all(v_reinterpret_as_u32(a)); }
1682
1683 inline bool v_check_any(const v_int8x16& a)
1684{ return v_check_any(v_reinterpret_as_u8(a)); }
1685 inline bool v_check_any(const v_int16x8& a)
1686{ return v_check_any(v_reinterpret_as_u16(a)); }
1687 inline bool v_check_any(const v_int32x4& a)
1688{ return v_check_any(v_reinterpret_as_u32(a)); }
1689 inline bool v_check_any(const v_float32x4& a)
1690{ return v_check_any(v_reinterpret_as_u32(a)); }
1691
1692 inline bool v_check_all(const v_int64x2& a)
1693{ return v_check_all(v_reinterpret_as_u64(a)); }
1694 inline bool v_check_any(const v_int64x2& a)
1695{ return v_check_any(v_reinterpret_as_u64(a)); }
1696 #if CV_SIMD128_64F
1697 inline bool v_check_all(const v_float64x2& a)
1698{ return v_check_all(v_reinterpret_as_u64(a)); }
1699 inline bool v_check_any(const v_float64x2& a)
1700{ return v_check_any(v_reinterpret_as_u64(a)); }
1701 #endif
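v_check_all / v_check_any test the sign bit of every lane, which is exactly what the comparison operators leave in matching lanes. A minimal sketch over one 16-byte block; the helper name is illustrative:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: true if every byte of buf[0..16) lies in [lo, hi].
static bool all_in_range_16(const uchar* buf, uchar lo, uchar hi)
{
    cv::v_uint8x16 v   = cv::v_load(buf);
    cv::v_uint8x16 vlo = cv::v_setall_u8(lo), vhi = cv::v_setall_u8(hi);
    return cv::v_check_all((v >= vlo) & (v <= vhi));
}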
1702
1703 #define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
1704 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1705 { \
1706 return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
1707 }
1708
1709OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
1710OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
1711OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
1712OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
1713OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
1714OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
1715OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
1716 #if CV_SIMD128_64F
1717OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
1718 #endif
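v_select performs a per-lane choice driven by a mask register. A branch-free clamp-to-zero sketch (v_max would also work; the point here is the mask/select idiom, and the helper name is illustrative):

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: keep positive lanes, zero out the rest.
static cv::v_float32x4 clamp_to_zero(const cv::v_float32x4& x)
{
    cv::v_float32x4 zero = cv::v_setzero_f32();
    return cv::v_select(x > zero, x, zero); // lane-wise: (x > 0) ? x : 0
}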
1719
1720 #if CV_NEON_AARCH64
1721 #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
1722 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1723 { \
1724 b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
1725 b1.val = vmovl_high_##suffix(a.val); \
1726 } \
1727 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1728 { \
1729 return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
1730 } \
1731 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1732 { \
1733 return _Tpwvec(vmovl_high_##suffix(a.val)); \
1734 } \
1735 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1736 { \
1737 return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
1738 }
1739 #else
1740 #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
1741 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1742 { \
1743 b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
1744 b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
1745 } \
1746 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1747 { \
1748 return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
1749 } \
1750 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1751 { \
1752 return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
1753 } \
1754 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1755 { \
1756 return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
1757 }
1758 #endif
1759
1760OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
1761OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
1762OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
1763OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
1764OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
1765OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
1766
1767 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
1768{
1769 typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
1770 uint8x8_t v0 = vcreate_u8(*(unaligned_uint*)ptr);
1771 uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
1772 return v_uint32x4(vmovl_u16(v1));
1773}
1774
1775inline v_int32x4 v_load_expand_q(const schar* ptr)
1776{
1777 typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
1778 int8x8_t v0 = vcreate_s8(*(unaligned_uint*)ptr);
1779 int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
1780 return v_int32x4(vmovl_s16(v1));
1781}
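The expand family widens lanes to the next element size, which avoids overflow when accumulating small types; v_load_expand / v_load_expand_q perform the same widening directly from memory. A sketch summing 16 bytes exactly, with an illustrative helper name:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: exact sum of 16 bytes via 8 -> 16 -> 32 bit widening.
static unsigned sum_16_bytes(const uchar* p)
{
    cv::v_uint8x16 v = cv::v_load(p);
    cv::v_uint16x8 w0, w1;
    cv::v_expand(v, w0, w1);                               // 8-bit -> two 16-bit halves
    cv::v_uint32x4 s = cv::v_expand_low(w0) + cv::v_expand_high(w0)
                     + cv::v_expand_low(w1) + cv::v_expand_high(w1);
    return cv::v_reduce_sum(s);
}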
1782
1783 #if defined(__aarch64__) || defined(_M_ARM64)
1784 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1785 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1786 { \
1787 b0.val = vzip1q_##suffix(a0.val, a1.val); \
1788 b1.val = vzip2q_##suffix(a0.val, a1.val); \
1789 } \
1790 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1791 { \
1792 return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1793 } \
1794 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1795 { \
1796 return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1797 } \
1798 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1799 { \
1800 c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1801 d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1802 }
1803 #else
1804 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1805 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1806 { \
1807 _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
1808 b0.val = p.val[0]; \
1809 b1.val = p.val[1]; \
1810 } \
1811 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1812 { \
1813 return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1814 } \
1815 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1816 { \
1817 return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1818 } \
1819 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1820 { \
1821 c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1822 d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1823 }
1824 #endif
1825
1826OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
1827OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
1828OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
1829OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
1830OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
1831OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
1832OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
1833 #if CV_SIMD128_64F
1834OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
1835 #endif
1836
1837 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
1838{
1839 uint8x16_t vec = vrev64q_u8(a.val);
1840 return v_uint8x16(vextq_u8(vec, vec, 8));
1841}
1842
1843 inline v_int8x16 v_reverse(const v_int8x16 &a)
1844{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1845
1846 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
1847{
1848 uint16x8_t vec = vrev64q_u16(a.val);
1849 return v_uint16x8(vextq_u16(vec, vec, 4));
1850}
1851
1852 inline v_int16x8 v_reverse(const v_int16x8 &a)
1853{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1854
1855 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
1856{
1857 uint32x4_t vec = vrev64q_u32(a.val);
1858 return v_uint32x4(vextq_u32(vec, vec, 2));
1859}
1860
1861 inline v_int32x4 v_reverse(const v_int32x4 &a)
1862{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1863
1864 inline v_float32x4 v_reverse(const v_float32x4 &a)
1865{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1866
1867 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
1868{
1869 uint64x2_t vec = a.val;
1870 uint64x1_t vec_lo = vget_low_u64(vec);
1871 uint64x1_t vec_hi = vget_high_u64(vec);
1872 return v_uint64x2(vcombine_u64(vec_hi, vec_lo));
1873}
1874
1875 inline v_int64x2 v_reverse(const v_int64x2 &a)
1876{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1877
1878 #if CV_SIMD128_64F
1879 inline v_float64x2 v_reverse(const v_float64x2 &a)
1880{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1881 #endif
1882
1883 #define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
1884 template <int s> \
1885 inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1886 { \
1887 return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
1888 }
1889
1890OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
1891OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
1892OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
1893OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
1894OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
1895OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
1896OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
1897OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
1898OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
1899 #if CV_SIMD128_64F
1900OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
1901 #endif
1902
1903 #define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
1904 template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }
1905
1906OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
1907OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
1908OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
1909OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
1910OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
1911OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
1912OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
1913OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int64x2, int64, s64)
1914OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32)
1915 #if CV_SIMD128_64F
1916OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64)
1917 #endif
1918
1919 #define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) \
1920 template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { _Tp t = v_extract_n<i>(v); return v_setall_##suffix(t); }
1921
1922OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
1923OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
1924OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
1925OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
1926OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
1927OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
1928OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
1929OPENCV_HAL_IMPL_NEON_BROADCAST(v_int64x2, int64, s64)
1930OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
1931 #if CV_SIMD128_64F
1932OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
1933 #endif
1934
1935 #if CV_SIMD128_64F
1936 inline v_int32x4 v_round(const v_float32x4& a)
1937{
1938 float32x4_t a_ = a.val;
1939 int32x4_t result;
1940 __asm__ ("fcvtns %0.4s, %1.4s"
1941 : "=w"(result)
1942 : "w"(a_)
1943 : /* No clobbers */);
1944 return v_int32x4(result);
1945}
1946 #else
1947 inline v_int32x4 v_round(const v_float32x4& a)
1948{
1949 static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
1950 v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
1951
1952 int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
1953 return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
1954}
1955 #endif
1956 inline v_int32x4 v_floor(const v_float32x4& a)
1957{
1958 int32x4_t a1 = vcvtq_s32_f32(a.val);
1959 uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
1960 return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
1961}
1962
1963 inline v_int32x4 v_ceil(const v_float32x4& a)
1964{
1965 int32x4_t a1 = vcvtq_s32_f32(a.val);
1966 uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
1967 return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
1968}
1969
1970 inline v_int32x4 v_trunc(const v_float32x4& a)
1971{ return v_int32x4(vcvtq_s32_f32(a.val)); }
1972
1973 #if CV_SIMD128_64F
1974 inline v_int32x4 v_round(const v_float64x2& a)
1975{
1976 static const int32x2_t zero = vdup_n_s32(0);
1977 return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
1978}
1979
1980 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1981{
1982 return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val))));
1983}
1984
1985 inline v_int32x4 v_floor(const v_float64x2& a)
1986{
1987 static const int32x2_t zero = vdup_n_s32(0);
1988 int64x2_t a1 = vcvtq_s64_f64(a.val);
1989 uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
1990 a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
1991 return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
1992}
1993
1994 inline v_int32x4 v_ceil(const v_float64x2& a)
1995{
1996 static const int32x2_t zero = vdup_n_s32(0);
1997 int64x2_t a1 = vcvtq_s64_f64(a.val);
1998 uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
1999 a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
2000 return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
2001}
2002
2003 inline v_int32x4 v_trunc(const v_float64x2& a)
2004{
2005 static const int32x2_t zero = vdup_n_s32(0);
2006 return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
2007}
2008 #endif
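The four float-to-int conversions above differ only in rounding direction. A sketch that stores all four results side by side; the helper and output names are illustrative, and each destination must hold 4 ints:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: convert 4 floats with every rounding mode.
static void round_all_modes(const float* src, int* nearest, int* down, int* up, int* toward_zero)
{
    cv::v_float32x4 v = cv::v_load(src);
    cv::v_store(nearest,     cv::v_round(v)); // round to nearest
    cv::v_store(down,        cv::v_floor(v)); // round toward -inf
    cv::v_store(up,          cv::v_ceil(v));  // round toward +inf
    cv::v_store(toward_zero, cv::v_trunc(v)); // drop the fraction
}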
2009
2010 #if CV_NEON_AARCH64
2011 #define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
2012 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
2013 const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
2014 v_##_Tpvec& b0, v_##_Tpvec& b1, \
2015 v_##_Tpvec& b2, v_##_Tpvec& b3) \
2016 { \
2017 /* -- Pass 1: 64b transpose */ \
2018 _Tpvec##_t t0 = vreinterpretq_##suffix##32_##suffix##64( \
2019 vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
2020 vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
2021 _Tpvec##_t t1 = vreinterpretq_##suffix##32_##suffix##64( \
2022 vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
2023 vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
2024 _Tpvec##_t t2 = vreinterpretq_##suffix##32_##suffix##64( \
2025 vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
2026 vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
2027 _Tpvec##_t t3 = vreinterpretq_##suffix##32_##suffix##64( \
2028 vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
2029 vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
2030 /* -- Pass 2: 32b transpose */ \
2031 b0.val = vtrn1q_##suffix##32(t0, t1); \
2032 b1.val = vtrn2q_##suffix##32(t0, t1); \
2033 b2.val = vtrn1q_##suffix##32(t2, t3); \
2034 b3.val = vtrn2q_##suffix##32(t2, t3); \
2035 }
2036
2037OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u)
2038OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s)
2039OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f)
2040 #else // #if CV_NEON_AARCH64
2041 #define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
2042 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
2043 const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
2044 v_##_Tpvec& b0, v_##_Tpvec& b1, \
2045 v_##_Tpvec& b2, v_##_Tpvec& b3) \
2046 { \
2047 /* m00 m01 m02 m03 */ \
2048 /* m10 m11 m12 m13 */ \
2049 /* m20 m21 m22 m23 */ \
2050 /* m30 m31 m32 m33 */ \
2051 _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
2052 _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
2053 /* m00 m10 m02 m12 */ \
2054 /* m01 m11 m03 m13 */ \
2055 /* m20 m30 m22 m32 */ \
2056 /* m21 m31 m23 m33 */ \
2057 b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
2058 b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
2059 b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
2060 b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
2061 }
2062
2063OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
2064OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
2065OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
2066 #endif // #if CV_NEON_AARCH64
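v_transpose4x4 turns four row registers into four column registers. A sketch that transposes one 4x4 float tile between two row-major buffers; the helper name and stride parameters are illustrative:

#include <opencv2/core/hal/intrin.hpp>
#include <cstddef>

// Illustrative helper: dst = transpose of the 4x4 tile at src; steps are in floats.
static void transpose_4x4_tile(const float* src, size_t sstep, float* dst, size_t dstep)
{
    cv::v_float32x4 a0 = cv::v_load(src),
                    a1 = cv::v_load(src +     sstep),
                    a2 = cv::v_load(src + 2 * sstep),
                    a3 = cv::v_load(src + 3 * sstep);
    cv::v_float32x4 b0, b1, b2, b3;
    cv::v_transpose4x4(a0, a1, a2, a3, b0, b1, b2, b3);
    cv::v_store(dst,             b0);
    cv::v_store(dst +     dstep, b1);
    cv::v_store(dst + 2 * dstep, b2);
    cv::v_store(dst + 3 * dstep, b3);
}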
2067
2068 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
2069 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
2070 { \
2071 _Tpvec##x2_t v = vld2q_##suffix(ptr); \
2072 a.val = v.val[0]; \
2073 b.val = v.val[1]; \
2074 } \
2075 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
2076 { \
2077 _Tpvec##x3_t v = vld3q_##suffix(ptr); \
2078 a.val = v.val[0]; \
2079 b.val = v.val[1]; \
2080 c.val = v.val[2]; \
2081 } \
2082 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
2083 v_##_Tpvec& c, v_##_Tpvec& d) \
2084 { \
2085 _Tpvec##x4_t v = vld4q_##suffix(ptr); \
2086 a.val = v.val[0]; \
2087 b.val = v.val[1]; \
2088 c.val = v.val[2]; \
2089 d.val = v.val[3]; \
2090 } \
2091 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2092 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
2093 { \
2094 _Tpvec##x2_t v; \
2095 v.val[0] = a.val; \
2096 v.val[1] = b.val; \
2097 vst2q_##suffix(ptr, v); \
2098 } \
2099 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2100 const v_##_Tpvec& c, hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
2101 { \
2102 _Tpvec##x3_t v; \
2103 v.val[0] = a.val; \
2104 v.val[1] = b.val; \
2105 v.val[2] = c.val; \
2106 vst3q_##suffix(ptr, v); \
2107 } \
2108 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2109 const v_##_Tpvec& c, const v_##_Tpvec& d, \
2110 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED ) \
2111 { \
2112 _Tpvec##x4_t v; \
2113 v.val[0] = a.val; \
2114 v.val[1] = b.val; \
2115 v.val[2] = c.val; \
2116 v.val[3] = d.val; \
2117 vst4q_##suffix(ptr, v); \
2118 }
2119
2120 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
2121 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
2122 { \
2123 tp##x1_t a0 = vld1_##suffix(ptr); \
2124 tp##x1_t b0 = vld1_##suffix(ptr + 1); \
2125 tp##x1_t a1 = vld1_##suffix(ptr + 2); \
2126 tp##x1_t b1 = vld1_##suffix(ptr + 3); \
2127 a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
2128 b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
2129 } \
2130 \
2131 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
2132 v_##tp##x2& b, v_##tp##x2& c ) \
2133 { \
2134 tp##x1_t a0 = vld1_##suffix(ptr); \
2135 tp##x1_t b0 = vld1_##suffix(ptr + 1); \
2136 tp##x1_t c0 = vld1_##suffix(ptr + 2); \
2137 tp##x1_t a1 = vld1_##suffix(ptr + 3); \
2138 tp##x1_t b1 = vld1_##suffix(ptr + 4); \
2139 tp##x1_t c1 = vld1_##suffix(ptr + 5); \
2140 a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
2141 b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
2142 c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
2143 } \
2144 \
2145 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
2146 v_##tp##x2& c, v_##tp##x2& d ) \
2147 { \
2148 tp##x1_t a0 = vld1_##suffix(ptr); \
2149 tp##x1_t b0 = vld1_##suffix(ptr + 1); \
2150 tp##x1_t c0 = vld1_##suffix(ptr + 2); \
2151 tp##x1_t d0 = vld1_##suffix(ptr + 3); \
2152 tp##x1_t a1 = vld1_##suffix(ptr + 4); \
2153 tp##x1_t b1 = vld1_##suffix(ptr + 5); \
2154 tp##x1_t c1 = vld1_##suffix(ptr + 6); \
2155 tp##x1_t d1 = vld1_##suffix(ptr + 7); \
2156 a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
2157 b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
2158 c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
2159 d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
2160 } \
2161 \
2162 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
2163 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
2164 { \
2165 vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
2166 vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
2167 vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
2168 vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
2169 } \
2170 \
2171 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
2172 const v_##tp##x2& b, const v_##tp##x2& c, \
2173 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
2174 { \
2175 vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
2176 vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
2177 vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
2178 vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
2179 vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
2180 vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
2181 } \
2182 \
2183 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
2184 const v_##tp##x2& c, const v_##tp##x2& d, \
2185 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
2186 { \
2187 vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
2188 vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
2189 vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
2190 vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
2191 vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
2192 vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
2193 vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
2194 vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
2195 }
2196
2197OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
2198OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
2199OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
2200OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
2201OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
2202OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
2203OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
2204 #if CV_SIMD128_64F
2205OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
2206 #endif
2207
2208OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
2209OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
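v_load_deinterleave / v_store_interleave map onto vld2q/vld3q/vld4q and their store counterparts, which makes channel-wise processing of packed images cheap. A sketch that swaps the first and third channel of 16 packed 3-channel pixels in place; the helper name is illustrative:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: BGR -> RGB for 16 pixels (48 interleaved bytes).
static void swap_channels_16px(uchar* bgr)
{
    cv::v_uint8x16 b, g, r;
    cv::v_load_deinterleave(bgr, b, g, r); // split into three planes
    cv::v_store_interleave(bgr, r, g, b);  // write back with channels swapped
}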
2210
2211 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2212{
2213 return v_float32x4(vcvtq_f32_s32(a.val));
2214}
2215
2216 #if CV_SIMD128_64F
2217 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2218{
2219 float32x2_t zero = vdup_n_f32(0.0f);
2220 return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
2221}
2222
2223 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2224{
2225 return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
2226}
2227
2228 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
2229{
2230 return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
2231}
2232
2233 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
2234{
2235 return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
2236}
2237
2238 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
2239{
2240 return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
2241}
2242
2243 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
2244{
2245 return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
2246}
2247
2248 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
2249{ return v_float64x2(vcvtq_f64_s64(a.val)); }
2250
2251 #endif
2252
2254
2255 inline v_int8x16 v_lut(const schar* tab, const int* idx)
2256{
2257 schar CV_DECL_ALIGNED(32) elems[16] =
2258 {
2259 tab[idx[ 0]],
2260 tab[idx[ 1]],
2261 tab[idx[ 2]],
2262 tab[idx[ 3]],
2263 tab[idx[ 4]],
2264 tab[idx[ 5]],
2265 tab[idx[ 6]],
2266 tab[idx[ 7]],
2267 tab[idx[ 8]],
2268 tab[idx[ 9]],
2269 tab[idx[10]],
2270 tab[idx[11]],
2271 tab[idx[12]],
2272 tab[idx[13]],
2273 tab[idx[14]],
2274 tab[idx[15]]
2275 };
2276 return v_int8x16(vld1q_s8(elems));
2277}
2278 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
2279{
2280 schar CV_DECL_ALIGNED(32) elems[16] =
2281 {
2282 tab[idx[0]],
2283 tab[idx[0] + 1],
2284 tab[idx[1]],
2285 tab[idx[1] + 1],
2286 tab[idx[2]],
2287 tab[idx[2] + 1],
2288 tab[idx[3]],
2289 tab[idx[3] + 1],
2290 tab[idx[4]],
2291 tab[idx[4] + 1],
2292 tab[idx[5]],
2293 tab[idx[5] + 1],
2294 tab[idx[6]],
2295 tab[idx[6] + 1],
2296 tab[idx[7]],
2297 tab[idx[7] + 1]
2298 };
2299 return v_int8x16(vld1q_s8(elems));
2300}
2301 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
2302{
2303 schar CV_DECL_ALIGNED(32) elems[16] =
2304 {
2305 tab[idx[0]],
2306 tab[idx[0] + 1],
2307 tab[idx[0] + 2],
2308 tab[idx[0] + 3],
2309 tab[idx[1]],
2310 tab[idx[1] + 1],
2311 tab[idx[1] + 2],
2312 tab[idx[1] + 3],
2313 tab[idx[2]],
2314 tab[idx[2] + 1],
2315 tab[idx[2] + 2],
2316 tab[idx[2] + 3],
2317 tab[idx[3]],
2318 tab[idx[3] + 1],
2319 tab[idx[3] + 2],
2320 tab[idx[3] + 3]
2321 };
2322 return v_int8x16(vld1q_s8(elems));
2323}
2324 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
2325 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
2326 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
2327
2328 inline v_int16x8 v_lut(const short* tab, const int* idx)
2329{
2330 short CV_DECL_ALIGNED(32) elems[8] =
2331 {
2332 tab[idx[0]],
2333 tab[idx[1]],
2334 tab[idx[2]],
2335 tab[idx[3]],
2336 tab[idx[4]],
2337 tab[idx[5]],
2338 tab[idx[6]],
2339 tab[idx[7]]
2340 };
2341 return v_int16x8(vld1q_s16(elems));
2342}
2343 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
2344{
2345 short CV_DECL_ALIGNED(32) elems[8] =
2346 {
2347 tab[idx[0]],
2348 tab[idx[0] + 1],
2349 tab[idx[1]],
2350 tab[idx[1] + 1],
2351 tab[idx[2]],
2352 tab[idx[2] + 1],
2353 tab[idx[3]],
2354 tab[idx[3] + 1]
2355 };
2356 return v_int16x8(vld1q_s16(elems));
2357}
2358 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
2359{
2360 return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1])));
2361}
2362 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
2363 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
2364 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
2365
2366 inline v_int32x4 v_lut(const int* tab, const int* idx)
2367{
2368 int CV_DECL_ALIGNED(32) elems[4] =
2369 {
2370 tab[idx[0]],
2371 tab[idx[1]],
2372 tab[idx[2]],
2373 tab[idx[3]]
2374 };
2375 return v_int32x4(vld1q_s32(elems));
2376}
2377 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
2378{
2379 return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1])));
2380}
2381 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
2382{
2383 return v_int32x4(vld1q_s32(tab + idx[0]));
2384}
2385 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
2386 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
2387 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
2388
2389 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
2390{
2391 return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]])));
2392}
2393 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
2394{
2395 return v_int64x2(vld1q_s64(tab + idx[0]));
2396}
2397 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
2398 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
2399
2400 inline v_float32x4 v_lut(const float* tab, const int* idx)
2401{
2402 float CV_DECL_ALIGNED(32) elems[4] =
2403 {
2404 tab[idx[0]],
2405 tab[idx[1]],
2406 tab[idx[2]],
2407 tab[idx[3]]
2408 };
2409 return v_float32x4(vld1q_f32(elems));
2410}
2411 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
2412{
2413 typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64;
2414
2415 uint64 CV_DECL_ALIGNED(32) elems[2] =
2416 {
2417 *(unaligned_uint64*)(tab + idx[0]),
2418 *(unaligned_uint64*)(tab + idx[1])
2419 };
2420 return v_float32x4(vreinterpretq_f32_u64(vld1q_u64(elems)));
2421}
2422 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
2423{
2424 return v_float32x4(vld1q_f32(tab + idx[0]));
2425}
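NEON has no hardware gather, so the v_lut family above assembles the result from scalar loads; it is still convenient for table-driven code. A sketch of both index forms, with illustrative helper names:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helpers: gather tab[idx[i]] through a scalar index array
// and through an index register.
static cv::v_float32x4 gather4(const float* tab, const int idx[4])
{
    return cv::v_lut(tab, idx);  // {tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}
}
static cv::v_float32x4 gather4v(const float* tab, const cv::v_int32x4& idx)
{
    return cv::v_lut(tab, idx);  // indices taken from the register lanes
}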
2426
2427 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
2428{
2429 int CV_DECL_ALIGNED(32) elems[4] =
2430 {
2431 tab[vgetq_lane_s32(idxvec.val, 0)],
2432 tab[vgetq_lane_s32(idxvec.val, 1)],
2433 tab[vgetq_lane_s32(idxvec.val, 2)],
2434 tab[vgetq_lane_s32(idxvec.val, 3)]
2435 };
2436 return v_int32x4(vld1q_s32(elems));
2437}
2438
2439 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
2440{
2441 unsigned CV_DECL_ALIGNED(32) elems[4] =
2442 {
2443 tab[vgetq_lane_s32(idxvec.val, 0)],
2444 tab[vgetq_lane_s32(idxvec.val, 1)],
2445 tab[vgetq_lane_s32(idxvec.val, 2)],
2446 tab[vgetq_lane_s32(idxvec.val, 3)]
2447 };
2448 return v_uint32x4(vld1q_u32(elems));
2449}
2450
2451 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
2452{
2453 float CV_DECL_ALIGNED(32) elems[4] =
2454 {
2455 tab[vgetq_lane_s32(idxvec.val, 0)],
2456 tab[vgetq_lane_s32(idxvec.val, 1)],
2457 tab[vgetq_lane_s32(idxvec.val, 2)],
2458 tab[vgetq_lane_s32(idxvec.val, 3)]
2459 };
2460 return v_float32x4(vld1q_f32(elems));
2461}
2462
2463 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
2464{
2465 /*int CV_DECL_ALIGNED(32) idx[4];
2466 v_store(idx, idxvec);
2467
2468 float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
2469 float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));
2470
2471 float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
2472 x = v_float32x4(xxyy.val[0]);
2473 y = v_float32x4(xxyy.val[1]);*/
2474 int CV_DECL_ALIGNED(32) idx[4];
2475 v_store_aligned(idx, idxvec);
2476
2477 x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
2478 y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
2479}
2480
2481inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
2482{
2483 return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200))));
2484}
2485 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
2486 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
2487{
2488 return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400))));
2489}
2490 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
2491
2492 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
2493{
2494 return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)))));
2495}
2496 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
2497 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
2498{
2499 int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val));
2500 return v_int16x8(vcombine_s16(res.val[0], res.val[1]));
2501}
2502 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
2503
2504 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
2505{
2506 int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val));
2507 return v_int32x4(vcombine_s32(res.val[0], res.val[1]));
2508}
2509 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2510 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2511
2512 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
2513{
2514 return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2));
2515}
2516 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
2517
2518 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
2519{
2520 return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2)));
2521}
2522 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
2523
2524 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
2525 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
2526 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
2527
2528 #if CV_SIMD128_64F
2529 inline v_float64x2 v_lut(const double* tab, const int* idx)
2530{
2531 double CV_DECL_ALIGNED(32) elems[2] =
2532 {
2533 tab[idx[0]],
2534 tab[idx[1]]
2535 };
2536 return v_float64x2(vld1q_f64(elems));
2537}
2538
2539 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
2540{
2541 return v_float64x2(vld1q_f64(tab + idx[0]));
2542}
2543
2544 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
2545{
2546 double CV_DECL_ALIGNED(32) elems[2] =
2547 {
2548 tab[vgetq_lane_s32(idxvec.val, 0)],
2549 tab[vgetq_lane_s32(idxvec.val, 1)],
2550 };
2551 return v_float64x2(vld1q_f64(elems));
2552}
2553
2554 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
2555{
2556 int CV_DECL_ALIGNED(32) idx[4];
2557 v_store_aligned(idx, idxvec);
2558
2559 x = v_float64x2(tab[idx[0]], tab[idx[1]]);
2560 y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
2561}
2562 #endif
2563
2565 #if CV_FP16
2566 inline v_float32x4 v_load_expand(const float16_t* ptr)
2567{
2568 float16x4_t v =
2569 #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
2570 (float16x4_t)vld1_s16((const short*)ptr);
2571 #else
2572 vld1_f16((const __fp16*)ptr);
2573 #endif
2574 return v_float32x4(vcvt_f32_f16(v));
2575}
2576
2577 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
2578{
2579 float16x4_t hv = vcvt_f16_f32(v.val);
2580
2581 #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
2582 vst1_s16((short*)ptr, (int16x4_t)hv);
2583 #else
2584 vst1_f16((__fp16*)ptr, hv);
2585 #endif
2586}
2587 #else
2588 inline v_float32x4 v_load_expand(const float16_t* ptr)
2589{
2590 const int N = 4;
2591 float buf[N];
2592 for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
2593 return v_load(buf);
2594}
2595
2596 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
2597{
2598 const int N = 4;
2599 float buf[N];
2600 v_store(buf, v);
2601 for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
2602}
2603 #endif
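v_load_expand(const float16_t*) / v_pack_store(float16_t*, ...) convert between packed FP16 storage and FP32 registers, with the scalar fallback above used when CV_FP16 is not available. A sketch that scales four half-precision values in place; the helper name is illustrative:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: ptr[0..4) *= alpha, computed in fp32.
static void scale4_fp16(cv::float16_t* ptr, float alpha)
{
    cv::v_float32x4 v = cv::v_load_expand(ptr);          // fp16 -> fp32
    cv::v_pack_store(ptr, v * cv::v_setall_f32(alpha));  // fp32 -> fp16
}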
2604
2605 inline void v_cleanup() {}
2606
2607CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2608
2610
2611}
2612
2613 #endif