OpenCV 4.5.3 (Japanese machine-translated edition)
intrin_vsx.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
4
5 #ifndef OPENCV_HAL_VSX_HPP
6 #define OPENCV_HAL_VSX_HPP
7
8 #include <algorithm>
9 #include "opencv2/core/utility.hpp"
10
11 #define CV_SIMD128 1
12 #define CV_SIMD128_64F 1
13
14 namespace cv
15{
16
18
19CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
20
22
23 struct v_uint8x16
24{
25 typedef uchar lane_type;
26 enum { nlanes = 16 };
27 vec_uchar16 val;
28
29 explicit v_uint8x16(const vec_uchar16& v) : val(v)
30 {}
31 v_uint8x16()
32 {}
33 v_uint8x16(vec_bchar16 v) : val(vec_uchar16_c(v))
34 {}
35 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
36 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
37 : val(vec_uchar16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
38 {}
39
40 static inline v_uint8x16 zero() { return v_uint8x16(vec_uchar16_z); }
41
42 uchar get0() const
43 { return vec_extract(val, 0); }
44};
45
46 struct v_int8x16
47{
48 typedef schar lane_type;
49 enum { nlanes = 16 };
50 vec_char16 val;
51
52 explicit v_int8x16(const vec_char16& v) : val(v)
53 {}
54 v_int8x16()
55 {}
56 v_int8x16(vec_bchar16 v) : val(vec_char16_c(v))
57 {}
58 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
59 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
60 : val(vec_char16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
61 {}
62
63 static inline v_int8x16 zero() { return v_int8x16(vec_char16_z); }
64
65 schar get0() const
66 { return vec_extract(val, 0); }
67};
68
69 struct v_uint16x8
70{
71 typedef ushort lane_type;
72 enum { nlanes = 8 };
73 vec_ushort8 val;
74
75 explicit v_uint16x8(const vec_ushort8& v) : val(v)
76 {}
77 v_uint16x8()
78 {}
79 v_uint16x8(vec_bshort8 v) : val(vec_ushort8_c(v))
80 {}
81 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
82 : val(vec_ushort8_set(v0, v1, v2, v3, v4, v5, v6, v7))
83 {}
84
85 static inline v_uint16x8 zero() { return v_uint16x8(vec_ushort8_z); }
86
87 ushort get0() const
88 { return vec_extract(val, 0); }
89};
90
91 struct v_int16x8
92{
93 typedef short lane_type;
94 enum { nlanes = 8 };
95 vec_short8 val;
96
97 explicit v_int16x8(const vec_short8& v) : val(v)
98 {}
99 v_int16x8()
100 {}
101 v_int16x8(vec_bshort8 v) : val(vec_short8_c(v))
102 {}
103 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
104 : val(vec_short8_set(v0, v1, v2, v3, v4, v5, v6, v7))
105 {}
106
107 static inline v_int16x8 zero() { return v_int16x8(vec_short8_z); }
108
109 short get0() const
110 { return vec_extract(val, 0); }
111};
112
113 struct v_uint32x4
114{
115 typedef unsigned lane_type;
116 enum { nlanes = 4 };
117 vec_uint4 val;
118
119 explicit v_uint32x4(const vec_uint4& v) : val(v)
120 {}
121 v_uint32x4()
122 {}
123 v_uint32x4(vec_bint4 v) : val(vec_uint4_c(v))
124 {}
125 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) : val(vec_uint4_set(v0, v1, v2, v3))
126 {}
127
128 static inline v_uint32x4 zero() { return v_uint32x4(vec_uint4_z); }
129
130 uint get0() const
131 { return vec_extract(val, 0); }
132};
133
134 struct v_int32x4
135{
136 typedef int lane_type;
137 enum { nlanes = 4 };
138 vec_int4 val;
139
140 explicit v_int32x4(const vec_int4& v) : val(v)
141 {}
142 v_int32x4()
143 {}
144 v_int32x4(vec_bint4 v) : val(vec_int4_c(v))
145 {}
146 v_int32x4(int v0, int v1, int v2, int v3) : val(vec_int4_set(v0, v1, v2, v3))
147 {}
148
149 static inline v_int32x4 zero() { return v_int32x4(vec_int4_z); }
150
151 int get0() const
152 { return vec_extract(val, 0); }
153};
154
155 struct v_float32x4
156{
157 typedef float lane_type;
158 enum { nlanes = 4 };
159 vec_float4 val;
160
161 explicit v_float32x4(const vec_float4& v) : val(v)
162 {}
163 v_float32x4()
164 {}
165 v_float32x4(vec_bint4 v) : val(vec_float4_c(v))
166 {}
167 v_float32x4(float v0, float v1, float v2, float v3) : val(vec_float4_set(v0, v1, v2, v3))
168 {}
169
170 static inline v_float32x4 zero() { return v_float32x4(vec_float4_z); }
171
172 float get0() const
173 { return vec_extract(val, 0); }
174};
175
176 struct v_uint64x2
177{
178 typedef uint64 lane_type;
179 enum { nlanes = 2 };
180 vec_udword2 val;
181
182 explicit v_uint64x2(const vec_udword2& v) : val(v)
183 {}
184 v_uint64x2()
185 {}
186 v_uint64x2(vec_bdword2 v) : val(vec_udword2_c(v))
187 {}
188 v_uint64x2(uint64 v0, uint64 v1) : val(vec_udword2_set(v0, v1))
189 {}
190
191 static inline v_uint64x2 zero() { return v_uint64x2(vec_udword2_z); }
192
193 uint64 get0() const
194 { return vec_extract(val, 0); }
195};
196
197 struct v_int64x2
198{
199 typedef int64 lane_type;
200 enum { nlanes = 2 };
201 vec_dword2 val;
202
203 explicit v_int64x2(const vec_dword2& v) : val(v)
204 {}
205 v_int64x2()
206 {}
207 v_int64x2(vec_bdword2 v) : val(vec_dword2_c(v))
208 {}
209 v_int64x2(int64 v0, int64 v1) : val(vec_dword2_set(v0, v1))
210 {}
211
212 static inline v_int64x2 zero() { return v_int64x2(vec_dword2_z); }
213
214 int64 get0() const
215 { return vec_extract(val, 0); }
216};
217
218 struct v_float64x2
219{
220 typedef double lane_type;
221 enum { nlanes = 2 };
222 vec_double2 val;
223
224 explicit v_float64x2(const vec_double2& v) : val(v)
225 {}
226 v_float64x2()
227 {}
228 v_float64x2(vec_bdword2 v) : val(vec_double2_c(v))
229 {}
230 v_float64x2(double v0, double v1) : val(vec_double2_set(v0, v1))
231 {}
232
233 static inline v_float64x2 zero() { return v_float64x2(vec_double2_z); }
234
235 double get0() const
236 { return vec_extract(val, 0); }
237};
238
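Usage sketch (not part of the header): these wrapper types are normally reached through OpenCV's dispatcher header rather than by including this file directly. Assuming a VSX-enabled build and #include <opencv2/core/hal/intrin.hpp>, a register can be built from scalars and its first lane read back:

    cv::v_float32x4 a(1.f, 2.f, 3.f, 4.f);     // four float lanes in one VSX register
    float first = a.get0();                     // lane 0 -> 1.f
    cv::v_uint8x16 z = cv::v_uint8x16::zero();  // all 16 lanes cleared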
239 #define OPENCV_HAL_IMPL_VSX_EXTRACT_N(_Tpvec, _Tp) \
240 template<int i> inline _Tp v_extract_n(VSX_UNUSED(_Tpvec v)) { return vec_extract(v.val, i); }
241
242OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint8x16, uchar)
243OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int8x16, schar)
244OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint16x8, ushort)
245OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int16x8, short)
246OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint32x4, uint)
247OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int32x4, int)
248OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint64x2, uint64)
249OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int64x2, int64)
250OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float32x4, float)
251OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float64x2, double)
252
253
254
255 /*
256 * clang-5 aborts while parsing "vec_xxx_c" only when it appears
257 * inside a function template that is defined by a preprocessor macro.
258 *
259 * If vec_xxx_c is defined as a C++ cast, clang-5 accepts it.
260 */
261 #define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast) \
262 inline _Tpvec v_setzero_##suffix() { return _Tpvec(vec_splats((_Tp)0)); } \
263 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v));} \
264 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a) \
265 { return _Tpvec((cast)a.val); }
266
267OPENCV_HAL_IMPL_VSX_INITVEC(v_uint8x16, uchar, u8, vec_uchar16)
268OPENCV_HAL_IMPL_VSX_INITVEC(v_int8x16, schar, s8, vec_char16)
269OPENCV_HAL_IMPL_VSX_INITVEC(v_uint16x8, ushort, u16, vec_ushort8)
270OPENCV_HAL_IMPL_VSX_INITVEC(v_int16x8, short, s16, vec_short8)
271OPENCV_HAL_IMPL_VSX_INITVEC(v_uint32x4, uint, u32, vec_uint4)
272OPENCV_HAL_IMPL_VSX_INITVEC(v_int32x4, int, s32, vec_int4)
273OPENCV_HAL_IMPL_VSX_INITVEC(v_uint64x2, uint64, u64, vec_udword2)
274OPENCV_HAL_IMPL_VSX_INITVEC(v_int64x2, int64, s64, vec_dword2)
275OPENCV_HAL_IMPL_VSX_INITVEC(v_float32x4, float, f32, vec_float4)
276OPENCV_HAL_IMPL_VSX_INITVEC(v_float64x2, double, f64, vec_double2)
277
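A short sketch of the initializers and reinterpret casts generated above (same build assumptions as the earlier example):

    using namespace cv;
    v_uint8x16 zeros = v_setzero_u8();            // every lane = 0
    v_float32x4 ones = v_setall_f32(1.0f);        // every lane = 1.0f
    v_int32x4 bits = v_reinterpret_as_s32(ones);  // same 128 bits viewed as int32 lanes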
278 #define OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, ld, ld_a, st, st_a) \
279 inline _Tpvec v_load(const _Tp* ptr) \
280 { return _Tpvec(ld(0, ptr)); } \
281 inline _Tpvec v_load_aligned(VSX_UNUSED(const _Tp* ptr)) \
282 { return _Tpvec(ld_a(0, ptr)); } \
283 inline _Tpvec v_load_low(const _Tp* ptr) \
284 { return _Tpvec(vec_ld_l8(ptr)); } \
285 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
286 { return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); } \
287 inline void v_store(_Tp* ptr, const _Tpvec& a) \
288 { st(a.val, 0, ptr); } \
289 inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
290 { st_a(a.val, 0, ptr); } \
291 inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
292 { st_a(a.val, 0, ptr); } \
293 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
294 { if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \
295 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
296 { vec_st_l8(a.val, ptr); } \
297 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
298 { vec_st_h8(a.val, ptr); }
299
300 // Work around a gcc bug for aligned ld/st:
301 // if the runtime check for vec_ld/st fails, we fall back to unaligned ld/st.
302 // https://github.com/opencv/opencv/issues/13211
303 #ifdef CV_COMPILER_VSX_BROKEN_ALIGNED
304 #define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
305 OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vsx_ld, vsx_st, vsx_st)
306 #else
307 #define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
308 OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vec_ld, vsx_st, vec_st)
309 #endif
310
311OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint8x16, uchar)
312OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int8x16, schar)
313OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint16x8, ushort)
314OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int16x8, short)
315OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint32x4, uint)
316OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int32x4, int)
317OPENCV_HAL_IMPL_VSX_LOADSTORE(v_float32x4, float)
318
319OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_float64x2, double, vsx_ld, vsx_ld, vsx_st, vsx_st)
320OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_uint64x2, uint64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
321OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_int64x2, int64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
322
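The load/store wrappers generated above map to vsx_ld/vsx_st (or vec_ld/vec_st for the aligned variants); a minimal sketch:

    float src[4] = {0.5f, 1.5f, 2.5f, 3.5f};
    float dst[4];
    cv::v_float32x4 r = cv::v_load(src);  // unaligned load of 4 floats
    cv::v_store(dst, r);                  // unaligned store
    // v_load_aligned / v_store_aligned additionally require 16-byte aligned pointers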
323
324
325 /* deinterleave & interleave */
326 #define OPENCV_HAL_IMPL_VSX_INTERLEAVE(_Tp, _Tpvec) \
327 inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b) \
328 { vec_ld_deinterleave(ptr, a.val, b.val);} \
329 inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, \
330 _Tpvec& b, _Tpvec& c) \
331 { vec_ld_deinterleave(ptr, a.val, b.val, c.val); } \
332 inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b, \
333 _Tpvec& c, _Tpvec& d) \
334 { vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); } \
335 inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
336 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
337 { vec_st_interleave(a.val, b.val, ptr); } \
338 inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, \
339 const _Tpvec& b, const _Tpvec& c, \
340 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
341 { vec_st_interleave(a.val, b.val, c.val, ptr); } \
342 inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
343 const _Tpvec& c, const _Tpvec& d, \
344 hal::StoreMode /*mode*/ =hal::STORE_UNALIGNED) \
345 { vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }
346
347OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)
348OPENCV_HAL_IMPL_VSX_INTERLEAVE(schar, v_int8x16)
349OPENCV_HAL_IMPL_VSX_INTERLEAVE(ushort, v_uint16x8)
350OPENCV_HAL_IMPL_VSX_INTERLEAVE(short, v_int16x8)
351OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4)
352OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4)
353OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4)
354OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2)
355OPENCV_HAL_IMPL_VSX_INTERLEAVE(int64, v_int64x2)
356OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint64, v_uint64x2)
357
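Sketch of channel deinterleaving with the helpers above, e.g. for packed BGR pixels (buffer contents are placeholders):

    uchar bgr[48] = {0};                    // 16 packed B,G,R triples, filled elsewhere
    cv::v_uint8x16 b, g, r;
    cv::v_load_deinterleave(bgr, b, g, r);  // split into one register per channel
    cv::v_store_interleave(bgr, b, g, r);   // pack them back as B,G,R,B,G,R,...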
358 /* Expand */
359 #define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) \
360 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
361 { \
362 b0.val = fh(a.val); \
363 b1.val = fl(a.val); \
364 } \
365 inline _Tpwvec v_expand_low(const _Tpvec& a) \
366 { return _Tpwvec(fh(a.val)); } \
367 inline _Tpwvec v_expand_high(const _Tpvec& a) \
368 { return _Tpwvec(fl(a.val)); } \
369 inline _Tpwvec v_load_expand(const _Tp* ptr) \
370 { return _Tpwvec(fh(vec_ld_l8(ptr))); }
371
372OPENCV_HAL_IMPL_VSX_EXPAND(v_uint8x16, v_uint16x8, uchar, vec_unpacklu, vec_unpackhu)
373OPENCV_HAL_IMPL_VSX_EXPAND(v_int8x16, v_int16x8, schar, vec_unpackl, vec_unpackh)
374OPENCV_HAL_IMPL_VSX_EXPAND(v_uint16x8, v_uint32x4, ushort, vec_unpacklu, vec_unpackhu)
375OPENCV_HAL_IMPL_VSX_EXPAND(v_int16x8, v_int32x4, short, vec_unpackl, vec_unpackh)
376OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpackhu)
377OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
378
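Widening sketch using the expand helpers above:

    uchar buf[16] = {0};
    cv::v_uint8x16 a = cv::v_load(buf);
    cv::v_uint16x8 lo, hi;
    cv::v_expand(a, lo, hi);                    // lanes 0..7 and 8..15 widened to 16 bit
    cv::v_uint16x8 w = cv::v_load_expand(buf);  // load 8 bytes and widen in one step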
379 /* Load and zero-extend a 4-byte value into the second dword; the first dword is don't-care. */
380 #if !defined(CV_COMPILER_VSX_BROKEN_ASM)
381 #define _LXSIWZX(out, ptr, T) __asm__ ("lxsiwzx %x0, 0, %1\r\n" : "=wa"(out) : "r" (ptr) : "memory");
382 #else
383 /* This is compiler-agnostic, but will introduce an unneeded splat on the critical path. */
384 #define _LXSIWZX(out, ptr, T) out = (T)vec_udword2_sp(*(uint32_t*)(ptr));
385 #endif
386
387 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
388{
389 // Zero-extend the extra 24 bytes instead of unpacking; this is usually faster in small kernels.
390 // Note that the loaded value is zero-extended, i.e. the upper bytes are zeroed.
391 vec_uchar16 pmu = {8, 12, 12, 12, 9, 12, 12, 12, 10, 12, 12, 12, 11, 12, 12, 12};
392 vec_uchar16 out;
393
394 _LXSIWZX(out, ptr, vec_uchar16);
395 out = vec_perm(out, out, pmu);
396 return v_uint32x4((vec_uint4)out);
397}
398
399 inline v_int32x4 v_load_expand_q(const schar* ptr)
400{
401 vec_char16 out;
402 vec_short8 outs;
403 vec_int4 outw;
404
405 _LXSIWZX(out, ptr, vec_char16);
406 outs = vec_unpackl(out);
407 outw = vec_unpackh(outs);
408 return v_int32x4(outw);
409}
410
411 /* pack */
412 #define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack) \
413 inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
414 { \
415 return _Tpvec(pkfnc(a.val, b.val)); \
416 } \
417 inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
418 { \
419 vec_st_l8(pkfnc(a.val, a.val), ptr); \
420 } \
421 template<int n> \
422 inline _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
423 { \
424 const __vector _Tpvn vn = vec_splats((_Tpvn)n); \
425 const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1))); \
426 return _Tpvec(pkfnc(sfnc(addfnc(a.val, delta), vn), sfnc(addfnc(b.val, delta), vn))); \
427 } \
428 template<int n> \
429 inline void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
430 { \
431 const __vector _Tpvn vn = vec_splats((_Tpvn)n); \
432 const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1))); \
433 vec_st_l8(pkfnc(sfnc(addfnc(a.val, delta), vn), delta), ptr); \
434 }
435
436OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_uint16x8, unsigned short, unsigned short,
437 vec_sr, vec_packs, vec_adds, pack)
438OPENCV_HAL_IMPL_VSX_PACK(v_int8x16, schar, v_int16x8, unsigned short, short,
439 vec_sra, vec_packs, vec_adds, pack)
440
441OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_uint32x4, unsigned int, unsigned int,
442 vec_sr, vec_packs, vec_add, pack)
443OPENCV_HAL_IMPL_VSX_PACK(v_int16x8, short, v_int32x4, unsigned int, int,
444 vec_sra, vec_packs, vec_add, pack)
445
446OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_uint64x2, unsigned long long, unsigned long long,
447 vec_sr, vec_pack, vec_add, pack)
448OPENCV_HAL_IMPL_VSX_PACK(v_int32x4, int, v_int64x2, unsigned long long, long long,
449 vec_sra, vec_pack, vec_add, pack)
450
451OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_int16x8, unsigned short, short,
452 vec_sra, vec_packsu, vec_adds, pack_u)
453OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int,
454 vec_sra, vec_packsu, vec_add, pack_u)
455 // The following variant is not implemented on other platforms:
456 //OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long,
457 // vec_sra, vec_packsu, vec_add, pack_u)
458
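Sketch of the saturating pack helpers generated above:

    using namespace cv;
    v_int16x8 a = v_setall_s16(300), b = v_setall_s16(-300);
    v_int8x16  p  = v_pack(a, b);          // saturates to 127 / -128
    v_uint8x16 pu = v_pack_u(a, b);        // saturates to 255 / 0
    v_int8x16  rp = v_rshr_pack<2>(a, b);  // rounded right shift by 2, then saturating pack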
459 // pack boolean
460 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
461{
462 vec_uchar16 ab = vec_pack(a.val, b.val);
463 return v_uint8x16(ab);
464}
465
466 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
467 const v_uint32x4& c, const v_uint32x4& d)
468{
469 vec_ushort8 ab = vec_pack(a.val, b.val);
470 vec_ushort8 cd = vec_pack(c.val, d.val);
471 return v_uint8x16(vec_pack(ab, cd));
472}
473
474 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
475 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
476 const v_uint64x2& g, const v_uint64x2& h)
477{
478 vec_uint4 ab = vec_pack(a.val, b.val);
479 vec_uint4 cd = vec_pack(c.val, d.val);
480 vec_uint4 ef = vec_pack(e.val, f.val);
481 vec_uint4 gh = vec_pack(g.val, h.val);
482
483 vec_ushort8 abcd = vec_pack(ab, cd);
484 vec_ushort8 efgh = vec_pack(ef, gh);
485 return v_uint8x16(vec_pack(abcd, efgh));
486}
487
488 /* Recombine */
489 template <typename _Tpvec>
490 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
491{
492 b0.val = vec_mergeh(a0.val, a1.val);
493 b1.val = vec_mergel(a0.val, a1.val);
494}
495
496 template <typename _Tpvec>
497 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)
498{ return _Tpvec(vec_mergesql(a.val, b.val)); }
499
500 template <typename _Tpvec>
501 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)
502{ return _Tpvec(vec_mergesqh(a.val, b.val)); }
503
504 template <typename _Tpvec>
505 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
506{
507 c.val = vec_mergesqh(a.val, b.val);
508 d.val = vec_mergesql(a.val, b.val);
509}
510
512
513 /* Element-wise binary and unary operations */
515 #define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) \
516 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
517 { return _Tpvec(intrin(a.val, b.val)); } \
518 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
519 { a.val = intrin(a.val, b.val); return a; }
520
521OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds)
522OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs)
523OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
524OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
525OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
526OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
527OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
528OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
529OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
530OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
531OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
532OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add)
533OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub)
534OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul)
535OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add)
536OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub)
537OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul)
538OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div)
539OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add)
540OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub)
541OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul)
542OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div)
543OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add)
544OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
545OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
546OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
547
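Element-wise arithmetic sketch; note that the 8/16-bit operators above use the saturating vec_adds/vec_subs:

    using namespace cv;
    v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
    v_uint8x16 s = a + b;                                   // saturates to 255 per lane
    v_float32x4 f = v_setall_f32(2.f) * v_setall_f32(3.f);  // 6.f in every lane
    f += v_setall_f32(1.f);                                 // 7.f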
548 // saturating multiply
549 #define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \
550 inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
551 { \
552 _Tpwvec c, d; \
553 v_mul_expand(a, b, c, d); \
554 return v_pack(c, d); \
555 } \
556 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
557 { a = a * b; return a; }
558
559OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8)
560OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8)
561OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int16x8, v_int32x4)
562OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint16x8, v_uint32x4)
563
564 template<typename Tvec, typename Twvec>
565 inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d)
566{
567 Twvec p0 = Twvec(vec_mule(a.val, b.val));
568 Twvec p1 = Twvec(vec_mulo(a.val, b.val));
569 v_zip(p0, p1, c, d);
570}
571
572 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
573{
574 vec_int4 p0 = vec_mule(a.val, b.val);
575 vec_int4 p1 = vec_mulo(a.val, b.val);
576 static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
577 return v_int16x8(vec_perm(vec_short8_c(p0), vec_short8_c(p1), perm));
578}
579 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
580{
581 vec_uint4 p0 = vec_mule(a.val, b.val);
582 vec_uint4 p1 = vec_mulo(a.val, b.val);
583 static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
584 return v_uint16x8(vec_perm(vec_ushort8_c(p0), vec_ushort8_c(p1), perm));
585}
586
588 #define OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin) \
589 template<typename _Tpvec> \
590 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
591 { return _Tpvec(intrin(a.val, b.val)); }
592
593OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
594OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
595OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul)
596
597
598 #define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
599 inline _Tpvec operator << (const _Tpvec& a, int imm) \
600 { return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
601 inline _Tpvec operator >> (const _Tpvec& a, int imm) \
602 { return _Tpvec(shr(a.val, splfunc(imm))); } \
603 template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
604 { return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
605 template<int imm> inline _Tpvec v_shr(const _Tpvec& a) \
606 { return _Tpvec(shr(a.val, splfunc(imm))); }
607
608OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
609OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
610OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
611OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)
612 // algebraic right shift
613OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_sra, vec_uchar16_sp)
614OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_sra, vec_ushort8_sp)
615OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_sra, vec_uint4_sp)
616OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
617
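Shift sketch; signed types use the arithmetic ("algebraic") right shift vec_sra:

    using namespace cv;
    v_uint16x8 u = v_setall_u16(0x00F0);
    v_uint16x8 l = u << 4;                  // 0x0F00
    v_uint16x8 r = v_shr<4>(u);             // 0x000F
    v_int16x8  s = v_setall_s16(-16) >> 2;  // sign-preserving -> -4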
618
619 #define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
620 OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \
621 OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or) \
622 OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \
623 inline _Tpvec operator ~ (const _Tpvec& a) \
624 { return _Tpvec(vec_not(a.val)); }
625
626OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16)
627OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int8x16)
628OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint16x8)
629OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int16x8)
630OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint32x4)
631OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int32x4)
632OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint64x2)
633OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int64x2)
634OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float32x4)
635OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float64x2)
636
637
638 #define OPENCV_HAL_IMPL_VSX_SELECT(_Tpvec, cast) \
639 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
640 { return _Tpvec(vec_sel(b.val, a.val, cast(mask.val))); }
641
642OPENCV_HAL_IMPL_VSX_SELECT(v_uint8x16, vec_bchar16_c)
643OPENCV_HAL_IMPL_VSX_SELECT(v_int8x16, vec_bchar16_c)
644OPENCV_HAL_IMPL_VSX_SELECT(v_uint16x8, vec_bshort8_c)
645OPENCV_HAL_IMPL_VSX_SELECT(v_int16x8, vec_bshort8_c)
646OPENCV_HAL_IMPL_VSX_SELECT(v_uint32x4, vec_bint4_c)
647OPENCV_HAL_IMPL_VSX_SELECT(v_int32x4, vec_bint4_c)
648OPENCV_HAL_IMPL_VSX_SELECT(v_float32x4, vec_bint4_c)
649OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c)
650
651
652 #define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) \
653 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
654 { return _Tpvec(vec_cmpeq(a.val, b.val)); } \
655 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
656 { return _Tpvec(vec_cmpne(a.val, b.val)); } \
657 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
658 { return _Tpvec(vec_cmplt(a.val, b.val)); } \
659 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
660 { return _Tpvec(vec_cmpgt(a.val, b.val)); } \
661 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
662 { return _Tpvec(vec_cmple(a.val, b.val)); } \
663 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
664 { return _Tpvec(vec_cmpge(a.val, b.val)); }
665
666OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16)
667OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int8x16)
668OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint16x8)
669OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int16x8)
670OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint32x4)
671OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int32x4)
672OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float32x4)
673OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float64x2)
674OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint64x2)
675OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
676
677 inline v_float32x4 v_not_nan(const v_float32x4& a)
678{ return v_float32x4(vec_cmpeq(a.val, a.val)); }
679 inline v_float64x2 v_not_nan(const v_float64x2& a)
680{ return v_float64x2(vec_cmpeq(a.val, a.val)); }
681
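Comparison results are all-ones/all-zero lane masks, which pair naturally with v_select defined earlier; a sketch:

    using namespace cv;
    v_float32x4 x = v_setall_f32(0.5f), y = v_setall_f32(1.5f);
    v_float32x4 mask = x < y;              // all-ones lanes where the predicate holds
    v_float32x4 m = v_select(mask, x, y);  // per lane: mask ? x : y
    v_float32x4 ok = v_not_nan(x);         // all-ones where x is not NaN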
683OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
684OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
685
686
687 #define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast) \
688 template<int imm> \
689 inline _Tpvec v_rotate_##suffix(const _Tpvec& a) \
690 { \
691 const int wd = imm * sizeof(typename _Tpvec::lane_type); \
692 if (wd > 15) \
693 return _Tpvec::zero(); \
694 return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3))); \
695 }
696
697 #define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast) \
698 OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
699 OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)
700
701OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
702OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16, vec_char16)
703OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
704OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8)
705OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
706OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4)
707OPENCV_IMPL_VSX_ROTATE_LR(v_float32x4, vec_float4)
708OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
709OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2)
710OPENCV_IMPL_VSX_ROTATE_LR(v_float64x2, vec_double2)
711
712 template<int imm, typename _Tpvec>
713 inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
714{
715 enum { CV_SHIFT = 16 - imm * (sizeof(typename _Tpvec::lane_type)) };
716 if (CV_SHIFT == 16)
717 return a;
718 #ifdef __IBMCPP__
719 return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT & 15));
720 #else
721 return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT));
722 #endif
723}
724
725 template<int imm, typename _Tpvec>
726 inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
727{
728 enum { CV_SHIFT = imm * (sizeof(typename _Tpvec::lane_type)) };
729 if (CV_SHIFT == 16)
730 return b;
731 return _Tpvec(vec_sld(a.val, b.val, CV_SHIFT));
732}
733
734 #define OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, suffix, rg1, rg2) \
735 template<int imm> \
736 inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
737 { \
738 if (imm == 1) \
739 return _Tpvec(vec_permi(rg1.val, rg2.val, 2)); \
740 return imm ? b : a; \
741 }
742
743 #define OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(_Tpvec) \
744 OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, left, b, a) \
745 OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, right, a, b)
746
747OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
748OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
749OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)
750
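Lane-rotation sketch; the single-argument forms shift in zeros, the two-argument forms shift in lanes of the second vector:

    using namespace cv;
    v_int32x4 a(1, 2, 3, 4), b(5, 6, 7, 8);
    v_int32x4 r1 = v_rotate_right<1>(a);     // 2, 3, 4, 0
    v_int32x4 r2 = v_rotate_right<1>(a, b);  // 2, 3, 4, 5
    v_int32x4 l1 = v_rotate_left<1>(a);      // 0, 1, 2, 3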
751 /* Reverse */
752 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
753{
754 static const vec_uchar16 perm = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
755 vec_uchar16 vec = (vec_uchar16)a.val;
756 return v_uint8x16(vec_perm(vec, vec, perm));
757}
758
759 inline v_int8x16 v_reverse(const v_int8x16 &a)
760{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
761
762 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
763{
764 static const vec_uchar16 perm = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
765 vec_uchar16 vec = (vec_uchar16)a.val;
766 return v_reinterpret_as_u16(v_uint8x16(vec_perm(vec, vec, perm)));
767}
768
769 inline v_int16x8 v_reverse(const v_int16x8 &a)
770{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
771
772 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
773{
774 static const vec_uchar16 perm = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
775 vec_uchar16 vec = (vec_uchar16)a.val;
776 return v_reinterpret_as_u32(v_uint8x16(vec_perm(vec, vec, perm)));
777}
778
779 inline v_int32x4 v_reverse(const v_int32x4 &a)
780{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
781
782 inline v_float32x4 v_reverse(const v_float32x4 &a)
783{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
784
785 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
786{
787 static const vec_uchar16 perm = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
788 vec_uchar16 vec = (vec_uchar16)a.val;
789 return v_reinterpret_as_u64(v_uint8x16(vec_perm(vec, vec, perm)));
790}
791
792 inline v_int64x2 v_reverse(const v_int64x2 &a)
793{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
794
795 inline v_float64x2 v_reverse(const v_float64x2 &a)
796{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
797
798 /* Extract */
799 template<int s, typename _Tpvec>
800 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
801{ return v_rotate_right<s>(a, b); }
802
804
806 inline uint v_reduce_sum(const v_uint8x16& a)
807{
808 const vec_uint4 zero4 = vec_uint4_z;
809 vec_uint4 sum4 = vec_sum4s(a.val, zero4);
810 return (uint)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
811}
812 inline int v_reduce_sum(const v_int8x16& a)
813{
814 const vec_int4 zero4 = vec_int4_z;
815 vec_int4 sum4 = vec_sum4s(a.val, zero4);
816 return (int)vec_extract(vec_sums(sum4, zero4), 3);
817}
818 inline int v_reduce_sum(const v_int16x8& a)
819{
820 const vec_int4 zero = vec_int4_z;
821 return saturate_cast<int>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
822}
823 inline uint v_reduce_sum(const v_uint16x8& a)
824{
825 const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
826 return saturate_cast<uint>(vec_extract(vec_sums(v4, vec_int4_z), 3));
827}
828
829 #define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
830 inline scalartype v_reduce_##suffix(const _Tpvec& a) \
831 { \
832 const _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
833 return vec_extract(func(rs, vec_sld(rs, rs, 4)), 0); \
834 }
835OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, sum, vec_add)
836OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, max, vec_max)
837OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, min, vec_min)
838OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, sum, vec_add)
839OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, max, vec_max)
840OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, min, vec_min)
841OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
842OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
843OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
844
845 inline uint64 v_reduce_sum(const v_uint64x2& a)
846{
847 return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
848}
849 inline int64 v_reduce_sum(const v_int64x2& a)
850{
851 return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
852}
853 inline double v_reduce_sum(const v_float64x2& a)
854{
855 return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
856}
857
858 #define OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) \
859 inline scalartype v_reduce_##suffix(const _Tpvec& a) \
860 { \
861 _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
862 rs = func(rs, vec_sld(rs, rs, 4)); \
863 return vec_extract(func(rs, vec_sld(rs, rs, 2)), 0); \
864 }
865OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, max, vec_max)
866OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
867OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
868OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)
869
870 #define OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(_Tpvec, _Tpvec2, scalartype, suffix, func) \
871 inline scalartype v_reduce_##suffix(const _Tpvec& a) \
872 { \
873 _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
874 rs = func(rs, vec_sld(rs, rs, 4)); \
875 rs = func(rs, vec_sld(rs, rs, 2)); \
876 return vec_extract(func(rs, vec_sld(rs, rs, 1)), 0); \
877 }
878OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, max, vec_max)
879OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, min, vec_min)
880OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, max, vec_max)
881OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, min, vec_min)
882
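Horizontal reduction sketch:

    using namespace cv;
    v_uint8x16 a = v_setall_u8(2);
    unsigned sum = v_reduce_sum(a);  // 16 lanes * 2 = 32
    uchar    mx  = v_reduce_max(a);  // 2
    v_float32x4 f(1.f, 2.f, 3.f, 4.f);
    float fsum = v_reduce_sum(f);    // 10.f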
883 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
884 const v_float32x4& c, const v_float32x4& d)
885{
886 vec_float4 ac = vec_add(vec_mergel(a.val, c.val), vec_mergeh(a.val, c.val));
887 ac = vec_add(ac, vec_sld(ac, ac, 8));
888
889 vec_float4 bd = vec_add(vec_mergel(b.val, d.val), vec_mergeh(b.val, d.val));
890 bd = vec_add(bd, vec_sld(bd, bd, 8));
891 return v_float32x4(vec_mergeh(ac, bd));
892}
893
894 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
895{
896 const vec_uint4 zero4 = vec_uint4_z;
897 vec_uint4 sum4 = vec_sum4s(vec_absd(a.val, b.val), zero4);
898 return (unsigned)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
899}
900 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
901{
902 const vec_int4 zero4 = vec_int4_z;
903 vec_char16 ad = vec_abss(vec_subs(a.val, b.val));
904 vec_int4 sum4 = vec_sum4s(ad, zero4);
905 return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
906}
907 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
908{
909 vec_ushort8 ad = vec_absd(a.val, b.val);
910 VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)) + vec_int4_c(vec_unpacklu(ad)), vec_int4_z);
911 return (unsigned)vec_extract(sum, 3);
912}
913 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
914{
915 const vec_int4 zero4 = vec_int4_z;
916 vec_short8 ad = vec_abss(vec_subs(a.val, b.val));
917 vec_int4 sum4 = vec_sum4s(ad, zero4);
918 return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
919}
920 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
921{
922 const vec_uint4 ad = vec_absd(a.val, b.val);
923 const vec_uint4 rd = vec_add(ad, vec_sld(ad, ad, 8));
924 return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
925}
926 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
927{
928 vec_int4 ad = vec_abss(vec_sub(a.val, b.val));
929 return (unsigned)vec_extract(vec_sums(ad, vec_int4_z), 3);
930}
931 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
932{
933 const vec_float4 ad = vec_abs(vec_sub(a.val, b.val));
934 const vec_float4 rd = vec_add(ad, vec_sld(ad, ad, 8));
935 return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
936}
937
939 inline v_uint8x16 v_popcount(const v_uint8x16& a)
940{ return v_uint8x16(vec_popcntu(a.val)); }
941 inline v_uint8x16 v_popcount(const v_int8x16& a)
942{ return v_uint8x16(vec_popcntu(a.val)); }
943 inline v_uint16x8 v_popcount(const v_uint16x8& a)
944{ return v_uint16x8(vec_popcntu(a.val)); }
945 inline v_uint16x8 v_popcount(const v_int16x8& a)
946{ return v_uint16x8(vec_popcntu(a.val)); }
947 inline v_uint32x4 v_popcount(const v_uint32x4& a)
948{ return v_uint32x4(vec_popcntu(a.val)); }
949 inline v_uint32x4 v_popcount(const v_int32x4& a)
950{ return v_uint32x4(vec_popcntu(a.val)); }
951 inline v_uint64x2 v_popcount(const v_uint64x2& a)
952{ return v_uint64x2(vec_popcntu(a.val)); }
953 inline v_uint64x2 v_popcount(const v_int64x2& a)
954{ return v_uint64x2(vec_popcntu(a.val)); }
955
957 inline int v_signmask(const v_uint8x16& a)
958{
959 static const vec_uchar16 qperm = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0};
960 return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
961}
962 inline int v_signmask(const v_int8x16& a)
963{ return v_signmask(v_reinterpret_as_u8(a)); }
964
965 inline int v_signmask(const v_int16x8& a)
966{
967 static const vec_uchar16 qperm = {112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
968 return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
969}
970 inline int v_signmask(const v_uint16x8& a)
971{ return v_signmask(v_reinterpret_as_s16(a)); }
972
973 inline int v_signmask(const v_int32x4& a)
974{
975 static const vec_uchar16 qperm = {96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
976 return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
977}
978 inline int v_signmask(const v_uint32x4& a)
979{ return v_signmask(v_reinterpret_as_s32(a)); }
980 inline int v_signmask(const v_float32x4& a)
981{ return v_signmask(v_reinterpret_as_s32(a)); }
982
983 inline int v_signmask(const v_int64x2& a)
984{
985 VSX_UNUSED(const vec_dword2) sv = vec_sr(a.val, vec_udword2_sp(63));
986 return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1;
987}
988 inline int v_signmask(const v_uint64x2& a)
989{ return v_signmask(v_reinterpret_as_s64(a)); }
990 inline int v_signmask(const v_float64x2& a)
991{ return v_signmask(v_reinterpret_as_s64(a)); }
992
993 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
994 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
995 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
996 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
997 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
998 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
999 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
1000 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
1001 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
1002 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
1003
1004 template<typename _Tpvec>
1005 inline bool v_check_all(const _Tpvec& a)
1006{ return vec_all_lt(a.val, _Tpvec::zero().val); }
1007 inline bool v_check_all(const v_uint8x16& a)
1008{ return v_check_all(v_reinterpret_as_s8(a)); }
1009 inline bool v_check_all(const v_uint16x8& a)
1010{ return v_check_all(v_reinterpret_as_s16(a)); }
1011 inline bool v_check_all(const v_uint32x4& a)
1012{ return v_check_all(v_reinterpret_as_s32(a)); }
1013 inline bool v_check_all(const v_uint64x2& a)
1014{ return v_check_all(v_reinterpret_as_s64(a)); }
1015 inline bool v_check_all(const v_float32x4& a)
1016{ return v_check_all(v_reinterpret_as_s32(a)); }
1017 inline bool v_check_all(const v_float64x2& a)
1018{ return v_check_all(v_reinterpret_as_s64(a)); }
1019
1020 template<typename _Tpvec>
1021 inline bool v_check_any(const _Tpvec& a)
1022{ return vec_any_lt(a.val, _Tpvec::zero().val); }
1023 inline bool v_check_any(const v_uint8x16& a)
1024{ return v_check_any(v_reinterpret_as_s8(a)); }
1025 inline bool v_check_any(const v_uint16x8& a)
1026{ return v_check_any(v_reinterpret_as_s16(a)); }
1027 inline bool v_check_any(const v_uint32x4& a)
1028{ return v_check_any(v_reinterpret_as_s32(a)); }
1029 inline bool v_check_any(const v_uint64x2& a)
1030{ return v_check_any(v_reinterpret_as_s64(a)); }
1031 inline bool v_check_any(const v_float32x4& a)
1032{ return v_check_any(v_reinterpret_as_s32(a)); }
1033 inline bool v_check_any(const v_float64x2& a)
1034{ return v_check_any(v_reinterpret_as_s64(a)); }
1035
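Mask-extraction sketch; v_signmask collects the lane sign bits into an integer, and the check_* helpers test them:

    using namespace cv;
    v_int32x4 v(-1, 2, -3, 4);
    int mask  = v_signmask(v);      // bit i set if lane i is negative -> 0b0101
    int first = v_scan_forward(v);  // index of the first set lane -> 0
    bool all  = v_check_all(v);     // false: not every lane is negative
    bool any  = v_check_any(v);     // true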
1037
1039 inline v_float32x4 v_sqrt(const v_float32x4& x)
1040{ return v_float32x4(vec_sqrt(x.val)); }
1041 inline v_float64x2 v_sqrt(const v_float64x2& x)
1042{ return v_float64x2(vec_sqrt(x.val)); }
1043
1044 inline v_float32x4 v_invsqrt(const v_float32x4& x)
1045{ return v_float32x4(vec_rsqrt(x.val)); }
1046 inline v_float64x2 v_invsqrt(const v_float64x2& x)
1047{ return v_float64x2(vec_rsqrt(x.val)); }
1048
1049 #define OPENCV_HAL_IMPL_VSX_MULADD(_Tpvec) \
1050 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1051 { return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \
1052 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1053 { return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); } \
1054 inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1055 { return _Tpvec(vec_madd(a.val, b.val, c.val)); } \
1056 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1057 { return _Tpvec(vec_madd(a.val, b.val, c.val)); }
1058
1059OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
1060OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
1061
1062 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1063{ return a * b + c; }
1064
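Fused multiply-add sketch using the helpers above (vec_madd fuses on VSX):

    using namespace cv;
    v_float32x4 a = v_setall_f32(2.f), b = v_setall_f32(3.f), c = v_setall_f32(1.f);
    v_float32x4 r = v_fma(a, b, c);     // a*b + c = 7.f per lane
    v_float32x4 m = v_magnitude(a, b);  // sqrt(a*a + b*b)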
1065 // TODO: exp, log, sin, cos
1066
1068 inline v_uint8x16 v_abs(const v_int8x16& x)
1069{ return v_uint8x16(vec_uchar16_c(vec_abs(x.val))); }
1070
1071 inline v_uint16x8 v_abs(const v_int16x8& x)
1072{ return v_uint16x8(vec_ushort8_c(vec_abs(x.val))); }
1073
1074 inline v_uint32x4 v_abs(const v_int32x4& x)
1075{ return v_uint32x4(vec_uint4_c(vec_abs(x.val))); }
1076
1077 inline v_float32x4 v_abs(const v_float32x4& x)
1078{ return v_float32x4(vec_abs(x.val)); }
1079
1080 inline v_float64x2 v_abs(const v_float64x2& x)
1081{ return v_float64x2(vec_abs(x.val)); }
1082
1084 // unsigned
1085OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd)
1086
1087 inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
1088{ return v_reinterpret_as_u8(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1089 inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
1090{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1091 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
1092{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); }
1093
1094 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
1095{ return v_abs(a - b); }
1096 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
1097{ return v_abs(a - b); }
1098
1100 inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1101{ return v_int8x16(vec_abss(vec_subs(a.val, b.val))); }
1102 inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1103{ return v_int16x8(vec_abss(vec_subs(a.val, b.val))); }
1104
1106
1108 inline v_int32x4 v_round(const v_float32x4& a)
1109{ return v_int32x4(vec_cts(vec_rint(a.val))); }
1110
1111 inline v_int32x4 v_round(const v_float64x2& a)
1112{ return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_int4_z)); }
1113
1114 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1115{ return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_ctso(vec_rint(b.val)))); }
1116
1117 inline v_int32x4 v_floor(const v_float32x4& a)
1118{ return v_int32x4(vec_cts(vec_floor(a.val))); }
1119
1120 inline v_int32x4 v_floor(const v_float64x2& a)
1121{ return v_int32x4(vec_mergesqo(vec_ctso(vec_floor(a.val)), vec_int4_z)); }
1122
1123 inline v_int32x4 v_ceil(const v_float32x4& a)
1124{ return v_int32x4(vec_cts(vec_ceil(a.val))); }
1125
1126 inline v_int32x4 v_ceil(const v_float64x2& a)
1127{ return v_int32x4(vec_mergesqo(vec_ctso(vec_ceil(a.val)), vec_int4_z)); }
1128
1129 inline v_int32x4 v_trunc(const v_float32x4& a)
1130{ return v_int32x4(vec_cts(a.val)); }
1131
1132 inline v_int32x4 v_trunc(const v_float64x2& a)
1133{ return v_int32x4(vec_mergesqo(vec_ctso(a.val), vec_int4_z)); }
1134
1136 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
1137{ return v_float32x4(vec_ctf(a.val)); }
1138
1139 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
1140{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_float4_z)); }
1141
1142 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
1143{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_cvfo(b.val))); }
1144
1145 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
1146{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }
1147
1148 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
1149{ return v_float64x2(vec_ctdo(vec_mergel(a.val, a.val))); }
1150
1151 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
1152{ return v_float64x2(vec_cvfo(vec_mergeh(a.val, a.val))); }
1153
1154 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
1155{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
1156
1157 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
1158{ return v_float64x2(vec_ctd(a.val)); }
1159
1161
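Rounding and conversion sketch; v_round uses round-to-nearest-even via vec_rint:

    using namespace cv;
    v_float32x4 f(1.3f, 1.7f, -1.3f, -1.7f);
    v_int32x4 r = v_round(f);            // 1, 2, -1, -2
    v_int32x4 t = v_trunc(f);            // 1, 1, -1, -1
    v_float64x2 lo = v_cvt_f64(r);       // lanes 0..1 as double
    v_float64x2 hi = v_cvt_f64_high(r);  // lanes 2..3 as double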
1162 inline v_int8x16 v_lut(const schar* tab, const int* idx)
1163{
1164 return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]],
1165 tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
1166}
1167 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
1168{
1169 return v_reinterpret_as_s8(v_int16x8(*(const short*)(tab+idx[0]), *(const short*)(tab+idx[1]), *(const short*)(tab+idx[2]), *(const short*)(tab+idx[3]),
1170 *(const short*)(tab+idx[4]), *(const short*)(tab+idx[5]), *(const short*)(tab+idx[6]), *(const short*)(tab+idx[7])));
1171}
1172 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
1173{
1174 return v_reinterpret_as_s8(v_int32x4(*(const int*)(tab+idx[0]), *(const int*)(tab+idx[1]), *(const int*)(tab+idx[2]), *(const int*)(tab+idx[3])));
1175}
1176 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); }
1177 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); }
1178 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); }
1179
1180 inline v_int16x8 v_lut(const short* tab, const int* idx)
1181{
1182 return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
1183}
1184 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
1185{
1186 return v_reinterpret_as_s16(v_int32x4(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
1187}
1188 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
1189{
1190 return v_reinterpret_as_s16(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
1191}
1192 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); }
1193 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); }
1194 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); }
1195
1196 inline v_int32x4 v_lut(const int* tab, const int* idx)
1197{
1198 return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1199}
1200 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
1201{
1202 return v_reinterpret_as_s32(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
1203}
1204 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
1205{
1206 return v_int32x4(vsx_ld(0, tab + idx[0]));
1207}
1208 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int*)tab, idx)); }
1209 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int*)tab, idx)); }
1210 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int*)tab, idx)); }
1211
1212 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1213{
1214 return v_int64x2(tab[idx[0]], tab[idx[1]]);
1215}
1216 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
1217{
1218 return v_int64x2(vsx_ld2(0, tab + idx[0]));
1219}
1220 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
1221 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
1222
1223 inline v_float32x4 v_lut(const float* tab, const int* idx)
1224{
1225 return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1226}
1227 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int*)tab, idx)); }
1228 inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_load(tab + *idx); }
1229
1230 inline v_float64x2 v_lut(const double* tab, const int* idx)
1231{
1232 return v_float64x2(tab[idx[0]], tab[idx[1]]);
1233}
1234 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_load(tab + *idx); }
1235
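Gather/look-up sketch with the table helpers above (table contents are placeholders):

    using namespace cv;
    short tab[64] = {0};                   // lookup table, filled elsewhere
    int idx[8] = {0, 3, 5, 7, 11, 13, 17, 19};
    v_int16x8 g  = v_lut(tab, idx);        // tab[idx[0]], ..., tab[idx[7]]
    v_int16x8 gp = v_lut_pairs(tab, idx);  // tab[idx[i]], tab[idx[i]+1] for i = 0..3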
1236 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1237{
1238 const int idx[4] = {
1239 vec_extract(idxvec.val, 0),
1240 vec_extract(idxvec.val, 1),
1241 vec_extract(idxvec.val, 2),
1242 vec_extract(idxvec.val, 3)
1243 };
1244 return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1245}
1246
1247 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1248{
1249 const int idx[4] = {
1250 vec_extract(idxvec.val, 0),
1251 vec_extract(idxvec.val, 1),
1252 vec_extract(idxvec.val, 2),
1253 vec_extract(idxvec.val, 3)
1254 };
1255 return v_uint32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1256}
1257
1258 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1259{
1260 const int idx[4] = {
1261 vec_extract(idxvec.val, 0),
1262 vec_extract(idxvec.val, 1),
1263 vec_extract(idxvec.val, 2),
1264 vec_extract(idxvec.val, 3)
1265 };
1266 return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1267}
1268
1269 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1270{
1271 const int idx[2] = {
1272 vec_extract(idxvec.val, 0),
1273 vec_extract(idxvec.val, 1)
1274 };
1275 return v_float64x2(tab[idx[0]], tab[idx[1]]);
1276}
1277
1278 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1279{
1280 vec_float4 xy0 = vec_ld_l8(tab + vec_extract(idxvec.val, 0));
1281 vec_float4 xy1 = vec_ld_l8(tab + vec_extract(idxvec.val, 1));
1282 vec_float4 xy2 = vec_ld_l8(tab + vec_extract(idxvec.val, 2));
1283 vec_float4 xy3 = vec_ld_l8(tab + vec_extract(idxvec.val, 3));
1284 vec_float4 xy02 = vec_mergeh(xy0, xy2); // x0, x2, y0, y2
1285 vec_float4 xy13 = vec_mergeh(xy1, xy3); // x1, x3, y1, y3
1286 x.val = vec_mergeh(xy02, xy13);
1287 y.val = vec_mergel(xy02, xy13);
1288}
1289 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1290{
1291 vec_double2 xy0 = vsx_ld(vec_extract(idxvec.val, 0), tab);
1292 vec_double2 xy1 = vsx_ld(vec_extract(idxvec.val, 1), tab);
1293 x.val = vec_mergeh(xy0, xy1);
1294 y.val = vec_mergel(xy0, xy1);
1295}
1296
1297 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
1298{
1299 static const vec_uchar16 perm = {0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15};
1300 return v_int8x16(vec_perm(vec.val, vec.val, perm));
1301}
1302 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
1303{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1304
1305 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
1306{
1307 static const vec_uchar16 perm = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};
1308 return v_int8x16(vec_perm(vec.val, vec.val, perm));
1309}
1310 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
1311{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1312
1313 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
1314{
1315 static const vec_uchar16 perm = {0,1, 4,5, 2,3, 6,7, 8,9, 12,13, 10,11, 14,15};
1316 return v_int16x8(vec_perm(vec.val, vec.val, perm));
1317}
1318 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec)
1319{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1320
1321 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
1322{
1323 static const vec_uchar16 perm = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};
1324 return v_int16x8(vec_perm(vec.val, vec.val, perm));
1325}
1326 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec)
1327{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1328
1329 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
1330{
1331 static const vec_uchar16 perm = {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15};
1332 return v_int32x4(vec_perm(vec.val, vec.val, perm));
1333}
1334 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec)
1335{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1336 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
1337{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1338
1339 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
1340{
1341 static const vec_uchar16 perm = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 15, 15, 15};
1342 return v_int8x16(vec_perm(vec.val, vec.val, perm));
1343}
1344 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
1345{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1346
1347 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
1348{
1349 static const vec_uchar16 perm = {0,1, 2,3, 4,5, 8,9, 10,11, 12,13, 14,15, 14,15};
1350 return v_int16x8(vec_perm(vec.val, vec.val, perm));
1351}
1352 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
1353{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1354
1355 inline v_int32x4 v_pack_triplets(const v_int32x4& vec)
1356{ return vec; }
1357 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec)
1358{ return vec; }
1359 inline v_float32x4 v_pack_triplets(const v_float32x4& vec)
1360{ return vec; }
1361
1363
1364 inline v_float32x4 v_load_expand(const float16_t* ptr)
1365{
1366 vec_ushort8 vf16 = vec_ld_l8((const ushort*)ptr);
1367 #if CV_VSX3 && defined(vec_extract_fp_from_shorth)
1368 return v_float32x4(vec_extract_fp_from_shorth(vf16));
1369 #elif CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
1370 vec_float4 vf32;
1371 __asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wa" (vf32) : "wa" (vec_mergeh(vf16, vf16)));
1372 return v_float32x4(vf32);
1373 #else
1374 const vec_int4 z = vec_int4_z, delta = vec_int4_sp(0x38000000);
1375 const vec_int4 signmask = vec_int4_sp(0x80000000);
1376 const vec_int4 maxexp = vec_int4_sp(0x7c000000);
1377 const vec_float4 deltaf = vec_float4_c(vec_int4_sp(0x38800000));
1378
1379 vec_int4 bits = vec_int4_c(vec_mergeh(vec_short8_c(z), vec_short8_c(vf16)));
1380 vec_int4 e = vec_and(bits, maxexp), sign = vec_and(bits, signmask);
1381 vec_int4 t = vec_add(vec_sr(vec_xor(bits, sign), vec_uint4_sp(3)), delta); // ((h & 0x7fff) << 13) + delta
1382 vec_int4 zt = vec_int4_c(vec_sub(vec_float4_c(vec_add(t, vec_int4_sp(1 << 23))), deltaf));
1383
1384 t = vec_add(t, vec_and(delta, vec_cmpeq(maxexp, e)));
1385 vec_bint4 zmask = vec_cmpeq(e, z);
1386 vec_int4 ft = vec_sel(t, zt, zmask);
1387 return v_float32x4(vec_float4_c(vec_or(ft, sign)));
1388 #endif
1389}
1390
1391 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
1392{
1393 // fixme: Is there any builtin op or intrinsic that covers "xvcvsphp"?
1394 #if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
1395 vec_ushort8 vf16;
1396 __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wa" (v.val));
1397 vec_st_l8(vec_mergesqe(vf16, vf16), ptr);
1398 #else
1399 const vec_int4 signmask = vec_int4_sp(0x80000000);
1400 const vec_int4 rval = vec_int4_sp(0x3f000000);
1401
1402 vec_int4 t = vec_int4_c(v.val);
1403 vec_int4 sign = vec_sra(vec_and(t, signmask), vec_uint4_sp(16));
1404 t = vec_and(vec_nor(signmask, signmask), t);
1405
1406 vec_bint4 finitemask = vec_cmpgt(vec_int4_sp(0x47800000), t);
1407 vec_bint4 isnan = vec_cmpgt(t, vec_int4_sp(0x7f800000));
1408 vec_int4 naninf = vec_sel(vec_int4_sp(0x7c00), vec_int4_sp(0x7e00), isnan);
1409 vec_bint4 tinymask = vec_cmpgt(vec_int4_sp(0x38800000), t);
1410 vec_int4 tt = vec_int4_c(vec_add(vec_float4_c(t), vec_float4_c(rval)));
1411 tt = vec_sub(tt, rval);
1412 vec_int4 odd = vec_and(vec_sr(t, vec_uint4_sp(13)), vec_int4_sp(1));
1413 vec_int4 nt = vec_add(t, vec_int4_sp(0xc8000fff));
1414 nt = vec_sr(vec_add(nt, odd), vec_uint4_sp(13));
1415 t = vec_sel(nt, tt, tinymask);
1416 t = vec_sel(naninf, t, finitemask);
1417 t = vec_or(t, sign);
1418 vec_st_l8(vec_packs(t, t), ptr);
1419 #endif
1420}
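The fallback here is the usual round-to-nearest-even FP32 -> FP16 bit conversion: 0x47800000 (65536.0f) is the overflow threshold, 0x38800000 (2^-14) the smallest normal half, 0x7c00 and 0x7e00 the half Inf/NaN patterns, and the 0xc8000fff constant rebases the exponent while the extracted odd bit implements ties-to-even. A scalar sketch of the same logic (float_to_half_bits is an illustrative helper, not the header's API):

#include <cstdint>
#include <cstring>

// Scalar equivalent of the non-VSX3 FP32 -> FP16 path (illustrative only).
static std::uint16_t float_to_half_bits(float x)
{
    std::uint32_t t;
    std::memcpy(&t, &x, sizeof t);
    std::uint32_t sign = (t >> 16) & 0x8000u;                 // half sign bit
    t &= 0x7fffffffu;                                         // |x| bits

    if (t >= 0x47800000u)                                     // overflow, Inf or NaN
        return (std::uint16_t)(sign | (t > 0x7f800000u ? 0x7e00u : 0x7c00u));

    if (t < 0x38800000u)                                      // result is a half subnormal (or zero)
    {
        // |x| + 0.5f places round(|x| * 2^24) in the mantissa, which is exactly
        // the subnormal half payload; subtracting 0x3f000000 strips the 0.5f.
        float f;
        std::memcpy(&f, &t, sizeof f);
        f += 0.5f;
        std::uint32_t u;
        std::memcpy(&u, &f, sizeof u);
        return (std::uint16_t)(sign | (u - 0x3f000000u));
    }

    std::uint32_t odd = (t >> 13) & 1u;                       // guard bit for ties-to-even
    std::uint32_t nt  = (t + 0xc8000fffu + odd) >> 13;        // rebias exponent, round, narrow
    return (std::uint16_t)(sign | nt);
}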
1421
1422 inline void v_cleanup() {}
1423
1424
1429
1431 // 16 >> 32
1432 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
1433{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
1434 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
1435{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }
1436
1437 // 32 >> 64
1438 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
1439{
1440 vec_dword2 even = vec_mule(a.val, b.val);
1441 vec_dword2 odd = vec_mulo(a.val, b.val);
1442 return v_int64x2(vec_add(even, odd));
1443}
1444 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1445{ return v_dotprod(a, b) + c; }
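vec_mule and vec_mulo return the widened products of the even- and odd-indexed 32-bit lanes, so adding the two vectors yields one 64-bit sum per adjacent lane pair. In scalar terms (illustrative only):

#include <cstdint>

// Scalar semantics of the 32 -> 64 dot product (illustrative only):
// each 64-bit lane is the sum of two adjacent 32-bit products.
static void dotprod_s32_scalar(const std::int32_t a[4], const std::int32_t b[4], std::int64_t out[2])
{
    for (int i = 0; i < 2; ++i)
        out[i] = (std::int64_t)a[2*i] * b[2*i] + (std::int64_t)a[2*i + 1] * b[2*i + 1];
}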
1446
1447 // 8 >> 32
1448 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1449{ return v_uint32x4(vec_msum(a.val, b.val, c.val)); }
1450 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
1451{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)); }
1452
1453 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
1454{
1455 const vec_ushort8 eight = vec_ushort8_sp(8);
1456 vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even
1457 vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd
1458 vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight);
1459 vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
1460 return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
1461}
1462
1463 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1464{
1465 const vec_ushort8 eight = vec_ushort8_sp(8);
1466 vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even
1467 vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd
1468 vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight);
1469 vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
1470 return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, c.val)));
1471}
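There is no signed-byte vec_msum, so the two overloads above build even and odd 16-bit views of the bytes: the one-byte vec_sld rotation moves one set of bytes into the other half of each halfword, and the arithmetic shift right by eight sign-extends whichever byte sits in the upper half. Two chained vec_msum calls then accumulate all four products per 32-bit lane. The resulting scalar semantics (illustrative only):

#include <cstdint>

// Scalar semantics of the 8 -> 32 expanding dot product (illustrative only):
// each 32-bit lane accumulates four adjacent signed byte products.
static void dotprod_expand_s8_scalar(const std::int8_t a[16], const std::int8_t b[16], std::int32_t out[4])
{
    for (int i = 0; i < 4; ++i)
    {
        out[i] = 0;
        for (int k = 0; k < 4; ++k)
            out[i] += (std::int32_t)a[4*i + k] * (std::int32_t)b[4*i + k];
    }
}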
1472
1473 // 16 >> 64
1474 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
1475{
1476 const vec_uint4 zero = vec_uint4_z;
1477 vec_uint4 even = vec_mule(a.val, b.val);
1478 vec_uint4 odd = vec_mulo(a.val, b.val);
1479 vec_udword2 e0 = (vec_udword2)vec_mergee(even, zero);
1480 vec_udword2 e1 = (vec_udword2)vec_mergeo(even, zero);
1481 vec_udword2 o0 = (vec_udword2)vec_mergee(odd, zero);
1482 vec_udword2 o1 = (vec_udword2)vec_mergeo(odd, zero);
1483 vec_udword2 s0 = vec_add(e0, o0);
1484 vec_udword2 s1 = vec_add(e1, o1);
1485 return v_uint64x2(vec_add(s0, s1));
1486}
1487 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1488{ return v_dotprod_expand(a, b) + c; }
1489
1490 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
1491{
1492 v_int32x4 prod = v_dotprod(a, b);
1493 v_int64x2 c, d;
1494 v_expand(prod, c, d);
1495 return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val)));
1496}
1497 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1498{ return v_dotprod_expand(a, b) + c; }
1499
1500 // 32 >> 64f
1501 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
1502{ return v_cvt_f64(v_dotprod(a, b)); }
1503 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1504{ return v_dotprod_expand(a, b) + c; }
1505
1507
1508 // 16 >> 32
1509 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
1510{ return v_dotprod(a, b); }
1511 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
1512{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)) + c; }
1513 // 32 >> 64
1514 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
1515{ return v_dotprod(a, b); }
1516 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1517{ return v_dotprod(a, b, c); }
1518
1519 // 8 >> 32
1520 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
1521{ return v_dotprod_expand(a, b); }
1522 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1523{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)) + c; }
1524
1525 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
1526{
1527 vec_short8 a0 = vec_unpackh(a.val);
1528 vec_short8 a1 = vec_unpackl(a.val);
1529 vec_short8 b0 = vec_unpackh(b.val);
1530 vec_short8 b1 = vec_unpackl(b.val);
1531 return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
1532}
1533 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1534{ return v_dotprod_expand_fast(a, b) + c; }
1535
1536 // 16 >> 64
1537 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
1538{ return v_dotprod_expand(a, b); }
1539 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1540{ return v_dotprod_expand(a, b, c); }
1541
1542 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
1543{
1544 v_int32x4 prod = v_dotprod(a, b);
1545 v_int64x2 c, d;
1546 v_expand(prod, c, d);
1547 return c + d;
1548}
1549 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1550{ return v_dotprod_expand_fast(a, b) + c; }
1551
1552 // 32 >> 64f
1553 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
1554{ return v_dotprod_expand(a, b); }
1555 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1556{ return v_dotprod_expand(a, b, c); }
1557
1558 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
1559 const v_float32x4& m1, const v_float32x4& m2,
1560 const v_float32x4& m3)
1561{
1562 const vec_float4 v0 = vec_splat(v.val, 0);
1563 const vec_float4 v1 = vec_splat(v.val, 1);
1564 const vec_float4 v2 = vec_splat(v.val, 2);
1565 VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3);
1566 return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val)))));
1567}
1568
1569 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
1570 const v_float32x4& m1, const v_float32x4& m2,
1571 const v_float32x4& a)
1572{
1573 const vec_float4 v0 = vec_splat(v.val, 0);
1574 const vec_float4 v1 = vec_splat(v.val, 1);
1575 const vec_float4 v2 = vec_splat(v.val, 2);
1576 return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, a.val))));
1577}
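Both routines splat each lane of v and chain fused multiply-adds, so the result is the linear combination v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, with m3 replaced by the additive vector a in v_matmuladd. Scalar view, treating m0..m3 as matrix rows (illustrative only):

// Scalar view of v_matmul: out = v[0]*m[0] + v[1]*m[1] + v[2]*m[2] + v[3]*m[3] (illustrative only).
static void matmul_scalar(const float v[4], const float m[4][4], float out[4])
{
    for (int j = 0; j < 4; ++j)
        out[j] = v[0]*m[0][j] + v[1]*m[1][j] + v[2]*m[2][j] + v[3]*m[3][j];
}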
1578
1579 #define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2) \
1580 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1581 const _Tpvec& a2, const _Tpvec& a3, \
1582 _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1583 { \
1584 _Tpvec2 a02 = vec_mergeh(a0.val, a2.val); \
1585 _Tpvec2 a13 = vec_mergeh(a1.val, a3.val); \
1586 b0.val = vec_mergeh(a02, a13); \
1587 b1.val = vec_mergel(a02, a13); \
1588 a02 = vec_mergel(a0.val, a2.val); \
1589 a13 = vec_mergel(a1.val, a3.val); \
1590 b2.val = vec_mergeh(a02, a13); \
1591 b3.val = vec_mergel(a02, a13); \
1592 }
1593OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_uint32x4, vec_uint4)
1594OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_int32x4, vec_int4)
1595OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_float32x4, vec_float4)
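The transpose macro is the standard two-pass interleave: the first vec_mergeh/vec_mergel pair interleaves rows 0/2 and 1/3, and the second pass interleaves those intermediates to produce the four transposed rows, i.e. b[i][j] = a[j][i]. A scalar reference for the same result (illustrative only):

// Scalar reference for the 4x4 transpose produced by the merge sequence above (illustrative only).
static void transpose4x4_scalar(const float a[4][4], float b[4][4])
{
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
            b[i][j] = a[j][i];
}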
1596
1597 template<int i, typename Tvec>
1598 inline Tvec v_broadcast_element(const Tvec& v)
1599{ return Tvec(vec_splat(v.val, i)); }
1600
1601
1602CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
1603
1605
1606}
1607
1608 #endif // OPENCV_HAL_VSX_HPP