#ifndef OPENCV_HAL_VSX_UTILS_HPP
#define OPENCV_HAL_VSX_UTILS_HPP

#include "opencv2/core/cvdef.h"

#define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
#define __VSX_S8__(c, v)  (c){v, v, v, v, v, v, v, v}
#define __VSX_S4__(c, v)  (c){v, v, v, v}
#define __VSX_S2__(c, v)  (c){v, v}
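// Vector type aliases, each with helpers:
//   <type>_set(...) build from literals, <type>_sp(c) splat a scalar,
//   <type>_c(v) reinterpret another vector, <type>_z the zero vector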
typedef __vector unsigned char vec_uchar16;
#define vec_uchar16_set(...) (vec_uchar16){__VA_ARGS__}
#define vec_uchar16_sp(c)    (__VSX_S16__(vec_uchar16, (unsigned char)c))
#define vec_uchar16_c(v)     ((vec_uchar16)(v))
#define vec_uchar16_z        vec_uchar16_sp(0)

typedef __vector signed char vec_char16;
#define vec_char16_set(...) (vec_char16){__VA_ARGS__}
#define vec_char16_sp(c)    (__VSX_S16__(vec_char16, (signed char)c))
#define vec_char16_c(v)     ((vec_char16)(v))
#define vec_char16_z        vec_char16_sp(0)

typedef __vector unsigned short vec_ushort8;
#define vec_ushort8_set(...) (vec_ushort8){__VA_ARGS__}
#define vec_ushort8_sp(c)    (__VSX_S8__(vec_ushort8, (unsigned short)c))
#define vec_ushort8_c(v)     ((vec_ushort8)(v))
#define vec_ushort8_z        vec_ushort8_sp(0)

typedef __vector signed short vec_short8;
#define vec_short8_set(...) (vec_short8){__VA_ARGS__}
#define vec_short8_sp(c)    (__VSX_S8__(vec_short8, (signed short)c))
#define vec_short8_c(v)     ((vec_short8)(v))
#define vec_short8_z        vec_short8_sp(0)

typedef __vector unsigned int vec_uint4;
#define vec_uint4_set(...) (vec_uint4){__VA_ARGS__}
#define vec_uint4_sp(c)    (__VSX_S4__(vec_uint4, (unsigned int)c))
#define vec_uint4_c(v)     ((vec_uint4)(v))
#define vec_uint4_z        vec_uint4_sp(0)

typedef __vector signed int vec_int4;
#define vec_int4_set(...) (vec_int4){__VA_ARGS__}
#define vec_int4_sp(c)    (__VSX_S4__(vec_int4, (signed int)c))
#define vec_int4_c(v)     ((vec_int4)(v))
#define vec_int4_z        vec_int4_sp(0)

typedef __vector float vec_float4;
#define vec_float4_set(...) (vec_float4){__VA_ARGS__}
#define vec_float4_sp(c)    (__VSX_S4__(vec_float4, c))
#define vec_float4_c(v)     ((vec_float4)(v))
#define vec_float4_z        vec_float4_sp(0)

typedef __vector unsigned long long vec_udword2;
#define vec_udword2_set(...) (vec_udword2){__VA_ARGS__}
#define vec_udword2_sp(c)    (__VSX_S2__(vec_udword2, (unsigned long long)c))
#define vec_udword2_c(v)     ((vec_udword2)(v))
#define vec_udword2_z        vec_udword2_sp(0)

typedef __vector signed long long vec_dword2;
#define vec_dword2_set(...) (vec_dword2){__VA_ARGS__}
#define vec_dword2_sp(c)    (__VSX_S2__(vec_dword2, (signed long long)c))
#define vec_dword2_c(v)     ((vec_dword2)(v))
#define vec_dword2_z        vec_dword2_sp(0)

typedef __vector double vec_double2;
#define vec_double2_set(...) (vec_double2){__VA_ARGS__}
#define vec_double2_c(v)     ((vec_double2)(v))
#define vec_double2_sp(c)    (__VSX_S2__(vec_double2, c))
#define vec_double2_z        vec_double2_sp(0)
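// boolean (mask) vector types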
#define vec_bchar16 __vector __bool char
#define vec_bchar16_set(...) (vec_bchar16){__VA_ARGS__}
#define vec_bchar16_c(v)     ((vec_bchar16)(v))

#define vec_bshort8 __vector __bool short
#define vec_bshort8_set(...) (vec_bshort8){__VA_ARGS__}
#define vec_bshort8_c(v)     ((vec_bshort8)(v))

#define vec_bint4 __vector __bool int
#define vec_bint4_set(...) (vec_bint4){__VA_ARGS__}
#define vec_bint4_c(v)     ((vec_bint4)(v))

#define vec_bdword2 __vector __bool long long
#define vec_bdword2_set(...) (vec_bdword2){__VA_ARGS__}
#define vec_bdword2_c(v)     ((vec_bdword2)(v))
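// forced-inline wrapper plus redirection helpers:
// VSX_REDIRECT_1RG/2RG declare a one- or two-argument function that simply forwards to another intrinsic or builtin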
#define VSX_FINLINE(tp) extern inline tp __attribute__((always_inline))

#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a) { return fn2(a); }

#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
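/*
 * GCC VSX compatibility
**/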
#if defined(__GNUG__) && !defined(__clang__)

// inline-asm helper: one VSX register operand ("wa" constraint)
#define VSX_IMPL_1RG(rt, rg, opc, fnm) \
VSX_FINLINE(rt) fnm(const rg& a)       \
{ rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "=wa" (rs) : "wa" (a)); return rs; }

// inline-asm helper: one vector register operand ("v" constraint)
#define VSX_IMPL_1VRG(rt, rg, opc, fnm) \
VSX_FINLINE(rt) fnm(const rg& a)        \
{ rt rs; __asm__ __volatile__(#opc" %0,%1" : "=v" (rs) : "v" (a)); return rs; }

// inline-asm helper: two vector register operands with a free-form opcode string
#define VSX_IMPL_2VRG_F(rt, rg, fopc, fnm)    \
VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \
{ rt rs; __asm__ __volatile__(fopc : "=v" (rs) : "v" (a), "v" (b)); return rs; }

#define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)
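// redirect vec_mule / vec_mulo (multiply even / odd elements) to the builtins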
VSX_REDIRECT_2RG(vec_ushort8, vec_uchar16, vec_mule, __builtin_vec_mule)
VSX_REDIRECT_2RG(vec_short8,  vec_char16,  vec_mule, __builtin_vec_mule)
VSX_REDIRECT_2RG(vec_int4,    vec_short8,  vec_mule, __builtin_vec_mule)
VSX_REDIRECT_2RG(vec_uint4,   vec_ushort8, vec_mule, __builtin_vec_mule)
VSX_REDIRECT_2RG(vec_ushort8, vec_uchar16, vec_mulo, __builtin_vec_mulo)
VSX_REDIRECT_2RG(vec_short8,  vec_char16,  vec_mulo, __builtin_vec_mulo)
VSX_REDIRECT_2RG(vec_int4,    vec_short8,  vec_mulo, __builtin_vec_mulo)
VSX_REDIRECT_2RG(vec_uint4,   vec_ushort8, vec_mulo, __builtin_vec_mulo)

VSX_IMPL_2VRG(vec_dword2,  vec_int4,  vmulosw, vec_mule)
VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mule)
VSX_IMPL_2VRG(vec_dword2,  vec_int4,  vmulesw, vec_mulo)
VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mulo)
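// vec_mul for 8/16-bit lanes: multiply the even and odd elements separately,
// then pick the low half of each product back into element order with vec_perm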
# define VSX_IMPL_MULH(Tvec, cperm)                                         \
    VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b)                 \
    {                                                                       \
        static const vec_uchar16 ev_od = {cperm};                           \
        return vec_perm((Tvec)vec_mule(a, b), (Tvec)vec_mulo(a, b), ev_od); \
    }
#define VSX_IMPL_MULH_P16 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
VSX_IMPL_MULH(vec_char16,  VSX_IMPL_MULH_P16)
VSX_IMPL_MULH(vec_uchar16, VSX_IMPL_MULH_P16)
#define VSX_IMPL_MULH_P8 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
VSX_IMPL_MULH(vec_short8,  VSX_IMPL_MULH_P8)
VSX_IMPL_MULH(vec_ushort8, VSX_IMPL_MULH_P8)
// vmuluwm works for both signed and unsigned 32-bit lanes
VSX_IMPL_2VRG(vec_int4,  vec_int4,  vmuluwm, vec_mul)
VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
// the remaining types map directly to the builtin
VSX_REDIRECT_2RG(vec_float4,  vec_float4,  vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
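// vec_cmpge / vec_cmple for integer lanes as "not (b > a)": compare-greater-than followed by xxlnor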
# define vec_cmple(a, b) vec_cmpge(b, a)
# define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \
VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm)

VSX_IMPL_CMPGE(vec_bchar16, vec_char16,  vcmpgtsb, vec_cmpge)
VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_short8,  vcmpgtsh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4,   vec_int4,    vcmpgtsw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4,   vec_uint4,   vcmpgtuw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_dword2,  vcmpgtsd, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)

VSX_REDIRECT_2RG(vec_bint4,   vec_float4,  vec_cmpge, __builtin_vec_cmpge)
VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)
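// vec_nor: generic redirect plus an explicit overload for bool long long vectors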
template<typename T>
VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)

VSX_FINLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
{ return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
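// vec_packs: redirect the 16/32-bit saturated packs, implement the doubleword ones via vpksdss/vpkudus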
VSX_REDIRECT_2RG(vec_char16,  vec_short8,  vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_short8,  vec_int4,    vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_ushort8, vec_uint4,   vec_packs, __builtin_vec_packs)

VSX_IMPL_2VRG_F(vec_int4,  vec_dword2,  "vpksdss %0,%2,%1", vec_packs)
VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)

# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))

VSX_IMPL_2VRG(vec_udword2, vec_uchar16, vbpermq, vec_vbpermq)
VSX_IMPL_2VRG(vec_dword2,  vec_char16,  vbpermq, vec_vbpermq)

# define vec_permi vec_xxpermdi

# define vec_sldw __builtin_vsx_xxsldwi
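// vector population count, always returning the unsigned type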
VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcntu)
VSX_IMPL_1VRG(vec_uchar16, vec_char16,  vpopcntb, vec_popcntu)
VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcntu)
VSX_IMPL_1VRG(vec_ushort8, vec_short8,  vpopcnth, vec_popcntu)
VSX_IMPL_1VRG(vec_uint4,   vec_uint4,   vpopcntw, vec_popcntu)
VSX_IMPL_1VRG(vec_uint4,   vec_int4,    vpopcntw, vec_popcntu)
VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcntu)
VSX_IMPL_1VRG(vec_udword2, vec_dword2,  vpopcntd, vec_popcntu)
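// conversions between precisions and between float and integer lanes;
// the 'o'-suffixed names are the raw instruction forms whose lane placement is adjusted in the common section below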
VSX_REDIRECT_1RG(vec_float4,  vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
VSX_REDIRECT_1RG(vec_double2, vec_float4,  vec_cvfo, __builtin_vsx_xvcvspdp)

VSX_IMPL_1RG(vec_double2, vec_int4,    xvcvsxwdp, vec_ctdo)
VSX_IMPL_1RG(vec_double2, vec_uint4,   xvcvuxwdp, vec_ctdo)
VSX_IMPL_1RG(vec_double2, vec_dword2,  xvcvsxddp, vec_ctd)
VSX_IMPL_1RG(vec_double2, vec_udword2, xvcvuxddp, vec_ctd)

VSX_IMPL_1RG(vec_float4, vec_int4,    xvcvsxwsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, vec_uint4,   xvcvuxwsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, vec_dword2,  xvcvsxdsp, vec_ctfo)
VSX_IMPL_1RG(vec_float4, vec_udword2, xvcvuxdsp, vec_ctfo)

VSX_IMPL_1RG(vec_int4, vec_double2, xvcvdpsxws, vec_ctso)
VSX_IMPL_1RG(vec_int4, vec_float4,  xvcvspsxws, vec_cts)

VSX_IMPL_1RG(vec_uint4, vec_double2, xvcvdpuxws, vec_ctuo)
VSX_IMPL_1RG(vec_uint4, vec_float4,  xvcvspuxws, vec_ctu)

VSX_IMPL_1RG(vec_dword2, vec_double2, xvcvdpsxds, vec_ctsl)
VSX_IMPL_1RG(vec_dword2, vec_float4,  xvcvspsxds, vec_ctslo)

VSX_IMPL_1RG(vec_udword2, vec_double2, xvcvdpuxds, vec_ctul)
VSX_IMPL_1RG(vec_udword2, vec_float4,  xvcvspuxds, vec_ctulo)

# define vec_xl vec_vsx_ld
# define vec_xst vec_vsx_st
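/*
 * CLANG VSX compatibility
**/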
#if defined(__clang__) && !defined(__IBMCPP__)

// element-wise conversion using clang's __builtin_convertvector
#define VSX_IMPL_CONVERT(rt, rg, fnm) \
VSX_FINLINE(rt) fnm(const rg& a) { return __builtin_convertvector(a, rt); }
#if __clang_major__ < 5
// implement vec_permi on clang 4
# define VSX_IMPL_CLANG_4_PERMI(Tvec)                                                 \
    VSX_FINLINE(Tvec) vec_permi(const Tvec& a, const Tvec& b, unsigned const char c) \
    {                                                                                 \
        switch (c)                                                                    \
        {                                                                             \
        case 0:  return vec_mergeh(a, b);                                             \
        case 1:  return vec_mergel(vec_mergeh(a, a), b);                              \
        case 2:  return vec_mergeh(vec_mergel(a, a), b);                              \
        default: return vec_mergel(a, b);                                             \
        }                                                                             \
    }
VSX_IMPL_CLANG_4_PERMI(vec_udword2)
VSX_IMPL_CLANG_4_PERMI(vec_dword2)
VSX_IMPL_CLANG_4_PERMI(vec_double2)

// implement vec_xxsldwi in terms of vec_sld
# define vec_xxsldwi(a, b, c) vec_sld(a, b, (c) * 4)

# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))

# define vec_sldw vec_xxsldwi
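// reciprocal square root as 1 / sqrt(x)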
VSX_FINLINE(vec_float4) vec_rsqrt(const vec_float4& a)
{ return vec_div(vec_float4_sp(1), vec_sqrt(a)); }

VSX_FINLINE(vec_double2) vec_rsqrt(const vec_double2& a)
{ return vec_div(vec_double2_sp(1), vec_sqrt(a)); }
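// vec_promote: place the scalar into lane (b & 1) of an otherwise zeroed doubleword vector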
VSX_FINLINE(vec_dword2) vec_promote(long long a, int b)
{ vec_dword2 ret = vec_dword2_z; ret[b & 1] = a; return ret; }

VSX_FINLINE(vec_udword2) vec_promote(unsigned long long a, int b)
{ vec_udword2 ret = vec_udword2_z; ret[b & 1] = a; return ret; }
// vec_popcntu always returns the unsigned vector type
#define VSX_IMPL_POPCNTU(Tvec, Tvec2, ucast)  \
VSX_FINLINE(Tvec) vec_popcntu(const Tvec2& a) \
{ return ucast(vec_popcnt(a)); }
VSX_IMPL_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
VSX_IMPL_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
VSX_IMPL_POPCNTU(vec_uint4,   vec_int4,   vec_uint4_c);
VSX_IMPL_POPCNTU(vec_udword2, vec_dword2, vec_udword2_c);

VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_uint4,   vec_uint4,   vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_udword2, vec_udword2, vec_popcntu, vec_popcnt)

// converts between single and double precision
VSX_REDIRECT_1RG(vec_float4,  vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
VSX_REDIRECT_1RG(vec_double2, vec_float4,  vec_cvfo, __builtin_vsx_xvcvspdp)

// converts 32/64-bit integers to double precision
VSX_REDIRECT_1RG(vec_double2, vec_int4,  vec_ctdo, __builtin_vsx_xvcvsxwdp)
VSX_REDIRECT_1RG(vec_double2, vec_uint4, vec_ctdo, __builtin_vsx_xvcvuxwdp)

VSX_IMPL_CONVERT(vec_double2, vec_dword2,  vec_ctd)
VSX_IMPL_CONVERT(vec_double2, vec_udword2, vec_ctd)

#if __clang_major__ > 4
// converts 32/64-bit integers to single precision
VSX_IMPL_CONVERT(vec_float4, vec_int4,  vec_ctf)
VSX_IMPL_CONVERT(vec_float4, vec_uint4, vec_ctf)
VSX_REDIRECT_1RG(vec_float4, vec_dword2,  vec_ctfo, __builtin_vsx_xvcvsxdsp)
VSX_REDIRECT_1RG(vec_float4, vec_udword2, vec_ctfo, __builtin_vsx_xvcvuxdsp)

#if __clang_major__ > 4
// converts single and double precision to signed words
VSX_REDIRECT_1RG(vec_int4, vec_double2, vec_ctso, __builtin_vsx_xvcvdpsxws)
VSX_IMPL_CONVERT(vec_int4, vec_float4, vec_cts)

#if __clang_major__ > 4
// converts single and double precision to unsigned words
VSX_REDIRECT_1RG(vec_uint4, vec_double2, vec_ctuo, __builtin_vsx_xvcvdpuxws)
VSX_IMPL_CONVERT(vec_uint4, vec_float4, vec_ctu)

// converts single and double precision to signed/unsigned doublewords
VSX_IMPL_CONVERT(vec_dword2, vec_double2, vec_ctsl)

VSX_FINLINE(vec_dword2) vec_ctslo(const vec_float4& a)
{ return vec_ctsl(vec_cvfo(a)); }

VSX_IMPL_CONVERT(vec_udword2, vec_double2, vec_ctul)

VSX_FINLINE(vec_udword2) vec_ctulo(const vec_float4& a)
{ return vec_ctul(vec_cvfo(a)); }
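/*
 * Common GCC, CLANG compatibility
**/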
#if defined(__GNUG__) && !defined(__IBMCPP__)

// build the unsuffixed 4-lane -> 2-lane conversions from the raw 'o' variants
// by pre-rotating the source with vec_sldw
#define VSX_IMPL_CONV_EVEN_4_2(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a)                 \
{ return fn2(vec_sldw(a, a, 1)); }

VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_float4, vec_cvf, vec_cvfo)
VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_int4,   vec_ctd, vec_ctdo)
VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_uint4,  vec_ctd, vec_ctdo)

VSX_IMPL_CONV_EVEN_4_2(vec_dword2,  vec_float4, vec_ctsl, vec_ctslo)
VSX_IMPL_CONV_EVEN_4_2(vec_udword2, vec_float4, vec_ctul, vec_ctulo)

// 2-lane -> 4-lane conversions: convert, then rotate the result into place
#define VSX_IMPL_CONV_EVEN_2_4(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a)                 \
{                                                \
    rt v4 = fn2(a);                              \
    return vec_sldw(v4, v4, 3);                  \
}

VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_double2, vec_cvf, vec_cvfo)
VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_dword2,  vec_ctf, vec_ctfo)
VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_udword2, vec_ctf, vec_ctfo)

VSX_IMPL_CONV_EVEN_2_4(vec_int4,  vec_double2, vec_cts, vec_ctso)
VSX_IMPL_CONV_EVEN_2_4(vec_uint4, vec_double2, vec_ctu, vec_ctuo)

#if !defined(__clang__) || __clang_major__ > 4

// conversions that accept a second parameter; only truncation (0) is supported here
# define VSX_IMPL_CONV_2VARIANT(rt, rg, fnm, fn2)   \
VSX_FINLINE(rt) fnm(const rg& a, int only_truncate) \
{                                                   \
    assert(only_truncate == 0);                     \
    CV_UNUSED(only_truncate);                       \
    return fn2(a);                                  \
}
VSX_IMPL_CONV_2VARIANT(vec_int4,   vec_float4, vec_cts, vec_cts)
VSX_IMPL_CONV_2VARIANT(vec_uint4,  vec_float4, vec_ctu, vec_ctu)
VSX_IMPL_CONV_2VARIANT(vec_float4, vec_int4,   vec_ctf, vec_ctf)
VSX_IMPL_CONV_2VARIANT(vec_float4, vec_uint4,  vec_ctf, vec_ctf)

VSX_IMPL_CONV_2VARIANT(vec_dword2, vec_double2, vec_cts, vec_ctsl)
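/*
 * XLC VSX compatibility
**/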
#if defined(__IBMCPP__)

// XLC's vec_popcnt already returns the unsigned type
#define vec_popcntu vec_popcnt

// single-argument overloads for the conversions that XLC declares with a second argument (pass 0)
#define VSX_IMPL_OVERLOAD_Z2(rt, rg, fnm) \
VSX_FINLINE(rt) fnm(const rg& a) { return fnm(a, 0); }

VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4,    vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_uint4,   vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_dword2,  vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_udword2, vec_ctd)

VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_int4,    vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_uint4,   vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_dword2,  vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_udword2, vec_ctf)

VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_double2, vec_cts)
VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_float4,  vec_cts)

VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_double2, vec_ctu)
VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_float4,  vec_ctu)

VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_double2, vec_ctsl)
VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_float4,  vec_ctsl)

VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_double2, vec_ctul)
VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_float4,  vec_ctul)

// build the odd-lane ('o') conversions from the unsuffixed ones by pre-rotating the source
#define VSX_IMPL_CONV_ODD_4_2(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a) { return fn2(vec_sldw(a, a, 3)); }

VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_float4, vec_cvfo, vec_cvf)
VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_int4,   vec_ctdo, vec_ctd)
VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_uint4,  vec_ctdo, vec_ctd)

VSX_IMPL_CONV_ODD_4_2(vec_dword2,  vec_float4, vec_ctslo, vec_ctsl)
VSX_IMPL_CONV_ODD_4_2(vec_udword2, vec_float4, vec_ctulo, vec_ctul)

#define VSX_IMPL_CONV_ODD_2_4(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a)                \
{                                               \
    rt v4 = fn2(a);                             \
    return vec_sldw(v4, v4, 1);                 \
}

VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_double2, vec_cvfo, vec_cvf)
VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_dword2,  vec_ctfo, vec_ctf)
VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_udword2, vec_ctfo, vec_ctf)

VSX_IMPL_CONV_ODD_2_4(vec_int4,  vec_double2, vec_ctso, vec_cts)
VSX_IMPL_CONV_ODD_2_4(vec_uint4, vec_double2, vec_ctuo, vec_ctu)
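/*
 * Common GCC, CLANG, XLC compatibility
**/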
#if defined(__GNUG__) && !defined(__clang__)
# define VSX_UNUSED(Tvec) Tvec __attribute__((__unused__))
#else
# define VSX_UNUSED(Tvec) Tvec
#endif
#if defined(__clang__) || defined(__IBMCPP__)
// forward OpenCV's int64/uint64 scalars to the (unsigned) long long overloads
VSX_FINLINE(vec_udword2) vec_splats(uint64 v)
{ return vec_splats((unsigned long long) v); }

VSX_FINLINE(vec_dword2) vec_splats(int64 v)
{ return vec_splats((long long) v); }

VSX_FINLINE(vec_udword2) vec_promote(uint64 a, int b)
{ return vec_promote((unsigned long long) a, b); }

VSX_FINLINE(vec_dword2) vec_promote(int64 a, int b)
{ return vec_promote((long long) a, b); }
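// memory load/store entry points: clang (non-XLC) keeps the old vec_vsx_ld/vec_vsx_st names, others use vec_xl/vec_xst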
#if defined(__clang__) && !defined(__IBMCPP__)
# define vsx_ldf vec_vsx_ld
# define vsx_stf vec_vsx_st
#else
# define vsx_ldf vec_xl
# define vsx_stf vec_xst
#endif

// vsx_ld/vsx_st take the offset in elements instead of bytes
#define VSX_OFFSET(o, p) ((o) * sizeof(*(p)))
#define vsx_ld(o, p) vsx_ldf(VSX_OFFSET(o, p), p)
#define vsx_st(v, o, p) vsx_stf(v, VSX_OFFSET(o, p), p)
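// vsx_ld2/vsx_st2: doubleword-element variants; the GCC/Clang path reinterprets through
// 32-bit pointers, the other path uses long long pointers directly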
#if (defined(__GNUG__) || defined(__clang__)) && !defined(__IBMCPP__)
    VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
    { return vec_udword2_c(vsx_ldf(VSX_OFFSET(o, p), (unsigned int*)p)); }

    VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
    { return vec_dword2_c(vsx_ldf(VSX_OFFSET(o, p), (int*)p)); }

    VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
    { vsx_stf(vec_uint4_c(vec), VSX_OFFSET(o, p), (unsigned int*)p); }

    VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
    { vsx_stf(vec_int4_c(vec), VSX_OFFSET(o, p), (int*)p); }
#else
    VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
    { return vsx_ldf(VSX_OFFSET(o, p), (unsigned long long*)p); }

    VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
    { return vsx_ldf(VSX_OFFSET(o, p), (long long*)p); }

    VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
    { vsx_stf(vec, VSX_OFFSET(o, p), (unsigned long long*)p); }

    VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
    { vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
#endif
// store the low or high 8 bytes (doubleword 0 or 1) of a vector
#define vec_st_l8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 0)
#define vec_st_h8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 1)

// load 8 bytes into the first doubleword of a vector and reinterpret
#define VSX_IMPL_LOAD_L8(Tvec, Tp)       \
VSX_FINLINE(Tvec) vec_ld_l8(const Tp *p) \
{ return ((Tvec)vec_promote(*((uint64*)p), 0)); }

VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
VSX_IMPL_LOAD_L8(vec_char16,  schar)
VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
VSX_IMPL_LOAD_L8(vec_short8,  short)
VSX_IMPL_LOAD_L8(vec_uint4,   uint)
VSX_IMPL_LOAD_L8(vec_int4,    int)
VSX_IMPL_LOAD_L8(vec_float4,  float)
VSX_IMPL_LOAD_L8(vec_udword2, uint64)
VSX_IMPL_LOAD_L8(vec_dword2,  int64)
VSX_IMPL_LOAD_L8(vec_double2, double)
// logical not
#define vec_not(a) vec_nor(a, a)

// compare not equal
# define vec_cmpne(a, b) vec_not(vec_cmpeq(a, b))

// absolute difference
# define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
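// zero-extending unpack of the low/high halves via merge with a zero vector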
#define VSX_IMPL_UNPACKU(rt, rg, zero)    \
VSX_FINLINE(rt) vec_unpacklu(const rg& a) \
{ return (rt)(vec_mergel(a, zero)); }     \
VSX_FINLINE(rt) vec_unpackhu(const rg& a) \
{ return (rt)(vec_mergeh(a, zero)); }

VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
VSX_IMPL_UNPACKU(vec_uint4,   vec_ushort8, vec_ushort8_z)
VSX_IMPL_UNPACKU(vec_udword2, vec_uint4,   vec_uint4_z)
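/*
 * Implement vec_mergesqe and vec_mergesqo
 * Merges the sequence of even- and odd-indexed elements of two vectors
*/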
#define VSX_IMPL_PERM(rt, fnm, ...)           \
VSX_FINLINE(rt) fnm(const rt& a, const rt& b) \
{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }

// 16 lanes of 8-bit
#define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
#define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
VSX_IMPL_PERM(vec_char16,  vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_char16,  vec_mergesqo, perm16_mergesqo)

// 8 lanes of 16-bit
#define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
#define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
VSX_IMPL_PERM(vec_short8,  vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_short8,  vec_mergesqo, perm8_mergesqo)

// 4 lanes of 32-bit
#define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
#define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
VSX_IMPL_PERM(vec_uint4,  vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_uint4,  vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_int4,   vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_int4,   vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)

// 2 lanes of 64-bit: even/odd is simply the first/second element of each vector
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)
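/*
 * Implement vec_mergesqh and vec_mergesql
 * Merges the sequence of most- and least-significant halves (upper/lower 64 bits) of two vectors
*/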
#define VSX_IMPL_MERGESQHL(Tvec)                                 \
VSX_FINLINE(Tvec) vec_mergesqh(const Tvec& a, const Tvec& b)     \
{ return (Tvec)vec_mergeh(vec_udword2_c(a), vec_udword2_c(b)); } \
VSX_FINLINE(Tvec) vec_mergesql(const Tvec& a, const Tvec& b)     \
{ return (Tvec)vec_mergel(vec_udword2_c(a), vec_udword2_c(b)); }
VSX_IMPL_MERGESQHL(vec_uchar16)
VSX_IMPL_MERGESQHL(vec_char16)
VSX_IMPL_MERGESQHL(vec_ushort8)
VSX_IMPL_MERGESQHL(vec_short8)
VSX_IMPL_MERGESQHL(vec_uint4)
VSX_IMPL_MERGESQHL(vec_int4)
VSX_IMPL_MERGESQHL(vec_float4)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)
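// interleaved store (vec_st_interleave) and deinterleaved load (vec_ld_deinterleave) for 2 and 4 channels;
// e.g. the 2-channel store writes the elements of a and b alternately (a0, b0, a1, b1, ...)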
#define VSX_IMPL_ST_INTERLEAVE(Tp, Tvec)                                   \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr) \
{                                                                          \
    vsx_stf(vec_mergeh(a, b), 0, ptr);                                     \
    vsx_stf(vec_mergel(a, b), 16, ptr);                                    \
}                                                                          \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,          \
                                    const Tvec& c, const Tvec& d, Tp* ptr) \
{                                                                          \
    Tvec ac = vec_mergeh(a, c);                                            \
    Tvec bd = vec_mergeh(b, d);                                            \
    vsx_stf(vec_mergeh(ac, bd), 0, ptr);                                   \
    vsx_stf(vec_mergel(ac, bd), 16, ptr);                                  \
    ac = vec_mergel(a, c);                                                 \
    bd = vec_mergel(b, d);                                                 \
    vsx_stf(vec_mergeh(ac, bd), 32, ptr);                                  \
    vsx_stf(vec_mergel(ac, bd), 48, ptr);                                  \
}
VSX_IMPL_ST_INTERLEAVE(uchar,  vec_uchar16)
VSX_IMPL_ST_INTERLEAVE(schar,  vec_char16)
VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE(short,  vec_short8)
VSX_IMPL_ST_INTERLEAVE(uint,   vec_uint4)
VSX_IMPL_ST_INTERLEAVE(int,    vec_int4)
VSX_IMPL_ST_INTERLEAVE(float,  vec_float4)
// 2 and 4 channel deinterleave for 16 lanes of 8-bit
#define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec)                            \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
{                                                                      \
    Tvec v0 = vsx_ld(0, ptr);                                          \
    Tvec v1 = vsx_ld(16, ptr);                                         \
    a = vec_mergesqe(v0, v1);                                          \
    b = vec_mergesqo(v0, v1);                                          \
}                                                                      \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
                                      Tvec& c, Tvec& d)                \
{                                                                      \
    Tvec v0 = vsx_ld(0, ptr);                                          \
    Tvec v1 = vsx_ld(16, ptr);                                         \
    Tvec v2 = vsx_ld(32, ptr);                                         \
    Tvec v3 = vsx_ld(48, ptr);                                         \
    Tvec m0 = vec_mergesqe(v0, v1);                                    \
    Tvec m1 = vec_mergesqe(v2, v3);                                    \
    a = vec_mergesqe(m0, m1);                                          \
    c = vec_mergesqo(m0, m1);                                          \
    m0 = vec_mergesqo(v0, v1);                                         \
    m1 = vec_mergesqo(v2, v3);                                         \
    b = vec_mergesqe(m0, m1);                                          \
    d = vec_mergesqo(m0, m1);                                          \
}
VSX_IMPL_ST_DINTERLEAVE_8(uchar, vec_uchar16)
VSX_IMPL_ST_DINTERLEAVE_8(schar, vec_char16)
// 2 and 4 channel deinterleave for 8 lanes of 16-bit
#define VSX_IMPL_ST_DINTERLEAVE_16(Tp, Tvec)                           \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
{                                                                      \
    Tvec v0 = vsx_ld(0, ptr);                                          \
    Tvec v1 = vsx_ld(8, ptr);                                          \
    a = vec_mergesqe(v0, v1);                                          \
    b = vec_mergesqo(v0, v1);                                          \
}                                                                      \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
                                      Tvec& c, Tvec& d)                \
{                                                                      \
    Tvec v0 = vsx_ld(0, ptr);                                          \
    Tvec v1 = vsx_ld(8, ptr);                                          \
    Tvec m0 = vec_mergeh(v0, v1);                                      \
    Tvec m1 = vec_mergel(v0, v1);                                      \
    Tvec ab0 = vec_mergeh(m0, m1);                                     \
    Tvec cd0 = vec_mergel(m0, m1);                                     \
    v0 = vsx_ld(16, ptr);                                              \
    v1 = vsx_ld(24, ptr);                                              \
    m0 = vec_mergeh(v0, v1);                                           \
    m1 = vec_mergel(v0, v1);                                           \
    Tvec ab1 = vec_mergeh(m0, m1);                                     \
    Tvec cd1 = vec_mergel(m0, m1);                                     \
    a = vec_mergesqh(ab0, ab1);                                        \
    b = vec_mergesql(ab0, ab1);                                        \
    c = vec_mergesqh(cd0, cd1);                                        \
    d = vec_mergesql(cd0, cd1);                                        \
}
VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
VSX_IMPL_ST_DINTERLEAVE_16(short,  vec_short8)
// 2 and 4 channel deinterleave for 4 lanes of 32-bit
#define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec)                           \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
{                                                                      \
    a = vsx_ld(0, ptr);                                                \
    b = vsx_ld(4, ptr);                                                \
    Tvec m0 = vec_mergeh(a, b);                                        \
    Tvec m1 = vec_mergel(a, b);                                        \
    a = vec_mergeh(m0, m1);                                            \
    b = vec_mergel(m0, m1);                                            \
}                                                                      \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
                                      Tvec& c, Tvec& d)                \
{                                                                      \
    Tvec v0 = vsx_ld(0, ptr);                                          \
    Tvec v1 = vsx_ld(4, ptr);                                          \
    Tvec v2 = vsx_ld(8, ptr);                                          \
    Tvec v3 = vsx_ld(12, ptr);                                         \
    Tvec m0 = vec_mergeh(v0, v2);                                      \
    Tvec m1 = vec_mergeh(v1, v3);                                      \
    a = vec_mergeh(m0, m1);                                            \
    b = vec_mergel(m0, m1);                                            \
    m0 = vec_mergel(v0, v2);                                           \
    m1 = vec_mergel(v1, v3);                                           \
    c = vec_mergeh(m0, m1);                                            \
    d = vec_mergel(m0, m1);                                            \
}
VSX_IMPL_ST_DINTERLEAVE_32(uint,  vec_uint4)
VSX_IMPL_ST_DINTERLEAVE_32(int,   vec_int4)
VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)
// 2 and 4 channel interleave and deinterleave for 2 lanes of 64-bit
#define VSX_IMPL_ST_D_INTERLEAVE_64(Tp, Tvec, ld_func, st_func)            \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr) \
{                                                                          \
    st_func(vec_mergeh(a, b), 0, ptr);                                     \
    st_func(vec_mergel(a, b), 2, ptr);                                     \
}                                                                          \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,          \
                                    const Tvec& c, const Tvec& d, Tp* ptr) \
{                                                                          \
    st_func(vec_mergeh(a, b), 0, ptr);                                     \
    st_func(vec_mergeh(c, d), 2, ptr);                                     \
    st_func(vec_mergel(a, b), 4, ptr);                                     \
    st_func(vec_mergel(c, d), 6, ptr);                                     \
}                                                                          \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)     \
{                                                                          \
    Tvec m0 = ld_func(0, ptr);                                             \
    Tvec m1 = ld_func(2, ptr);                                             \
    a = vec_mergeh(m0, m1);                                                \
    b = vec_mergel(m0, m1);                                                \
}                                                                          \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,     \
                                      Tvec& c, Tvec& d)                    \
{                                                                          \
    Tvec v0 = ld_func(0, ptr);                                             \
    Tvec v1 = ld_func(2, ptr);                                             \
    Tvec v2 = ld_func(4, ptr);                                             \
    Tvec v3 = ld_func(6, ptr);                                             \
    a = vec_mergeh(v0, v2);                                                \
    b = vec_mergel(v0, v2);                                                \
    c = vec_mergeh(v1, v3);                                                \
    d = vec_mergel(v1, v3);                                                \
}
VSX_IMPL_ST_D_INTERLEAVE_64(int64,  vec_dword2,  vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld,  vsx_st)
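// 3-channel interleave and deinterleave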
#define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec)                                                   \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                 \
                                    const Tvec& c, Tp* ptr)                                       \
{                                                                                                 \
    static const vec_uchar16 a12 = {0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5};         \
    static const vec_uchar16 a123 = {0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15};    \
    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr);                                       \
    static const vec_uchar16 b12 = {21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26};       \
    static const vec_uchar16 b123 = {0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15};    \
    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 16, ptr);                                      \
    static const vec_uchar16 c12 = {0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0};    \
    static const vec_uchar16 c123 = {26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31};   \
    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 32, ptr);                                      \
}                                                                                                 \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                   \
{                                                                                                 \
    Tvec v1 = vsx_ld(0, ptr);                                                                     \
    Tvec v2 = vsx_ld(16, ptr);                                                                    \
    Tvec v3 = vsx_ld(32, ptr);                                                                    \
    static const vec_uchar16 a12_perm = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0};  \
    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29};  \
    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);                                      \
    static const vec_uchar16 b12_perm = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0}; \
    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30};  \
    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);                                      \
    static const vec_uchar16 c12_perm = {2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0};  \
    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31};  \
    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);                                      \
}
VSX_IMPL_ST_INTERLEAVE_3CH_16(uchar, vec_uchar16)
VSX_IMPL_ST_INTERLEAVE_3CH_16(schar, vec_char16)
#define VSX_IMPL_ST_INTERLEAVE_3CH_8(Tp, Tvec)                                                    \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                 \
                                    const Tvec& c, Tp* ptr)                                       \
{                                                                                                 \
    static const vec_uchar16 a12 = {0, 1, 16, 17, 0, 0, 2, 3, 18, 19, 0, 0, 4, 5, 20, 21};        \
    static const vec_uchar16 a123 = {0, 1, 2, 3, 16, 17, 6, 7, 8, 9, 18, 19, 12, 13, 14, 15};     \
    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr);                                       \
    static const vec_uchar16 b12 = {0, 0, 6, 7, 22, 23, 0, 0, 8, 9, 24, 25, 0, 0, 10, 11};        \
    static const vec_uchar16 b123 = {20, 21, 2, 3, 4, 5, 22, 23, 8, 9, 10, 11, 24, 25, 14, 15};   \
    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 8, ptr);                                       \
    static const vec_uchar16 c12 = {26, 27, 0, 0, 12, 13, 28, 29, 0, 0, 14, 15, 30, 31, 0, 0};    \
    static const vec_uchar16 c123 = {0, 1, 26, 27, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 30, 31};   \
    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 16, ptr);                                      \
}                                                                                                 \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                   \
{                                                                                                 \
    Tvec v1 = vsx_ld(0, ptr);                                                                     \
    Tvec v2 = vsx_ld(8, ptr);                                                                     \
    Tvec v3 = vsx_ld(16, ptr);                                                                    \
    static const vec_uchar16 a12_perm = {0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31, 0, 0, 0, 0}; \
    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 26, 27};  \
    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);                                      \
    static const vec_uchar16 b12_perm = {2, 3, 8, 9, 14, 15, 20, 21, 26, 27, 0, 0, 0, 0, 0, 0};   \
    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 22, 23, 28, 29};  \
    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);                                      \
    static const vec_uchar16 c12_perm = {4, 5, 10, 11, 16, 17, 22, 23, 28, 29, 0, 0, 0, 0, 0, 0}; \
    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 24, 25, 30, 31};  \
    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);                                      \
}
VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE_3CH_8(short,  vec_short8)
#define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec)                                                     \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                  \
                                    const Tvec& c, Tp* ptr)                                        \
{                                                                                                  \
    Tvec hbc = vec_mergeh(b, c);                                                                   \
    static const vec_uchar16 ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7};      \
    vsx_st(vec_perm(a, hbc, ahbc), 0, ptr);                                                        \
    Tvec lab = vec_mergel(a, b);                                                                   \
    vsx_st(vec_sld(lab, hbc, 8), 4, ptr);                                                          \
    static const vec_uchar16 clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15};\
    vsx_st(vec_perm(c, lab, clab), 8, ptr);                                                        \
}                                                                                                  \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                    \
{                                                                                                  \
    Tvec v1 = vsx_ld(0, ptr);                                                                      \
    Tvec v2 = vsx_ld(4, ptr);                                                                      \
    Tvec v3 = vsx_ld(8, ptr);                                                                      \
    static const vec_uchar16 flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31};   \
    a = vec_perm(v1, vec_sld(v3, v2, 8), flp);                                                     \
    static const vec_uchar16 flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19};  \
    b = vec_perm(v2, vec_sld(v1, v3, 8), flp2);                                                    \
    c = vec_perm(vec_sld(v2, v1, 8), v3, flp);                                                     \
}
VSX_IMPL_ST_INTERLEAVE_3CH_4(uint,  vec_uint4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(int,   vec_int4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)
#define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func)  \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
                                    const Tvec& c, Tp* ptr)       \
{                                                                 \
    st_func(vec_mergeh(a, b), 0, ptr);                            \
    st_func(vec_permi(c, a, 1), 2, ptr);                          \
    st_func(vec_mergel(b, c), 4, ptr);                            \
}                                                                 \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a,     \
                                      Tvec& b, Tvec& c)           \
{                                                                 \
    Tvec v1 = ld_func(0, ptr);                                    \
    Tvec v2 = ld_func(2, ptr);                                    \
    Tvec v3 = ld_func(4, ptr);                                    \
    a = vec_permi(v1, v2, 1);                                     \
    b = vec_permi(v1, v3, 2);                                     \
    c = vec_permi(v2, v3, 1);                                     \
}
VSX_IMPL_ST_INTERLEAVE_3CH_2(int64,  vec_dword2,  vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld,  vsx_st)