#ifndef OPENCV_CORE_HAL_MSA_MACROS_H
#define OPENCV_CORE_HAL_MSA_MACROS_H

/* MSA intrinsics and the fixed-width integer types used below. */
#include <msa.h>
#include <stdint.h>

/* Define 64-bit (half-width) vector types. */
typedef signed char v8i8 __attribute__ ((vector_size(8), aligned(8)));
typedef unsigned char v8u8 __attribute__ ((vector_size(8), aligned(8)));
typedef short v4i16 __attribute__ ((vector_size(8), aligned(8)));
typedef unsigned short v4u16 __attribute__ ((vector_size(8), aligned(8)));
typedef int v2i32 __attribute__ ((vector_size(8), aligned(8)));
typedef unsigned int v2u32 __attribute__ ((vector_size(8), aligned(8)));
typedef long long v1i64 __attribute__ ((vector_size(8), aligned(8)));
typedef unsigned long long v1u64 __attribute__ ((vector_size(8), aligned(8)));
typedef float v2f32 __attribute__ ((vector_size(8), aligned(8)));
typedef double v1f64 __attribute__ ((vector_size(8), aligned(8)));
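/* Usage sketch (illustrative): these are GCC generic vector types, so lanes
 * can be read with [] and the types support element-wise operators directly,
 * assuming a GCC-compatible MIPS MSA toolchain:
 *
 *   v2i32 v = {1, 2};      // brace-initialize a 64-bit vector
 *   int s = v[0] + v[1];   // s == 3
 */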
#define msa_ld1_s8(__a) (*((v8i8*)(__a)))
#define msa_ld1_s16(__a) (*((v4i16*)(__a)))
#define msa_ld1_s32(__a) (*((v2i32*)(__a)))
#define msa_ld1_s64(__a) (*((v1i64*)(__a)))
#define msa_ld1_u8(__a) (*((v8u8*)(__a)))
#define msa_ld1_u16(__a) (*((v4u16*)(__a)))
#define msa_ld1_u32(__a) (*((v2u32*)(__a)))
#define msa_ld1_u64(__a) (*((v1u64*)(__a)))
#define msa_ld1_f32(__a) (*((v2f32*)(__a)))
#define msa_ld1_f64(__a) (*((v1f64*)(__a)))

#define msa_ld1q_s8(__a) ((v16i8)__builtin_msa_ld_b(__a, 0))
#define msa_ld1q_s16(__a) ((v8i16)__builtin_msa_ld_h(__a, 0))
#define msa_ld1q_s32(__a) ((v4i32)__builtin_msa_ld_w(__a, 0))
#define msa_ld1q_s64(__a) ((v2i64)__builtin_msa_ld_d(__a, 0))
#define msa_ld1q_u8(__a) ((v16u8)__builtin_msa_ld_b(__a, 0))
#define msa_ld1q_u16(__a) ((v8u16)__builtin_msa_ld_h(__a, 0))
#define msa_ld1q_u32(__a) ((v4u32)__builtin_msa_ld_w(__a, 0))
#define msa_ld1q_u64(__a) ((v2u64)__builtin_msa_ld_d(__a, 0))
#define msa_ld1q_f32(__a) ((v4f32)__builtin_msa_ld_w(__a, 0))
#define msa_ld1q_f64(__a) ((v2f64)__builtin_msa_ld_d(__a, 0))
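/* Usage sketch (illustrative): the msa_ld1q / msa_st1q macros wrap the MSA
 * 128-bit load/store instructions:
 *
 *   int32_t buf[4] = {1, 2, 3, 4};
 *   v4i32 v = msa_ld1q_s32(buf);   // v = {1, 2, 3, 4}
 *   msa_st1q_s32(buf, v);          // write the vector back
 */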
#define msa_st1_s8(__a, __b) (*((v8i8*)(__a)) = __b)
#define msa_st1_s16(__a, __b) (*((v4i16*)(__a)) = __b)
#define msa_st1_s32(__a, __b) (*((v2i32*)(__a)) = __b)
#define msa_st1_s64(__a, __b) (*((v1i64*)(__a)) = __b)
#define msa_st1_u8(__a, __b) (*((v8u8*)(__a)) = __b)
#define msa_st1_u16(__a, __b) (*((v4u16*)(__a)) = __b)
#define msa_st1_u32(__a, __b) (*((v2u32*)(__a)) = __b)
#define msa_st1_u64(__a, __b) (*((v1u64*)(__a)) = __b)
#define msa_st1_f32(__a, __b) (*((v2f32*)(__a)) = __b)
#define msa_st1_f64(__a, __b) (*((v1f64*)(__a)) = __b)

#define msa_st1q_s8(__a, __b) (__builtin_msa_st_b((v16i8)(__b), __a, 0))
#define msa_st1q_s16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0))
#define msa_st1q_s32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
#define msa_st1q_s64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
#define msa_st1q_u8(__a, __b) (__builtin_msa_st_b((v16i8)(__b), __a, 0))
#define msa_st1q_u16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0))
#define msa_st1q_u32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
#define msa_st1q_u64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
#define msa_st1q_f32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
#define msa_st1q_f64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))

#define msa_st1_lane_s8(__a, __b, __c) (*((int8_t*)(__a)) = __b[__c])
#define msa_st1_lane_s16(__a, __b, __c) (*((int16_t*)(__a)) = __b[__c])
#define msa_st1_lane_s32(__a, __b, __c) (*((int32_t*)(__a)) = __b[__c])
#define msa_st1_lane_s64(__a, __b, __c) (*((int64_t*)(__a)) = __b[__c])
#define msa_st1_lane_u8(__a, __b, __c) (*((uint8_t*)(__a)) = __b[__c])
#define msa_st1_lane_u16(__a, __b, __c) (*((uint16_t*)(__a)) = __b[__c])
#define msa_st1_lane_u32(__a, __b, __c) (*((uint32_t*)(__a)) = __b[__c])
#define msa_st1_lane_u64(__a, __b, __c) (*((uint64_t*)(__a)) = __b[__c])
#define msa_st1_lane_f32(__a, __b, __c) (*((float*)(__a)) = __b[__c])
#define msa_st1_lane_f64(__a, __b, __c) (*((double*)(__a)) = __b[__c])
#define msa_st1q_lane_s8(__a, __b, __c) (*((int8_t*)(__a)) = (int8_t)__builtin_msa_copy_s_b(__b, __c))
#define msa_st1q_lane_s16(__a, __b, __c) (*((int16_t*)(__a)) = (int16_t)__builtin_msa_copy_s_h(__b, __c))
#define msa_st1q_lane_s32(__a, __b, __c) (*((int32_t*)(__a)) = __builtin_msa_copy_s_w(__b, __c))
#define msa_st1q_lane_s64(__a, __b, __c) (*((int64_t*)(__a)) = __builtin_msa_copy_s_d(__b, __c))
#define msa_st1q_lane_u8(__a, __b, __c) (*((uint8_t*)(__a)) = (uint8_t)__builtin_msa_copy_u_b((v16i8)(__b), __c))
#define msa_st1q_lane_u16(__a, __b, __c) (*((uint16_t*)(__a)) = (uint16_t)__builtin_msa_copy_u_h((v8i16)(__b), __c))
#define msa_st1q_lane_u32(__a, __b, __c) (*((uint32_t*)(__a)) = __builtin_msa_copy_u_w((v4i32)(__b), __c))
#define msa_st1q_lane_u64(__a, __b, __c) (*((uint64_t*)(__a)) = __builtin_msa_copy_u_d((v2i64)(__b), __c))
#define msa_st1q_lane_f32(__a, __b, __c) (*((float*)(__a)) = __b[__c])
#define msa_st1q_lane_f64(__a, __b, __c) (*((double*)(__a)) = __b[__c])
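/* Usage sketch (illustrative): storing one lane of a 128-bit vector;
 * msa_dupq_n_s16 is defined further below in this header:
 *
 *   v8i16 v = msa_dupq_n_s16(7);
 *   int16_t x;
 *   msa_st1q_lane_s16(&x, v, 3);   // x == 7
 */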
#define msa_dup_n_s8(__a) ((v8i8)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0))
#define msa_dup_n_s16(__a) ((v4i16)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0))
#define msa_dup_n_s32(__a) ((v2i32){__a, __a})
#define msa_dup_n_s64(__a) ((v1i64){__a})
#define msa_dup_n_u8(__a) ((v8u8)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0))
#define msa_dup_n_u16(__a) ((v4u16)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0))
#define msa_dup_n_u32(__a) ((v2u32){__a, __a})
#define msa_dup_n_u64(__a) ((v1u64){__a})
#define msa_dup_n_f32(__a) ((v2f32){__a, __a})
#define msa_dup_n_f64(__a) ((v1f64){__a})

#define msa_dupq_n_s8(__a) (__builtin_msa_fill_b((int32_t)(__a)))
#define msa_dupq_n_s16(__a) (__builtin_msa_fill_h((int32_t)(__a)))
#define msa_dupq_n_s32(__a) (__builtin_msa_fill_w((int32_t)(__a)))
#define msa_dupq_n_s64(__a) (__builtin_msa_fill_d((int64_t)(__a)))
#define msa_dupq_n_u8(__a) ((v16u8)__builtin_msa_fill_b((int32_t)(__a)))
#define msa_dupq_n_u16(__a) ((v8u16)__builtin_msa_fill_h((int32_t)(__a)))
#define msa_dupq_n_u32(__a) ((v4u32)__builtin_msa_fill_w((int32_t)(__a)))
#define msa_dupq_n_u64(__a) ((v2u64)__builtin_msa_fill_d((int64_t)(__a)))
#define msa_dupq_n_f32(__a) ((v4f32){__a, __a, __a, __a})
#define msa_dupq_n_f64(__a) ((v2f64){__a, __a})
#define msa_dupq_lane_s8(__a, __b) (__builtin_msa_splat_b(__a, __b))
#define msa_dupq_lane_s16(__a, __b) (__builtin_msa_splat_h(__a, __b))
#define msa_dupq_lane_s32(__a, __b) (__builtin_msa_splat_w(__a, __b))
#define msa_dupq_lane_s64(__a, __b) (__builtin_msa_splat_d(__a, __b))
#define msa_dupq_lane_u8(__a, __b) ((v16u8)__builtin_msa_splat_b((v16i8)(__a), __b))
#define msa_dupq_lane_u16(__a, __b) ((v8u16)__builtin_msa_splat_h((v8i16)(__a), __b))
#define msa_dupq_lane_u32(__a, __b) ((v4u32)__builtin_msa_splat_w((v4i32)(__a), __b))
#define msa_dupq_lane_u64(__a, __b) ((v2u64)__builtin_msa_splat_d((v2i64)(__a), __b))
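/* Usage sketch (illustrative): broadcast a scalar (dup_n / dupq_n) or an
 * existing lane (dupq_lane):
 *
 *   v4i32 v = msa_dupq_n_s32(5);        // {5, 5, 5, 5}
 *   v4i32 w = msa_dupq_lane_s32(v, 2);  // broadcast lane 2: {5, 5, 5, 5}
 */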
#define msa_create_s8(__a) ((v8i8)((uint64_t)(__a)))
#define msa_create_s16(__a) ((v4i16)((uint64_t)(__a)))
#define msa_create_s32(__a) ((v2i32)((uint64_t)(__a)))
#define msa_create_s64(__a) ((v1i64)((uint64_t)(__a)))
#define msa_create_u8(__a) ((v8u8)((uint64_t)(__a)))
#define msa_create_u16(__a) ((v4u16)((uint64_t)(__a)))
#define msa_create_u32(__a) ((v2u32)((uint64_t)(__a)))
#define msa_create_u64(__a) ((v1u64)((uint64_t)(__a)))
#define msa_create_f32(__a) ((v2f32)((uint64_t)(__a)))
#define msa_create_f64(__a) ((v1f64)((uint64_t)(__a)))
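/* Usage sketch (illustrative): msa_create reinterprets the low 64 bits of a
 * scalar as a 64-bit vector; on a little-endian target:
 *
 *   v4i16 v = msa_create_s16(0x0004000300020001ULL);  // {1, 2, 3, 4}
 */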
#define msa_movl_s8(__a) \
  ((v8i16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
           (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
#define msa_movl_u8(__a) \
  ((v8u16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
           (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
#define msa_movl_s16(__a) ((v4i32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
#define msa_movl_s32(__a) ((v2i64){(__a)[0], (__a)[1]})
#define msa_movl_u16(__a) ((v4u32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
#define msa_movl_u32(__a) ((v2u64){(__a)[0], (__a)[1]})
#define msa_movn_s16(__a) \
({ \
  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \
  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
})

#define msa_movn_s32(__a) \
({ \
  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
})

#define msa_movn_s64(__a) \
({ \
  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
})

#define msa_movn_u16(__a) \
({ \
  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \
  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
})

#define msa_movn_u32(__a) \
({ \
  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
})

#define msa_movn_u64(__a) \
({ \
  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
})

#define msa_qmovn_s16(__a) \
({ \
  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7)); \
  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
})

#define msa_qmovn_s32(__a) \
({ \
  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15)); \
  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
})

#define msa_qmovn_s64(__a) \
({ \
  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31)); \
  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
})

#define msa_qmovn_u16(__a) \
({ \
  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7)); \
  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
})

#define msa_qmovn_u32(__a) \
({ \
  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15)); \
  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
})

#define msa_qmovn_u64(__a) \
({ \
  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31)); \
  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
})

#define msa_qmovun_s16(__a) \
({ \
  v8i16 __d = __builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \
  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
})

#define msa_qmovun_s32(__a) \
({ \
  v4i32 __d = __builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \
  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
})

#define msa_qmovun_s64(__a) \
({ \
  v2i64 __d = __builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)); \
  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \
  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
})
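/* Usage sketch (illustrative): qmovn narrows with signed saturation, qmovun
 * narrows a signed source with unsigned saturation:
 *
 *   v8i16 v = msa_dupq_n_s16(300);
 *   v8i8 n = msa_qmovn_s16(v);    // every lane == 127
 *   v8u8 u = msa_qmovun_s16(v);   // every lane == 255
 */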
#define msa_shrn_n_s16(__a, __b) \
({ \
  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__b))); \
  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
})

#define msa_shrn_n_s32(__a, __b) \
({ \
  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__b))); \
  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
})

#define msa_shrn_n_s64(__a, __b) \
({ \
  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__b))); \
  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
})

#define msa_shrn_n_u16(__a, __b) \
({ \
  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__b))); \
  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
})

#define msa_shrn_n_u32(__a, __b) \
({ \
  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__b))); \
  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
})

#define msa_shrn_n_u64(__a, __b) \
({ \
  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__b))); \
  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
})

#define msa_rshrn_n_s16(__a, __b) \
({ \
  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)(__b))); \
  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
})

#define msa_rshrn_n_s32(__a, __b) \
({ \
  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)(__b))); \
  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
})

#define msa_rshrn_n_s64(__a, __b) \
({ \
  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)(__b))); \
  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
})

#define msa_rshrn_n_u16(__a, __b) \
({ \
  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)(__b))); \
  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
})

#define msa_rshrn_n_u32(__a, __b) \
({ \
  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)(__b))); \
  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
})

#define msa_rshrn_n_u64(__a, __b) \
({ \
  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)(__b))); \
  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
})

#define msa_qrshrn_n_s16(__a, __b) \
({ \
  v8i16 __d = __builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__b)), 7); \
  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__d); \
  (v8i8)__builtin_msa_copy_s_d((v2i64)__e, 0); \
})

#define msa_qrshrn_n_s32(__a, __b) \
({ \
  v4i32 __d = __builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__b)), 15); \
  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__d); \
  (v4i16)__builtin_msa_copy_s_d((v2i64)__e, 0); \
})

#define msa_qrshrn_n_s64(__a, __b) \
({ \
  v2i64 __d = __builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__b)), 31); \
  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__d); \
  (v2i32)__builtin_msa_copy_s_d((v2i64)__e, 0); \
})

#define msa_qrshrn_n_u16(__a, __b) \
({ \
  v8u16 __d = __builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__b)), 7); \
  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__d); \
  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
})

#define msa_qrshrn_n_u32(__a, __b) \
({ \
  v4u32 __d = __builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__b)), 15); \
  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__d); \
  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
})

#define msa_qrshrn_n_u64(__a, __b) \
({ \
  v2u64 __d = __builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__b)), 31); \
  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__d); \
  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
})

#define msa_qrshrun_n_s16(__a, __b) \
({ \
  v8i16 __d = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)), (int)(__b)); \
  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \
  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
})

#define msa_qrshrun_n_s32(__a, __b) \
({ \
  v4i32 __d = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)), (int)(__b)); \
  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \
  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
})

#define msa_qrshrun_n_s64(__a, __b) \
({ \
  v2i64 __d = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)), (int)(__b)); \
  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \
  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
})
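/* Usage sketch (illustrative): the shrn family shifts right and narrows;
 * plain forms truncate, the r-forms round, the q-forms also saturate:
 *
 *   v8i16 v = msa_dupq_n_s16(130);
 *   v8i8 a = msa_shrn_n_s16(v, 2);    // 130 >> 2 == 32 in every lane
 *   v8i8 b = msa_rshrn_n_s16(v, 2);   // rounded: (130 + 2) >> 2 == 33
 */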
#define msa_pack_s16(__a, __b) (__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a)))
#define msa_pack_s32(__a, __b) (__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a)))
#define msa_pack_s64(__a, __b) (__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a)))
#define msa_pack_u16(__a, __b) ((v16u8)__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a)))
#define msa_pack_u32(__a, __b) ((v8u16)__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a)))
#define msa_pack_u64(__a, __b) ((v4u32)__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a)))

#define msa_qpack_s16(__a, __b) \
(__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h((v8i16)(__b), 7), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7)))
#define msa_qpack_s32(__a, __b) \
(__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w((v4i32)(__b), 15), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15)))
#define msa_qpack_s64(__a, __b) \
(__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d((v2i64)(__b), 31), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31)))
#define msa_qpack_u16(__a, __b) \
((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__b), 7), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7)))
#define msa_qpack_u32(__a, __b) \
((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__b), 15), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15)))
#define msa_qpack_u64(__a, __b) \
((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__b), 31), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31)))

#define msa_qpacku_s16(__a, __b) \
((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b))), 7), \
                              (v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a))), 7)))
#define msa_qpacku_s32(__a, __b) \
((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b))), 15), \
                              (v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a))), 15)))
#define msa_qpacku_s64(__a, __b) \
((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b))), 31), \
                              (v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a))), 31)))

#define msa_packr_s16(__a, __b, __c) \
(__builtin_msa_pckev_b((v16i8)__builtin_msa_srai_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__c))))
#define msa_packr_s32(__a, __b, __c) \
(__builtin_msa_pckev_h((v8i16)__builtin_msa_srai_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__c))))
#define msa_packr_s64(__a, __b, __c) \
(__builtin_msa_pckev_w((v4i32)__builtin_msa_srai_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__c))))
#define msa_packr_u16(__a, __b, __c) \
((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srli_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__c))))
#define msa_packr_u32(__a, __b, __c) \
((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srli_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__c))))
#define msa_packr_u64(__a, __b, __c) \
((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srli_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__c))))

#define msa_rpackr_s16(__a, __b, __c) \
(__builtin_msa_pckev_b((v16i8)__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)(__c))))
#define msa_rpackr_s32(__a, __b, __c) \
(__builtin_msa_pckev_h((v8i16)__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)(__c))))
#define msa_rpackr_s64(__a, __b, __c) \
(__builtin_msa_pckev_w((v4i32)__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)(__c))))
#define msa_rpackr_u16(__a, __b, __c) \
((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c))))
#define msa_rpackr_u32(__a, __b, __c) \
((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c))))
#define msa_rpackr_u64(__a, __b, __c) \
((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c))))

#define msa_qrpackr_s16(__a, __b, __c) \
(__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), 7), \
                       (v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__c)), 7)))
#define msa_qrpackr_s32(__a, __b, __c) \
(__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), 15), \
                       (v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__c)), 15)))
#define msa_qrpackr_s64(__a, __b, __c) \
(__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), 31), \
                       (v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__c)), 31)))
#define msa_qrpackr_u16(__a, __b, __c) \
((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), 7), \
                              (v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c)), 7)))
#define msa_qrpackr_u32(__a, __b, __c) \
((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), 15), \
                              (v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c)), 15)))
#define msa_qrpackr_u64(__a, __b, __c) \
((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), 31), \
                              (v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c)), 31)))
#define msa_qrpackru_s16(__a, __b, __c) \
({ \
  v8i16 __d = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)), (int)(__c)); \
  v8i16 __e = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b)), (int)(__c)); \
  (v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)__e, 7), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \
})

#define msa_qrpackru_s32(__a, __b, __c) \
({ \
  v4i32 __d = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)), (int)(__c)); \
  v4i32 __e = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b)), (int)(__c)); \
  (v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)__e, 15), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \
})

#define msa_qrpackru_s64(__a, __b, __c) \
({ \
  v2i64 __d = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)), (int)(__c)); \
  v2i64 __e = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b)), (int)(__c)); \
  (v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)__e, 31), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \
})
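/* Usage sketch (illustrative): the pack family narrows two full vectors into
 * one by keeping the even-indexed narrow elements (pckev), optionally with
 * shifting, rounding and saturation:
 *
 *   v8i16 lo = msa_dupq_n_s16(1), hi = msa_dupq_n_s16(2);
 *   v16i8 p = msa_pack_s16(lo, hi);   // low 8 bytes == 1, high 8 bytes == 2
 */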
#define msa_minq_s8(__a, __b) (__builtin_msa_min_s_b(__a, __b))
#define msa_minq_s16(__a, __b) (__builtin_msa_min_s_h(__a, __b))
#define msa_minq_s32(__a, __b) (__builtin_msa_min_s_w(__a, __b))
#define msa_minq_s64(__a, __b) (__builtin_msa_min_s_d(__a, __b))
#define msa_minq_u8(__a, __b) ((v16u8)__builtin_msa_min_u_b(__a, __b))
#define msa_minq_u16(__a, __b) ((v8u16)__builtin_msa_min_u_h(__a, __b))
#define msa_minq_u32(__a, __b) ((v4u32)__builtin_msa_min_u_w(__a, __b))
#define msa_minq_u64(__a, __b) ((v2u64)__builtin_msa_min_u_d(__a, __b))
#define msa_minq_f32(__a, __b) (__builtin_msa_fmin_w(__a, __b))
#define msa_minq_f64(__a, __b) (__builtin_msa_fmin_d(__a, __b))

#define msa_maxq_s8(__a, __b) (__builtin_msa_max_s_b(__a, __b))
#define msa_maxq_s16(__a, __b) (__builtin_msa_max_s_h(__a, __b))
#define msa_maxq_s32(__a, __b) (__builtin_msa_max_s_w(__a, __b))
#define msa_maxq_s64(__a, __b) (__builtin_msa_max_s_d(__a, __b))
#define msa_maxq_u8(__a, __b) ((v16u8)__builtin_msa_max_u_b(__a, __b))
#define msa_maxq_u16(__a, __b) ((v8u16)__builtin_msa_max_u_h(__a, __b))
#define msa_maxq_u32(__a, __b) ((v4u32)__builtin_msa_max_u_w(__a, __b))
#define msa_maxq_u64(__a, __b) ((v2u64)__builtin_msa_max_u_d(__a, __b))
#define msa_maxq_f32(__a, __b) (__builtin_msa_fmax_w(__a, __b))
#define msa_maxq_f64(__a, __b) (__builtin_msa_fmax_d(__a, __b))

#define MSA_TPV_REINTERPRET(_Tpv, Vec) ((_Tpv)(Vec))

#define msa_hadd_s16(__a, __b) (__builtin_msa_hadd_s_h((v16i8)(__a), (v16i8)(__b)))
#define msa_hadd_s32(__a, __b) (__builtin_msa_hadd_s_w((v8i16)(__a), (v8i16)(__b)))
#define msa_hadd_s64(__a, __b) (__builtin_msa_hadd_s_d((v4i32)(__a), (v4i32)(__b)))

#define msa_pckev_s8(__a, __b) (__builtin_msa_pckev_b((v16i8)(__a), (v16i8)(__b)))
#define msa_pckev_s16(__a, __b) (__builtin_msa_pckev_h((v8i16)(__a), (v8i16)(__b)))
#define msa_pckev_s32(__a, __b) (__builtin_msa_pckev_w((v4i32)(__a), (v4i32)(__b)))
#define msa_pckev_s64(__a, __b) (__builtin_msa_pckev_d((v2i64)(__a), (v2i64)(__b)))

#define msa_pckod_s8(__a, __b) (__builtin_msa_pckod_b((v16i8)(__a), (v16i8)(__b)))
#define msa_pckod_s16(__a, __b) (__builtin_msa_pckod_h((v8i16)(__a), (v8i16)(__b)))
#define msa_pckod_s32(__a, __b) (__builtin_msa_pckod_w((v4i32)(__a), (v4i32)(__b)))
#define msa_pckod_s64(__a, __b) (__builtin_msa_pckod_d((v2i64)(__a), (v2i64)(__b)))
#ifdef _MIPSEB
#define LANE_IMM0_1(x) (0b1 - ((x) & 0b1))
#define LANE_IMM0_3(x) (0b11 - ((x) & 0b11))
#define LANE_IMM0_7(x) (0b111 - ((x) & 0b111))
#define LANE_IMM0_15(x) (0b1111 - ((x) & 0b1111))
#else
#define LANE_IMM0_1(x) ((x) & 0b1)
#define LANE_IMM0_3(x) ((x) & 0b11)
#define LANE_IMM0_7(x) ((x) & 0b111)
#define LANE_IMM0_15(x) ((x) & 0b1111)
#endif
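/* The LANE_IMM0 helpers translate NEON-style lane numbers into MSA element
 * indices. On big-endian targets (_MIPSEB) the element order is mirrored,
 * e.g. LANE_IMM0_3(0) == 3, so "lane 0" addresses the last element. */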
#define msa_get_lane_u8(__a, __b) ((uint8_t)(__a)[LANE_IMM0_7(__b)])
#define msa_get_lane_s8(__a, __b) ((int8_t)(__a)[LANE_IMM0_7(__b)])
#define msa_get_lane_u16(__a, __b) ((uint16_t)(__a)[LANE_IMM0_3(__b)])
#define msa_get_lane_s16(__a, __b) ((int16_t)(__a)[LANE_IMM0_3(__b)])
#define msa_get_lane_u32(__a, __b) ((uint32_t)(__a)[LANE_IMM0_1(__b)])
#define msa_get_lane_s32(__a, __b) ((int32_t)(__a)[LANE_IMM0_1(__b)])
#define msa_get_lane_f32(__a, __b) ((float)(__a)[LANE_IMM0_1(__b)])
#define msa_get_lane_s64(__a, __b) ((int64_t)(__a)[LANE_IMM0_1(__b)])
#define msa_get_lane_u64(__a, __b) ((uint64_t)(__a)[LANE_IMM0_1(__b)])
#define msa_get_lane_f64(__a, __b) ((double)(__a)[LANE_IMM0_1(__b)])
#define msa_getq_lane_u8(__a, imm0_15) ((uint8_t)__builtin_msa_copy_u_b((v16i8)(__a), imm0_15))
#define msa_getq_lane_s8(__a, imm0_15) ((int8_t)__builtin_msa_copy_s_b(__a, imm0_15))
#define msa_getq_lane_u16(__a, imm0_7) ((uint16_t)__builtin_msa_copy_u_h((v8i16)(__a), imm0_7))
#define msa_getq_lane_s16(__a, imm0_7) ((int16_t)__builtin_msa_copy_s_h(__a, imm0_7))
#define msa_getq_lane_u32(__a, imm0_3) __builtin_msa_copy_u_w((v4i32)(__a), imm0_3)
#define msa_getq_lane_s32 __builtin_msa_copy_s_w
#define msa_getq_lane_f32(__a, __b) ((float)(__a)[LANE_IMM0_3(__b)])
#define msa_getq_lane_f64(__a, __b) ((double)(__a)[LANE_IMM0_1(__b)])
#if (__mips == 64)
#define msa_getq_lane_u64(__a, imm0_1) __builtin_msa_copy_u_d((v2i64)(__a), imm0_1)
#define msa_getq_lane_s64 __builtin_msa_copy_s_d
#else
#define msa_getq_lane_u64(__a, imm0_1) ((uint64_t)(__a)[LANE_IMM0_1(imm0_1)])
#define msa_getq_lane_s64(__a, imm0_1) ((int64_t)(__a)[LANE_IMM0_1(imm0_1)])
#endif
#if (__mips == 64)
#define __COMBINE_64_64(__TYPE, a, b) ((__TYPE)((v2u64){((v1u64)(a))[0], ((v1u64)(b))[0]}))
#else
#define __COMBINE_64_64(__TYPE, a, b) ((__TYPE)((v4u32){((v2u32)(a))[0], ((v2u32)(a))[1], \
                                                        ((v2u32)(b))[0], ((v2u32)(b))[1]}))
#endif
#define msa_combine_s8(__a, __b) __COMBINE_64_64(v16i8, __a, __b)
#define msa_combine_s16(__a, __b) __COMBINE_64_64(v8i16, __a, __b)
#define msa_combine_s32(__a, __b) __COMBINE_64_64(v4i32, __a, __b)
#define msa_combine_s64(__a, __b) __COMBINE_64_64(v2i64, __a, __b)
#define msa_combine_f32(__a, __b) __COMBINE_64_64(v4f32, __a, __b)
#define msa_combine_u8(__a, __b) __COMBINE_64_64(v16u8, __a, __b)
#define msa_combine_u16(__a, __b) __COMBINE_64_64(v8u16, __a, __b)
#define msa_combine_u32(__a, __b) __COMBINE_64_64(v4u32, __a, __b)
#define msa_combine_u64(__a, __b) __COMBINE_64_64(v2u64, __a, __b)
#define msa_combine_f64(__a, __b) __COMBINE_64_64(v2f64, __a, __b)
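/* Usage sketch (illustrative): msa_combine concatenates two 64-bit halves,
 * first argument in the low half:
 *
 *   v2i32 lo = msa_dup_n_s32(1), hi = msa_dup_n_s32(2);
 *   v4i32 v = msa_combine_s32(lo, hi);   // {1, 1, 2, 2} (little-endian)
 */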
#if (__mips == 64)
#define __GET_LOW(__TYPE, a) ((__TYPE)((v1u64)(__builtin_msa_copy_u_d((v2i64)(a), 0))))
#define __GET_HIGH(__TYPE, a) ((__TYPE)((v1u64)(__builtin_msa_copy_u_d((v2i64)(a), 1))))
#else
#define __GET_LOW(__TYPE, a) ((__TYPE)(((v2u64)(a))[0]))
#define __GET_HIGH(__TYPE, a) ((__TYPE)(((v2u64)(a))[1]))
#endif
#define msa_get_low_s8(__a) __GET_LOW(v8i8, __a)
#define msa_get_low_s16(__a) __GET_LOW(v4i16, __a)
#define msa_get_low_s32(__a) __GET_LOW(v2i32, __a)
#define msa_get_low_s64(__a) __GET_LOW(v1i64, __a)
#define msa_get_low_u8(__a) __GET_LOW(v8u8, __a)
#define msa_get_low_u16(__a) __GET_LOW(v4u16, __a)
#define msa_get_low_u32(__a) __GET_LOW(v2u32, __a)
#define msa_get_low_u64(__a) __GET_LOW(v1u64, __a)
#define msa_get_low_f32(__a) __GET_LOW(v2f32, __a)
#define msa_get_low_f64(__a) __GET_LOW(v1f64, __a)

#define msa_get_high_s8(__a) __GET_HIGH(v8i8, __a)
#define msa_get_high_s16(__a) __GET_HIGH(v4i16, __a)
#define msa_get_high_s32(__a) __GET_HIGH(v2i32, __a)
#define msa_get_high_s64(__a) __GET_HIGH(v1i64, __a)
#define msa_get_high_u8(__a) __GET_HIGH(v8u8, __a)
#define msa_get_high_u16(__a) __GET_HIGH(v4u16, __a)
#define msa_get_high_u32(__a) __GET_HIGH(v2u32, __a)
#define msa_get_high_u64(__a) __GET_HIGH(v1u64, __a)
#define msa_get_high_f32(__a) __GET_HIGH(v2f32, __a)
#define msa_get_high_f64(__a) __GET_HIGH(v1f64, __a)

#define msa_mulq_lane_f32(__a, __b, __lane) ((__a) * msa_getq_lane_f32(__b, __lane))
#define msa_mlaq_lane_f32(__a, __b, __c, __lane) ((__a) + ((__b) * msa_getq_lane_f32(__c, __lane)))
#define msa_sum_u16(__a) \
({ \
  v4u32 _b = __builtin_msa_hadd_u_w(__a, __a); \
  v2u64 _c = __builtin_msa_hadd_u_d(_b, _b); \
  (uint16_t)(_c[0] + _c[1]); \
})

#define msa_sum_s16(__a) \
({ \
  v4i32 _b = __builtin_msa_hadd_s_w(__a, __a); \
  v2i64 _c = __builtin_msa_hadd_s_d(_b, _b); \
  (int16_t)(_c[0] + _c[1]); \
})

#define msa_sum_u32(__a) \
({ \
  v2u64 _b = __builtin_msa_hadd_u_d(__a, __a); \
  (uint32_t)(_b[0] + _b[1]); \
})

#define msa_sum_s32(__a) \
({ \
  v2i64 _b = __builtin_msa_hadd_s_d(__a, __a); \
  (int32_t)(_b[0] + _b[1]); \
})

#define msa_sum_u8(__a) \
({ \
  v8u16 _b16 = __builtin_msa_hadd_u_h(__a, __a); \
  v4u32 _c32 = __builtin_msa_hadd_u_w(_b16, _b16); \
  (uint8_t)msa_sum_u32(_c32); \
})

#define msa_sum_s8(__a) \
({ \
  v8i16 _b16 = __builtin_msa_hadd_s_h(__a, __a); \
  v4i32 _c32 = __builtin_msa_hadd_s_w(_b16, _b16); \
  (int8_t)msa_sum_s32(_c32); \
})

#define msa_sum_f32(__a) ((__a)[0] + (__a)[1] + (__a)[2] + (__a)[3])
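/* Usage sketch (illustrative): msa_sum reduces all lanes to one scalar via
 * pairwise widening adds (hadd), then truncates to the element type:
 *
 *   v8i16 v = msa_dupq_n_s16(3);
 *   int16_t s = msa_sum_s16(v);   // 8 * 3 == 24
 */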
#define msa_paddlq_u8(__a) (__builtin_msa_hadd_u_h(__a, __a))
#define msa_paddlq_s8(__a) (__builtin_msa_hadd_s_h(__a, __a))
#define msa_paddlq_u16(__a) (__builtin_msa_hadd_u_w(__a, __a))
#define msa_paddlq_s16(__a) (__builtin_msa_hadd_s_w(__a, __a))
#define msa_paddlq_u32(__a) (__builtin_msa_hadd_u_d(__a, __a))
#define msa_paddlq_s32(__a) (__builtin_msa_hadd_s_d(__a, __a))

#define V8U8_2_V8U16(x) {(uint16_t)x[0], (uint16_t)x[1], (uint16_t)x[2], (uint16_t)x[3], \
                         (uint16_t)x[4], (uint16_t)x[5], (uint16_t)x[6], (uint16_t)x[7]}
#define V8U8_2_V8I16(x) {(int16_t)x[0], (int16_t)x[1], (int16_t)x[2], (int16_t)x[3], \
                         (int16_t)x[4], (int16_t)x[5], (int16_t)x[6], (int16_t)x[7]}
#define V8I8_2_V8I16(x) {(int16_t)x[0], (int16_t)x[1], (int16_t)x[2], (int16_t)x[3], \
                         (int16_t)x[4], (int16_t)x[5], (int16_t)x[6], (int16_t)x[7]}
#define V4U16_2_V4U32(x) {(uint32_t)x[0], (uint32_t)x[1], (uint32_t)x[2], (uint32_t)x[3]}
#define V4U16_2_V4I32(x) {(int32_t)x[0], (int32_t)x[1], (int32_t)x[2], (int32_t)x[3]}
#define V4I16_2_V4I32(x) {(int32_t)x[0], (int32_t)x[1], (int32_t)x[2], (int32_t)x[3]}
#define V2U32_2_V2U64(x) {(uint64_t)x[0], (uint64_t)x[1]}
#define V2U32_2_V2I64(x) {(int64_t)x[0], (int64_t)x[1]}

#define msa_mull_u8(__a, __b) ((v8u16)__builtin_msa_mulv_h((v8i16)V8U8_2_V8I16(__a), (v8i16)V8U8_2_V8I16(__b)))
#define msa_mull_s8(__a, __b) (__builtin_msa_mulv_h((v8i16)V8I8_2_V8I16(__a), (v8i16)V8I8_2_V8I16(__b)))
#define msa_mull_u16(__a, __b) ((v4u32)__builtin_msa_mulv_w((v4i32)V4U16_2_V4I32(__a), (v4i32)V4U16_2_V4I32(__b)))
#define msa_mull_s16(__a, __b) (__builtin_msa_mulv_w((v4i32)V4I16_2_V4I32(__a), (v4i32)V4I16_2_V4I32(__b)))
#define msa_mull_u32(__a, __b) ((v2u64)__builtin_msa_mulv_d((v2i64)V2U32_2_V2I64(__a), (v2i64)V2U32_2_V2I64(__b)))
#define msa_andq_u8(__a, __b) ((v16u8)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
#define msa_andq_s8(__a, __b) ((v16i8)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
#define msa_andq_u16(__a, __b) ((v8u16)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
#define msa_andq_s16(__a, __b) ((v8i16)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
#define msa_andq_u32(__a, __b) ((v4u32)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
#define msa_andq_s32(__a, __b) ((v4i32)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
#define msa_andq_u64(__a, __b) ((v2u64)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
#define msa_andq_s64(__a, __b) ((v2i64)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))

#define msa_orrq_u8(__a, __b) ((v16u8)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
#define msa_orrq_s8(__a, __b) ((v16i8)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
#define msa_orrq_u16(__a, __b) ((v8u16)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
#define msa_orrq_s16(__a, __b) ((v8i16)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
#define msa_orrq_u32(__a, __b) ((v4u32)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
#define msa_orrq_s32(__a, __b) ((v4i32)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
#define msa_orrq_u64(__a, __b) ((v2u64)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
#define msa_orrq_s64(__a, __b) ((v2i64)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))

#define msa_eorq_u8(__a, __b) ((v16u8)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
#define msa_eorq_s8(__a, __b) ((v16i8)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
#define msa_eorq_u16(__a, __b) ((v8u16)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
#define msa_eorq_s16(__a, __b) ((v8i16)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
#define msa_eorq_u32(__a, __b) ((v4u32)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
#define msa_eorq_s32(__a, __b) ((v4i32)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
#define msa_eorq_u64(__a, __b) ((v2u64)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
#define msa_eorq_s64(__a, __b) ((v2i64)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))

#define msa_mvnq_u8(__a) ((v16u8)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
#define msa_mvnq_s8(__a) ((v16i8)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
#define msa_mvnq_u16(__a) ((v8u16)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
#define msa_mvnq_s16(__a) ((v8i16)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
#define msa_mvnq_u32(__a) ((v4u32)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
#define msa_mvnq_s32(__a) ((v4i32)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
#define msa_mvnq_u64(__a) ((v2u64)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
#define msa_mvnq_s64(__a) ((v2i64)__builtin_msa_xori_b((v16u8)(__a), 0xFF))

#define msa_ceqq_u8(__a, __b) ((v16u8)__builtin_msa_ceq_b((v16i8)(__a), (v16i8)(__b)))
#define msa_ceqq_s8(__a, __b) ((v16u8)__builtin_msa_ceq_b((v16i8)(__a), (v16i8)(__b)))
#define msa_ceqq_u16(__a, __b) ((v8u16)__builtin_msa_ceq_h((v8i16)(__a), (v8i16)(__b)))
#define msa_ceqq_s16(__a, __b) ((v8u16)__builtin_msa_ceq_h((v8i16)(__a), (v8i16)(__b)))
#define msa_ceqq_u32(__a, __b) ((v4u32)__builtin_msa_ceq_w((v4i32)(__a), (v4i32)(__b)))
#define msa_ceqq_s32(__a, __b) ((v4u32)__builtin_msa_ceq_w((v4i32)(__a), (v4i32)(__b)))
#define msa_ceqq_f32(__a, __b) ((v4u32)__builtin_msa_fceq_w((v4f32)(__a), (v4f32)(__b)))
#define msa_ceqq_u64(__a, __b) ((v2u64)__builtin_msa_ceq_d((v2i64)(__a), (v2i64)(__b)))
#define msa_ceqq_s64(__a, __b) ((v2u64)__builtin_msa_ceq_d((v2i64)(__a), (v2i64)(__b)))
#define msa_ceqq_f64(__a, __b) ((v2u64)__builtin_msa_fceq_d((v2f64)(__a), (v2f64)(__b)))

#define msa_cltq_u8(__a, __b) ((v16u8)__builtin_msa_clt_u_b((v16u8)(__a), (v16u8)(__b)))
#define msa_cltq_s8(__a, __b) ((v16u8)__builtin_msa_clt_s_b((v16i8)(__a), (v16i8)(__b)))
#define msa_cltq_u16(__a, __b) ((v8u16)__builtin_msa_clt_u_h((v8u16)(__a), (v8u16)(__b)))
#define msa_cltq_s16(__a, __b) ((v8u16)__builtin_msa_clt_s_h((v8i16)(__a), (v8i16)(__b)))
#define msa_cltq_u32(__a, __b) ((v4u32)__builtin_msa_clt_u_w((v4u32)(__a), (v4u32)(__b)))
#define msa_cltq_s32(__a, __b) ((v4u32)__builtin_msa_clt_s_w((v4i32)(__a), (v4i32)(__b)))
#define msa_cltq_f32(__a, __b) ((v4u32)__builtin_msa_fclt_w((v4f32)(__a), (v4f32)(__b)))
#define msa_cltq_u64(__a, __b) ((v2u64)__builtin_msa_clt_u_d((v2u64)(__a), (v2u64)(__b)))
#define msa_cltq_s64(__a, __b) ((v2u64)__builtin_msa_clt_s_d((v2i64)(__a), (v2i64)(__b)))
#define msa_cltq_f64(__a, __b) ((v2u64)__builtin_msa_fclt_d((v2f64)(__a), (v2f64)(__b)))

#define msa_cgtq_u8(__a, __b) ((v16u8)__builtin_msa_clt_u_b((v16u8)(__b), (v16u8)(__a)))
#define msa_cgtq_s8(__a, __b) ((v16u8)__builtin_msa_clt_s_b((v16i8)(__b), (v16i8)(__a)))
#define msa_cgtq_u16(__a, __b) ((v8u16)__builtin_msa_clt_u_h((v8u16)(__b), (v8u16)(__a)))
#define msa_cgtq_s16(__a, __b) ((v8u16)__builtin_msa_clt_s_h((v8i16)(__b), (v8i16)(__a)))
#define msa_cgtq_u32(__a, __b) ((v4u32)__builtin_msa_clt_u_w((v4u32)(__b), (v4u32)(__a)))
#define msa_cgtq_s32(__a, __b) ((v4u32)__builtin_msa_clt_s_w((v4i32)(__b), (v4i32)(__a)))
#define msa_cgtq_f32(__a, __b) ((v4u32)__builtin_msa_fclt_w((v4f32)(__b), (v4f32)(__a)))
#define msa_cgtq_u64(__a, __b) ((v2u64)__builtin_msa_clt_u_d((v2u64)(__b), (v2u64)(__a)))
#define msa_cgtq_s64(__a, __b) ((v2u64)__builtin_msa_clt_s_d((v2i64)(__b), (v2i64)(__a)))
#define msa_cgtq_f64(__a, __b) ((v2u64)__builtin_msa_fclt_d((v2f64)(__b), (v2f64)(__a)))

#define msa_cleq_u8(__a, __b) ((v16u8)__builtin_msa_cle_u_b((v16u8)(__a), (v16u8)(__b)))
#define msa_cleq_s8(__a, __b) ((v16u8)__builtin_msa_cle_s_b((v16i8)(__a), (v16i8)(__b)))
#define msa_cleq_u16(__a, __b) ((v8u16)__builtin_msa_cle_u_h((v8u16)(__a), (v8u16)(__b)))
#define msa_cleq_s16(__a, __b) ((v8u16)__builtin_msa_cle_s_h((v8i16)(__a), (v8i16)(__b)))
#define msa_cleq_u32(__a, __b) ((v4u32)__builtin_msa_cle_u_w((v4u32)(__a), (v4u32)(__b)))
#define msa_cleq_s32(__a, __b) ((v4u32)__builtin_msa_cle_s_w((v4i32)(__a), (v4i32)(__b)))
#define msa_cleq_f32(__a, __b) ((v4u32)__builtin_msa_fcle_w((v4f32)(__a), (v4f32)(__b)))
#define msa_cleq_u64(__a, __b) ((v2u64)__builtin_msa_cle_u_d((v2u64)(__a), (v2u64)(__b)))
#define msa_cleq_s64(__a, __b) ((v2u64)__builtin_msa_cle_s_d((v2i64)(__a), (v2i64)(__b)))
#define msa_cleq_f64(__a, __b) ((v2u64)__builtin_msa_fcle_d((v2f64)(__a), (v2f64)(__b)))

#define msa_cgeq_u8(__a, __b) ((v16u8)__builtin_msa_cle_u_b((v16u8)(__b), (v16u8)(__a)))
#define msa_cgeq_s8(__a, __b) ((v16u8)__builtin_msa_cle_s_b((v16i8)(__b), (v16i8)(__a)))
#define msa_cgeq_u16(__a, __b) ((v8u16)__builtin_msa_cle_u_h((v8u16)(__b), (v8u16)(__a)))
#define msa_cgeq_s16(__a, __b) ((v8u16)__builtin_msa_cle_s_h((v8i16)(__b), (v8i16)(__a)))
#define msa_cgeq_u32(__a, __b) ((v4u32)__builtin_msa_cle_u_w((v4u32)(__b), (v4u32)(__a)))
#define msa_cgeq_s32(__a, __b) ((v4u32)__builtin_msa_cle_s_w((v4i32)(__b), (v4i32)(__a)))
#define msa_cgeq_f32(__a, __b) ((v4u32)__builtin_msa_fcle_w((v4f32)(__b), (v4f32)(__a)))
#define msa_cgeq_u64(__a, __b) ((v2u64)__builtin_msa_cle_u_d((v2u64)(__b), (v2u64)(__a)))
#define msa_cgeq_s64(__a, __b) ((v2u64)__builtin_msa_cle_s_d((v2i64)(__b), (v2i64)(__a)))
#define msa_cgeq_f64(__a, __b) ((v2u64)__builtin_msa_fcle_d((v2f64)(__b), (v2f64)(__a)))
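/* Usage sketch (illustrative): as with NEON, comparisons yield an unsigned
 * mask vector, all bits set in lanes where the predicate holds:
 *
 *   v4i32 a = msa_dupq_n_s32(1), b = msa_dupq_n_s32(2);
 *   v4u32 m = msa_cltq_s32(a, b);   // every lane == 0xFFFFFFFF
 */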
#define msa_shlq_u8(__a, __b) ((v16u8)__builtin_msa_sll_b((v16i8)(__a), (v16i8)(__b)))
#define msa_shlq_s8(__a, __b) ((v16i8)__builtin_msa_sll_b((v16i8)(__a), (v16i8)(__b)))
#define msa_shlq_u16(__a, __b) ((v8u16)__builtin_msa_sll_h((v8i16)(__a), (v8i16)(__b)))
#define msa_shlq_s16(__a, __b) ((v8i16)__builtin_msa_sll_h((v8i16)(__a), (v8i16)(__b)))
#define msa_shlq_u32(__a, __b) ((v4u32)__builtin_msa_sll_w((v4i32)(__a), (v4i32)(__b)))
#define msa_shlq_s32(__a, __b) ((v4i32)__builtin_msa_sll_w((v4i32)(__a), (v4i32)(__b)))
#define msa_shlq_u64(__a, __b) ((v2u64)__builtin_msa_sll_d((v2i64)(__a), (v2i64)(__b)))
#define msa_shlq_s64(__a, __b) ((v2i64)__builtin_msa_sll_d((v2i64)(__a), (v2i64)(__b)))

#define msa_shlq_n_u8(__a, __imm) ((v16u8)__builtin_msa_slli_b((v16i8)(__a), __imm))
#define msa_shlq_n_s8(__a, __imm) ((v16i8)__builtin_msa_slli_b((v16i8)(__a), __imm))
#define msa_shlq_n_u16(__a, __imm) ((v8u16)__builtin_msa_slli_h((v8i16)(__a), __imm))
#define msa_shlq_n_s16(__a, __imm) ((v8i16)__builtin_msa_slli_h((v8i16)(__a), __imm))
#define msa_shlq_n_u32(__a, __imm) ((v4u32)__builtin_msa_slli_w((v4i32)(__a), __imm))
#define msa_shlq_n_s32(__a, __imm) ((v4i32)__builtin_msa_slli_w((v4i32)(__a), __imm))
#define msa_shlq_n_u64(__a, __imm) ((v2u64)__builtin_msa_slli_d((v2i64)(__a), __imm))
#define msa_shlq_n_s64(__a, __imm) ((v2i64)__builtin_msa_slli_d((v2i64)(__a), __imm))

#define msa_shrq_u8(__a, __b) ((v16u8)__builtin_msa_srl_b((v16i8)(__a), (v16i8)(__b)))
#define msa_shrq_s8(__a, __b) ((v16i8)__builtin_msa_sra_b((v16i8)(__a), (v16i8)(__b)))
#define msa_shrq_u16(__a, __b) ((v8u16)__builtin_msa_srl_h((v8i16)(__a), (v8i16)(__b)))
#define msa_shrq_s16(__a, __b) ((v8i16)__builtin_msa_sra_h((v8i16)(__a), (v8i16)(__b)))
#define msa_shrq_u32(__a, __b) ((v4u32)__builtin_msa_srl_w((v4i32)(__a), (v4i32)(__b)))
#define msa_shrq_s32(__a, __b) ((v4i32)__builtin_msa_sra_w((v4i32)(__a), (v4i32)(__b)))
#define msa_shrq_u64(__a, __b) ((v2u64)__builtin_msa_srl_d((v2i64)(__a), (v2i64)(__b)))
#define msa_shrq_s64(__a, __b) ((v2i64)__builtin_msa_sra_d((v2i64)(__a), (v2i64)(__b)))

#define msa_shrq_n_u8(__a, __imm) ((v16u8)__builtin_msa_srli_b((v16i8)(__a), __imm))
#define msa_shrq_n_s8(__a, __imm) ((v16i8)__builtin_msa_srai_b((v16i8)(__a), __imm))
#define msa_shrq_n_u16(__a, __imm) ((v8u16)__builtin_msa_srli_h((v8i16)(__a), __imm))
#define msa_shrq_n_s16(__a, __imm) ((v8i16)__builtin_msa_srai_h((v8i16)(__a), __imm))
#define msa_shrq_n_u32(__a, __imm) ((v4u32)__builtin_msa_srli_w((v4i32)(__a), __imm))
#define msa_shrq_n_s32(__a, __imm) ((v4i32)__builtin_msa_srai_w((v4i32)(__a), __imm))
#define msa_shrq_n_u64(__a, __imm) ((v2u64)__builtin_msa_srli_d((v2i64)(__a), __imm))
#define msa_shrq_n_s64(__a, __imm) ((v2i64)__builtin_msa_srai_d((v2i64)(__a), __imm))

#define msa_rshrq_n_u8(__a, __imm) ((v16u8)__builtin_msa_srlri_b((v16i8)(__a), __imm))
#define msa_rshrq_n_s8(__a, __imm) ((v16i8)__builtin_msa_srari_b((v16i8)(__a), __imm))
#define msa_rshrq_n_u16(__a, __imm) ((v8u16)__builtin_msa_srlri_h((v8i16)(__a), __imm))
#define msa_rshrq_n_s16(__a, __imm) ((v8i16)__builtin_msa_srari_h((v8i16)(__a), __imm))
#define msa_rshrq_n_u32(__a, __imm) ((v4u32)__builtin_msa_srlri_w((v4i32)(__a), __imm))
#define msa_rshrq_n_s32(__a, __imm) ((v4i32)__builtin_msa_srari_w((v4i32)(__a), __imm))
#define msa_rshrq_n_u64(__a, __imm) ((v2u64)__builtin_msa_srlri_d((v2i64)(__a), __imm))
#define msa_rshrq_n_s64(__a, __imm) ((v2i64)__builtin_msa_srari_d((v2i64)(__a), __imm))
#define msa_qrshrq_s32(a, b) ((v4i32)__builtin_msa_srar_w((v4i32)(a), (v4i32)(b)))
#define msa_qaddq_u8 __builtin_msa_adds_u_b
#define msa_qaddq_s8 __builtin_msa_adds_s_b
#define msa_qaddq_u16 __builtin_msa_adds_u_h
#define msa_qaddq_s16 __builtin_msa_adds_s_h
#define msa_qaddq_u32 __builtin_msa_adds_u_w
#define msa_qaddq_s32 __builtin_msa_adds_s_w
#define msa_qaddq_u64 __builtin_msa_adds_u_d
#define msa_qaddq_s64 __builtin_msa_adds_s_d
#define msa_addq_u8(a, b) ((v16u8)__builtin_msa_addv_b((v16i8)(a), (v16i8)(b)))
#define msa_addq_s8 __builtin_msa_addv_b
#define msa_addq_u16(a, b) ((v8u16)__builtin_msa_addv_h((v8i16)(a), (v8i16)(b)))
#define msa_addq_s16 __builtin_msa_addv_h
#define msa_addq_u32(a, b) ((v4u32)__builtin_msa_addv_w((v4i32)(a), (v4i32)(b)))
#define msa_addq_s32 __builtin_msa_addv_w
#define msa_addq_f32 __builtin_msa_fadd_w
#define msa_addq_u64(a, b) ((v2u64)__builtin_msa_addv_d((v2i64)(a), (v2i64)(b)))
#define msa_addq_s64 __builtin_msa_addv_d
#define msa_addq_f64 __builtin_msa_fadd_d
#define msa_qsubq_u8 __builtin_msa_subs_u_b
#define msa_qsubq_s8 __builtin_msa_subs_s_b
#define msa_qsubq_u16 __builtin_msa_subs_u_h
#define msa_qsubq_s16 __builtin_msa_subs_s_h
#define msa_subq_u8(a, b) ((v16u8)__builtin_msa_subv_b((v16i8)(a), (v16i8)(b)))
#define msa_subq_s8 __builtin_msa_subv_b
#define msa_subq_u16(a, b) ((v8u16)__builtin_msa_subv_h((v8i16)(a), (v8i16)(b)))
#define msa_subq_s16 __builtin_msa_subv_h
#define msa_subq_u32(a, b) ((v4u32)__builtin_msa_subv_w((v4i32)(a), (v4i32)(b)))
#define msa_subq_s32 __builtin_msa_subv_w
#define msa_subq_f32 __builtin_msa_fsub_w
#define msa_subq_u64(a, b) ((v2u64)__builtin_msa_subv_d((v2i64)(a), (v2i64)(b)))
#define msa_subq_s64 __builtin_msa_subv_d
#define msa_subq_f64 __builtin_msa_fsub_d
#define msa_mulq_u8(a, b) ((v16u8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b)))
#define msa_mulq_s8(a, b) ((v16i8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b)))
#define msa_mulq_u16(a, b) ((v8u16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b)))
#define msa_mulq_s16(a, b) ((v8i16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b)))
#define msa_mulq_u32(a, b) ((v4u32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b)))
#define msa_mulq_s32(a, b) ((v4i32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b)))
#define msa_mulq_u64(a, b) ((v2u64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b)))
#define msa_mulq_s64(a, b) ((v2i64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b)))
#define msa_mulq_f32 __builtin_msa_fmul_w
#define msa_mulq_f64 __builtin_msa_fmul_d
#define msa_divq_f32 __builtin_msa_fdiv_w
#define msa_divq_f64 __builtin_msa_fdiv_d
#define msa_dotp_s_h __builtin_msa_dotp_s_h
#define msa_dotp_s_w __builtin_msa_dotp_s_w
#define msa_dotp_s_d __builtin_msa_dotp_s_d
#define msa_dotp_u_h __builtin_msa_dotp_u_h
#define msa_dotp_u_w __builtin_msa_dotp_u_w
#define msa_dotp_u_d __builtin_msa_dotp_u_d
#define msa_dpadd_s_h __builtin_msa_dpadd_s_h
#define msa_dpadd_s_w __builtin_msa_dpadd_s_w
#define msa_dpadd_s_d __builtin_msa_dpadd_s_d
#define msa_dpadd_u_h __builtin_msa_dpadd_u_h
#define msa_dpadd_u_w __builtin_msa_dpadd_u_w
#define msa_dpadd_u_d __builtin_msa_dpadd_u_d
#define ILVRL_B2(RTYPE, in0, in1, low, hi) do { \
  low = (RTYPE)__builtin_msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
  hi = (RTYPE)__builtin_msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
} while (0)
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, low, hi) do { \
  low = (RTYPE)__builtin_msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
  hi = (RTYPE)__builtin_msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
} while (0)
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_UH(...) ILVRL_H2(v8u16, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, low, hi) do { \
  low = (RTYPE)__builtin_msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
  hi = (RTYPE)__builtin_msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
} while (0)
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
#define ILVRL_W2_UW(...) ILVRL_W2(v4u32, __VA_ARGS__)
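/* Usage sketch (illustrative): ILVRL_B2 computes both interleavings of two
 * vectors at once; note that ilvr/ilvl take lanes from in1 first:
 *
 *   v16i8 x = __builtin_msa_fill_b(1), y = __builtin_msa_fill_b(2);
 *   v16i8 lo, hi;
 *   ILVRL_B2_SB(x, y, lo, hi);   // lo == hi == {2,1,2,1,...}
 */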
#define msa_absq_s8(a) __builtin_msa_add_a_b(a, __builtin_msa_fill_b(0))
#define msa_absq_s16(a) __builtin_msa_add_a_h(a, __builtin_msa_fill_h(0))
#define msa_absq_s32(a) __builtin_msa_add_a_w(a, __builtin_msa_fill_w(0))
#define msa_absq_s64(a) __builtin_msa_add_a_d(a, __builtin_msa_fill_d(0))
#define msa_absq_f32(a) ((v4f32)__builtin_msa_bclri_w((v4u32)(a), 31))
#define msa_absq_f64(a) ((v2f64)__builtin_msa_bclri_d((v2u64)(a), 63))
#define msa_qabsq_s8(a) __builtin_msa_adds_a_b(a, __builtin_msa_fill_b(0))
#define msa_qabsq_s16(a) __builtin_msa_adds_a_h(a, __builtin_msa_fill_h(0))
#define msa_qabsq_s32(a) __builtin_msa_adds_a_w(a, __builtin_msa_fill_w(0))
#define msa_qabsq_s64(a) __builtin_msa_adds_a_d(a, __builtin_msa_fill_d(0))

#define msa_abdq_u8 __builtin_msa_asub_u_b
#define msa_abdq_s8 __builtin_msa_asub_s_b
#define msa_abdq_u16 __builtin_msa_asub_u_h
#define msa_abdq_s16 __builtin_msa_asub_s_h
#define msa_abdq_u32 __builtin_msa_asub_u_w
#define msa_abdq_s32 __builtin_msa_asub_s_w
#define msa_abdq_u64 __builtin_msa_asub_u_d
#define msa_abdq_s64 __builtin_msa_asub_s_d
#define msa_abdq_f32(a, b) msa_absq_f32(__builtin_msa_fsub_w(a, b))
#define msa_abdq_f64(a, b) msa_absq_f64(__builtin_msa_fsub_d(a, b))
#define msa_qabdq_s8(a, b) msa_qabsq_s8(__builtin_msa_subs_s_b(a, b))
#define msa_qabdq_s16(a, b) msa_qabsq_s16(__builtin_msa_subs_s_h(a, b))
#define msa_qabdq_s32(a, b) msa_qabsq_s32(__builtin_msa_subs_s_w(a, b))
#define msa_qabdq_s64(a, b) msa_qabsq_s64(__builtin_msa_subs_s_d(a, b))

#define msa_sqrtq_f32 __builtin_msa_fsqrt_w
#define msa_sqrtq_f64 __builtin_msa_fsqrt_d
#define msa_rsqrtq_f32 __builtin_msa_frsqrt_w
#define msa_rsqrtq_f64 __builtin_msa_frsqrt_d
/* Vector multiply-accumulate: returns __a + __b * __c. */
__extension__ extern __inline v4i32
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
msa_mlaq_s32(v4i32 __a, v4i32 __b, v4i32 __c)
{
  __asm__ volatile("maddv.w %w[__a], %w[__b], %w[__c]\n"
                   /* Output operands */
                   : [__a] "+f"(__a)
                   /* Input operands */
                   : [__b] "f"(__b), [__c] "f"(__c));
  return __a;
}

__extension__ extern __inline v2i64
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
msa_mlaq_s64(v2i64 __a, v2i64 __b, v2i64 __c)
{
  __asm__ volatile("maddv.d %w[__a], %w[__b], %w[__c]\n"
                   : [__a] "+f"(__a)
                   : [__b] "f"(__b), [__c] "f"(__c));
  return __a;
}

__extension__ extern __inline v4f32
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
msa_mlaq_f32(v4f32 __a, v4f32 __b, v4f32 __c)
{
  __asm__ volatile("fmadd.w %w[__a], %w[__b], %w[__c]\n"
                   : [__a] "+f"(__a)
                   : [__b] "f"(__b), [__c] "f"(__c));
  return __a;
}

__extension__ extern __inline v2f64
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
msa_mlaq_f64(v2f64 __a, v2f64 __b, v2f64 __c)
{
  __asm__ volatile("fmadd.d %w[__a], %w[__b], %w[__c]\n"
                   : [__a] "+f"(__a)
                   : [__b] "f"(__b), [__c] "f"(__c));
  return __a;
}
#define msa_cntq_s8 __builtin_msa_pcnt_b
#define msa_cntq_s16 __builtin_msa_pcnt_h
#define msa_cntq_s32 __builtin_msa_pcnt_w
#define msa_cntq_s64 __builtin_msa_pcnt_d

#define msa_bslq_u8 __builtin_msa_bsel_v

#define msa_ilvrq_s8 __builtin_msa_ilvr_b
#define msa_ilvrq_s16 __builtin_msa_ilvr_h
#define msa_ilvrq_s32 __builtin_msa_ilvr_w
#define msa_ilvrq_s64 __builtin_msa_ilvr_d
#define msa_ilvlq_s8 __builtin_msa_ilvl_b
#define msa_ilvlq_s16 __builtin_msa_ilvl_h
#define msa_ilvlq_s32 __builtin_msa_ilvl_w
#define msa_ilvlq_s64 __builtin_msa_ilvl_d

#define msa_ilvevq_s8 __builtin_msa_ilvev_b
#define msa_ilvevq_s16 __builtin_msa_ilvev_h
#define msa_ilvevq_s32 __builtin_msa_ilvev_w
#define msa_ilvevq_s64 __builtin_msa_ilvev_d
#define msa_ilvodq_s8 __builtin_msa_ilvod_b
#define msa_ilvodq_s16 __builtin_msa_ilvod_h
#define msa_ilvodq_s32 __builtin_msa_ilvod_w
#define msa_ilvodq_s64 __builtin_msa_ilvod_d
#ifdef _MIPSEB
#define msa_extq_s8(a, b, c) \
(__builtin_msa_vshf_b(__builtin_msa_subv_b((v16i8)((v2i64){0x1716151413121110, 0x1F1E1D1C1B1A1918}), __builtin_msa_fill_b(c)), a, b))
#define msa_extq_s16(a, b, c) \
(__builtin_msa_vshf_h(__builtin_msa_subv_h((v8i16)((v2i64){0x000B000A00090008, 0x000F000E000D000C}), __builtin_msa_fill_h(c)), a, b))
#define msa_extq_s32(a, b, c) \
(__builtin_msa_vshf_w(__builtin_msa_subv_w((v4i32)((v2i64){0x0000000500000004, 0x0000000700000006}), __builtin_msa_fill_w(c)), a, b))
#define msa_extq_s64(a, b, c) \
(__builtin_msa_vshf_d(__builtin_msa_subv_d((v2i64){0x0000000000000002, 0x0000000000000003}, __builtin_msa_fill_d(c)), a, b))
#else
#define msa_extq_s8(a, b, c) \
(__builtin_msa_vshf_b(__builtin_msa_addv_b((v16i8)((v2i64){0x0706050403020100, 0x0F0E0D0C0B0A0908}), __builtin_msa_fill_b(c)), b, a))
#define msa_extq_s16(a, b, c) \
(__builtin_msa_vshf_h(__builtin_msa_addv_h((v8i16)((v2i64){0x0003000200010000, 0x0007000600050004}), __builtin_msa_fill_h(c)), b, a))
#define msa_extq_s32(a, b, c) \
(__builtin_msa_vshf_w(__builtin_msa_addv_w((v4i32)((v2i64){0x0000000100000000, 0x0000000300000002}), __builtin_msa_fill_w(c)), b, a))
#define msa_extq_s64(a, b, c) \
(__builtin_msa_vshf_d(__builtin_msa_addv_d((v2i64){0x0000000000000000, 0x0000000000000001}, __builtin_msa_fill_d(c)), b, a))
#endif
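/* Usage sketch (illustrative): msa_extq mirrors NEON vextq, taking the top
 * lanes of the first vector followed by the bottom lanes of the second:
 *
 *   v4i32 a = {0, 1, 2, 3}, b = {4, 5, 6, 7};
 *   v4i32 v = msa_extq_s32(a, b, 1);   // {1, 2, 3, 4}
 */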
#define msa_cvttruncq_u32_f32 __builtin_msa_ftrunc_u_w
#define msa_cvttruncq_s32_f32 __builtin_msa_ftrunc_s_w
#define msa_cvttruncq_u64_f64 __builtin_msa_ftrunc_u_d
#define msa_cvttruncq_s64_f64 __builtin_msa_ftrunc_s_d
#define msa_cvttintq_u32_f32 __builtin_msa_ftint_u_w
#define msa_cvttintq_s32_f32 __builtin_msa_ftint_s_w
#define msa_cvttintq_u64_f64 __builtin_msa_ftint_u_d
#define msa_cvttintq_s64_f64 __builtin_msa_ftint_s_d
#define msa_cvtrintq_f32 __builtin_msa_frint_w
#define msa_cvtrintq_f64 __builtin_msa_frint_d

#define msa_cvtfintq_f32_u32 __builtin_msa_ffint_u_w
#define msa_cvtfintq_f32_s32 __builtin_msa_ffint_s_w
#define msa_cvtfintq_f64_u64 __builtin_msa_ffint_u_d
#define msa_cvtfintq_f64_s64 __builtin_msa_ffint_s_d
#define msa_cvtfq_f32_f64 __builtin_msa_fexdo_w
#define msa_cvtflq_f64_f32 __builtin_msa_fexupr_d
#define msa_cvtfhq_f64_f32 __builtin_msa_fexupl_d
#define msa_addl_u8(a, b) ((v8u16)__builtin_msa_addv_h((v8i16)V8U8_2_V8I16(a), (v8i16)V8U8_2_V8I16(b)))
#define msa_addl_s8(a, b) (__builtin_msa_addv_h((v8i16)V8I8_2_V8I16(a), (v8i16)V8I8_2_V8I16(b)))
#define msa_addl_u16(a, b) ((v4u32)__builtin_msa_addv_w((v4i32)V4U16_2_V4I32(a), (v4i32)V4U16_2_V4I32(b)))
#define msa_addl_s16(a, b) (__builtin_msa_addv_w((v4i32)V4I16_2_V4I32(a), (v4i32)V4I16_2_V4I32(b)))
#define msa_subl_s16(a, b) (__builtin_msa_subv_w((v4i32)V4I16_2_V4I32(a), (v4i32)V4I16_2_V4I32(b)))
#define msa_recpeq_f32 __builtin_msa_frcp_w
#define msa_recpsq_f32(a, b) (__builtin_msa_fsub_w(msa_dupq_n_f32(2.0f), __builtin_msa_fmul_w(a, b)))

/* ld2q/st2q: de-interleaving load / interleaving store of 2-channel data */
#define MSA_INTERLEAVED_IMPL_LOAD2_STORE2(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_ld2q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b) \
{ \
  _Tpv v0 = msa_ld1q_##suffix(ptr); \
  _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \
  *a = (_Tpv)__builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); \
  *b = (_Tpv)__builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); \
} \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_st2q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b) \
{ \
  msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df((_Tpvs)b, (_Tpvs)a)); \
  msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df((_Tpvs)b, (_Tpvs)a)); \
}

MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint8_t, v16u8, v16i8, u8, b, 16)
MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int8_t, v16i8, v16i8, s8, b, 16)
MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint16_t, v8u16, v8i16, u16, h, 8)
MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int16_t, v8i16, v8i16, s16, h, 8)
MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint32_t, v4u32, v4i32, u32, w, 4)
MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int32_t, v4i32, v4i32, s32, w, 4)
MSA_INTERLEAVED_IMPL_LOAD2_STORE2(float, v4f32, v4i32, f32, w, 4)
MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint64_t, v2u64, v2i64, u64, d, 2)
MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int64_t, v2i64, v2i64, s64, d, 2)
MSA_INTERLEAVED_IMPL_LOAD2_STORE2(double, v2f64, v2i64, f64, d, 2)
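
/* Usage sketch (illustrative, hypothetical buffers): split interleaved
 * complex data r0,i0,r1,i1,... into planar vectors and back:
 *
 *   v4f32 re, im;
 *   msa_ld2q_f32(src, &re, &im);   // de-interleave 8 floats
 *   msa_st2q_f32(dst, re, im);     // re-interleave into dst
 */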

/* ld3q: de-interleaving load of 3-channel (e.g. RGB) data */
#ifdef _MIPSEB
#define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
{ \
  _Tpv v0 = msa_ld1q_##suffix(ptr); \
  _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \
  _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \
  _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0704011F1F1F1F1F, 0x1F1C191613100D0A}), (_Tpvs)v0, (_Tpvs)v1); \
  *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150E0B080502, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0603001F1F1F1F1F, 0x1E1B1815120F0C09}), (_Tpvs)v0, (_Tpvs)v1); \
  *b = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150D0A070401, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x05021F1F1F1F1F1F, 0x1D1A1714110E0B08}), (_Tpvs)v0, (_Tpvs)v1); \
  *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x17160F0C09060300, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
}
#else
#define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
{ \
  _Tpv v0 = msa_ld1q_##suffix(ptr); \
  _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \
  _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \
  _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15120F0C09060300, 0x00000000001E1B18}), (_Tpvs)v1, (_Tpvs)v0); \
  *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1D1A1714110A0908}), (_Tpvs)v2, v3); \
  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1613100D0A070401, 0x00000000001F1C19}), (_Tpvs)v1, (_Tpvs)v0); \
  *b = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1E1B1815120A0908}), (_Tpvs)v2, v3); \
  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1714110E0B080502, 0x0000000000001D1A}), (_Tpvs)v1, (_Tpvs)v0); \
  *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1F1C191613100908}), (_Tpvs)v2, v3); \
}
#endif

MSA_INTERLEAVED_IMPL_LOAD3_8(uint8_t, v16u8, v16i8, u8)
MSA_INTERLEAVED_IMPL_LOAD3_8(int8_t, v16i8, v16i8, s8)

#ifdef _MIPSEB
#define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
{ \
  _Tpv v0 = msa_ld1q_##suffix(ptr); \
  _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \
  _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \
  _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00030000000F000F, 0x000F000C00090006}), (_Tpvs)v1, (_Tpvs)v0); \
  *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000A00050002, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0002000F000F000F, 0x000E000B00080005}), (_Tpvs)v1, (_Tpvs)v0); \
  *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000700040001, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000F000F000F, 0x000D000A00070004}), (_Tpvs)v1, (_Tpvs)v0); \
  *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000600030000, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
}
#else
#define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
{ \
  _Tpv v0 = msa_ld1q_##suffix(ptr); \
  _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \
  _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \
  _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0009000600030000, 0x00000000000F000C}), (_Tpvs)v1, (_Tpvs)v0); \
  *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000D000A00050004}), (_Tpvs)v2, v3); \
  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A000700040001, 0x000000000000000D}), (_Tpvs)v1, (_Tpvs)v0); \
  *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000E000B00080004}), (_Tpvs)v2, v3); \
  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000800050002, 0x000000000000000E}), (_Tpvs)v1, (_Tpvs)v0); \
  *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000F000C00090004}), (_Tpvs)v2, v3); \
}
#endif

MSA_INTERLEAVED_IMPL_LOAD3_16(uint16_t, v8u16, v8i16, u16)
MSA_INTERLEAVED_IMPL_LOAD3_16(int16_t, v8i16, v8i16, s16)

#define MSA_INTERLEAVED_IMPL_LOAD3_32(_Tp, _Tpv, _Tpvs, suffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
{ \
  _Tpv v00 = msa_ld1q_##suffix(ptr); \
  _Tpv v01 = msa_ld1q_##suffix(ptr + 4); \
  _Tpv v02 = msa_ld1q_##suffix(ptr + 8); \
  _Tpvs v10 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v01, (v2i64)v01), (_Tpvs)v00); \
  _Tpvs v11 = __builtin_msa_ilvr_w((_Tpvs)v02, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v00, (v2i64)v00)); \
  _Tpvs v12 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v02, (v2i64)v02), (_Tpvs)v01); \
  *a = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v11, (v2i64)v11), v10); \
  *b = (_Tpv)__builtin_msa_ilvr_w(v12, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v10, (v2i64)v10)); \
  *c = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v12, (v2i64)v12), v11); \
}

MSA_INTERLEAVED_IMPL_LOAD3_32(uint32_t, v4u32, v4i32, u32)
MSA_INTERLEAVED_IMPL_LOAD3_32(int32_t, v4i32, v4i32, s32)
MSA_INTERLEAVED_IMPL_LOAD3_32(float, v4f32, v4i32, f32)

#define MSA_INTERLEAVED_IMPL_LOAD3_64(_Tp, _Tpv, suffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
{ \
  *((_Tp*)a) = *ptr; *((_Tp*)b) = *(ptr + 1); *((_Tp*)c) = *(ptr + 2); \
  *((_Tp*)a + 1) = *(ptr + 3); *((_Tp*)b + 1) = *(ptr + 4); *((_Tp*)c + 1) = *(ptr + 5); \
}

MSA_INTERLEAVED_IMPL_LOAD3_64(uint64_t, v2u64, u64)
MSA_INTERLEAVED_IMPL_LOAD3_64(int64_t, v2i64, s64)
MSA_INTERLEAVED_IMPL_LOAD3_64(double, v2f64, f64)
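
/* Usage sketch (illustrative, hypothetical buffer): de-interleave 48 packed
 * RGB bytes into per-channel vectors:
 *
 *   v16u8 r, g, b;
 *   msa_ld3q_u8(rgb, &r, &g, &b);   // rgb holds R0,G0,B0,R1,G1,B1,...
 */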

/* st3q: interleaving store of 3-channel (e.g. RGB) data */
#ifdef _MIPSEB
#define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
{ \
  _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0F0E0D0C0B1F1F1F, 0x1F1E1D1C1B1A1F1F}), (_Tpvs)b, (_Tpvs)a); \
  _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0D1C140C1B130B1A, 0x1F170F1E160E1D15}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr, (_Tpv)v1); \
  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A09080706051F1F, 0x19181716151F1F1F}), (_Tpvs)b, (_Tpvs)a); \
  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1D14071C13061B12, 0x170A1F16091E1508}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x04030201001F1F1F, 0x14131211101F1F1F}), (_Tpvs)b, (_Tpvs)a); \
  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15021C14011B1300, 0x051F17041E16031D}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \
}
#else
#define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
{ \
  _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000050403020100, 0x0000001413121110}), (_Tpvs)b, (_Tpvs)a); \
  _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A02110901100800, 0x05140C04130B0312}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr, (_Tpv)v1); \
  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000A09080706, 0x00001A1918171615}), (_Tpvs)b, (_Tpvs)a); \
  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x170A011609001508, 0x0D04190C03180B02}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000F0E0D0C0B, 0x0000001F1E1D1C1B}), (_Tpvs)b, (_Tpvs)a); \
  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x021C09011B08001A, 0x1F0C041E0B031D0A}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \
}
#endif

MSA_INTERLEAVED_IMPL_STORE3_8(uint8_t, v16u8, v16i8, u8)
MSA_INTERLEAVED_IMPL_STORE3_8(int8_t, v16i8, v16i8, s8)

#ifdef _MIPSEB
#define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
{ \
  _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000700060005000F, 0x000F000E000D000F}), (_Tpvs)b, (_Tpvs)a); \
  _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A0006000D0009, 0x000F000B0007000E}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr, (_Tpv)v1); \
  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00040003000F000F, 0x000C000B000A000F}), (_Tpvs)b, (_Tpvs)a); \
  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000E000A0003000D, 0x0005000F000B0004}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000200010000000F, 0x00090008000F000F}), (_Tpvs)b, (_Tpvs)a); \
  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000E00090000, 0x000B0002000F000A}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
}
#else
#define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
{ \
  _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000200010000, 0x0000000A00090008}), (_Tpvs)b, (_Tpvs)a); \
  _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000800040000, 0x0006000200090005}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr, (_Tpv)v1); \
  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000500040003, 0x00000000000C000B}), (_Tpvs)b, (_Tpvs)a); \
  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B00040000000A, 0x0002000C00050001}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000000070006, 0x0000000F000E000D}), (_Tpvs)b, (_Tpvs)a); \
  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00050000000D0004, 0x000F00060001000E}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
}
#endif

MSA_INTERLEAVED_IMPL_STORE3_16(uint16_t, v8u16, v8i16, u16)
MSA_INTERLEAVED_IMPL_STORE3_16(int16_t, v8i16, v8i16, s16)

#ifdef _MIPSEB
#define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
{ \
  _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000007, 0x0000000700000006}), (_Tpvs)b, (_Tpvs)a); \
  _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000006, 0x0000000700000005}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr, (_Tpv)v1); \
  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000001, 0x0000000500000007}), (_Tpvs)b, (_Tpvs)a); \
  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000700000004, 0x0000000500000002}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \
  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000007, 0x0000000400000007}), (_Tpvs)b, (_Tpvs)a); \
  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000000, 0x0000000100000007}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
}
#else
#define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
{ \
  _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000100000000, 0x0000000000000004}), (_Tpvs)b, (_Tpvs)a); \
  _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000000, 0x0000000100000004}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr, (_Tpv)v1); \
  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000002, 0x0000000600000005}), (_Tpvs)b, (_Tpvs)a); \
  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000002, 0x0000000300000000}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \
  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000003, 0x0000000000000007}), (_Tpvs)b, (_Tpvs)a); \
  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000006, 0x0000000700000002}), (_Tpvs)c, (_Tpvs)v0); \
  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
}
#endif

MSA_INTERLEAVED_IMPL_STORE3_32(uint32_t, v4u32, v4i32, u32)
MSA_INTERLEAVED_IMPL_STORE3_32(int32_t, v4i32, v4i32, s32)
MSA_INTERLEAVED_IMPL_STORE3_32(float, v4f32, v4i32, f32)

#define MSA_INTERLEAVED_IMPL_STORE3_64(_Tp, _Tpv, suffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
{ \
  *ptr = a[0]; *(ptr + 1) = b[0]; *(ptr + 2) = c[0]; \
  *(ptr + 3) = a[1]; *(ptr + 4) = b[1]; *(ptr + 5) = c[1]; \
}

MSA_INTERLEAVED_IMPL_STORE3_64(uint64_t, v2u64, u64)
MSA_INTERLEAVED_IMPL_STORE3_64(int64_t, v2i64, s64)
MSA_INTERLEAVED_IMPL_STORE3_64(double, v2f64, f64)
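
/* Usage sketch (illustrative): the inverse of the msa_ld3q_u8 example above;
 * writes 48 bytes back as R0,G0,B0,R1,G1,B1,... from three channel vectors:
 *
 *   msa_st3q_u8(rgb, r, g, b);
 */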

/* ld4q/st4q: de-interleaving load / interleaving store of 4-channel data */
#define MSA_INTERLEAVED_IMPL_LOAD4_STORE4(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \
{ \
  _Tpv v0 = msa_ld1q_##suffix(ptr); \
  _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \
  _Tpv v2 = msa_ld1q_##suffix(ptr + nlanes * 2); \
  _Tpv v3 = msa_ld1q_##suffix(ptr + nlanes * 3); \
  _Tpvs t0 = __builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); \
  _Tpvs t1 = __builtin_msa_pckev_##df((_Tpvs)v3, (_Tpvs)v2); \
  _Tpvs t2 = __builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); \
  _Tpvs t3 = __builtin_msa_pckod_##df((_Tpvs)v3, (_Tpvs)v2); \
  *a = (_Tpv)__builtin_msa_pckev_##df(t1, t0); \
  *b = (_Tpv)__builtin_msa_pckev_##df(t3, t2); \
  *c = (_Tpv)__builtin_msa_pckod_##df(t1, t0); \
  *d = (_Tpv)__builtin_msa_pckod_##df(t3, t2); \
} \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \
{ \
  _Tpvs v0 = __builtin_msa_ilvr_##df((_Tpvs)c, (_Tpvs)a); \
  _Tpvs v1 = __builtin_msa_ilvr_##df((_Tpvs)d, (_Tpvs)b); \
  _Tpvs v2 = __builtin_msa_ilvl_##df((_Tpvs)c, (_Tpvs)a); \
  _Tpvs v3 = __builtin_msa_ilvl_##df((_Tpvs)d, (_Tpvs)b); \
  msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df(v1, v0)); \
  msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df(v1, v0)); \
  msa_st1q_##suffix(ptr + 2 * nlanes, (_Tpv)__builtin_msa_ilvr_##df(v3, v2)); \
  msa_st1q_##suffix(ptr + 3 * nlanes, (_Tpv)__builtin_msa_ilvl_##df(v3, v2)); \
}

MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint8_t, v16u8, v16i8, u8, b, 16)
MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int8_t, v16i8, v16i8, s8, b, 16)
MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint16_t, v8u16, v8i16, u16, h, 8)
MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int16_t, v8i16, v8i16, s16, h, 8)
MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint32_t, v4u32, v4i32, u32, w, 4)
MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int32_t, v4i32, v4i32, s32, w, 4)
MSA_INTERLEAVED_IMPL_LOAD4_STORE4(float, v4f32, v4i32, f32, w, 4)
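
/* Usage sketch (illustrative, hypothetical buffer): de-interleave RGBA
 * pixels into channel planes and store them back:
 *
 *   v16u8 r, g, b, a;
 *   msa_ld4q_u8(rgba, &r, &g, &b, &a);
 *   msa_st4q_u8(rgba, r, g, b, a);
 */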

#define MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(_Tp, _Tpv, _Tpvs, suffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \
{ \
  _Tpv v0 = msa_ld1q_##suffix(ptr); \
  _Tpv v1 = msa_ld1q_##suffix(ptr + 2); \
  _Tpv v2 = msa_ld1q_##suffix(ptr + 4); \
  _Tpv v3 = msa_ld1q_##suffix(ptr + 6); \
  *a = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v2, (_Tpvs)v0); \
  *b = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v2, (_Tpvs)v0); \
  *c = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v3, (_Tpvs)v1); \
  *d = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v3, (_Tpvs)v1); \
} \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \
{ \
  msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)b, (_Tpvs)a)); \
  msa_st1q_##suffix(ptr + 2, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)d, (_Tpvs)c)); \
  msa_st1q_##suffix(ptr + 4, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)b, (_Tpvs)a)); \
  msa_st1q_##suffix(ptr + 6, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)d, (_Tpvs)c)); \
}

MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(uint64_t, v2u64, v2i64, u64)
MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(int64_t, v2i64, v2i64, s64)
MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(double, v2f64, v2i64, f64)
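
/* qdmulhq_n: doubling multiply of each lane by a scalar, keeping the high
 * 16 bits, as in NEON vqdmulhq_n_s16: result[i] = (2 * a[i] * b) >> 16.
 * Usage sketch (illustrative): scale samples by 0.5 in Q15 fixed point:
 *
 *   v8i16 half = msa_qdmulhq_n_s16(samples, 16384);  // 16384 == 0.5 in Q15
 */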
__extension__ extern __inline v8i16
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
msa_qdmulhq_n_s16(v8i16 a, int16_t b)
{
  v8i16 a_lo, a_hi;
  ILVRL_H2_SH(a, msa_dupq_n_s16(0), a_lo, a_hi);
  return msa_packr_s32(msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_dupq_n_s32(b)), 1),
                       msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_hi), msa_dupq_n_s32(b)), 1), 16);