#ifndef OPENCV_CORE_SSE_UTILS_HPP
#define OPENCV_CORE_SSE_UTILS_HPP

#ifndef __cplusplus
#  error sse_utils.hpp header must be compiled as C++
#endif

#include "opencv2/core/cvdef.h"

//! @addtogroup core_utils_sse
//! @{

#if CV_SSE2

// Transposes 2-channel 8-bit data from interleaved to planar layout. On input
// the four registers hold 64 consecutive interleaved bytes in argument order;
// on output v_r0:v_r1 hold the 32 first-channel bytes and v_g0:v_g1 the second.
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
}

// Three-channel variant: the six inputs hold six consecutive 16-byte chunks of
// the interleaved stream in argument order; the r/g/b names describe the
// planar outputs, not the inputs.
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
}

inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
    __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
    v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
    v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
}
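
// Usage sketch (illustrative only; the helper name and parameters are
// hypothetical, not part of this header). Deinterleaving 32 BGR pixels:
// load six consecutive 16-byte chunks into the arguments in order, call the
// three-channel overload above, then store each recovered plane.
inline void sse_demo_deinterleave_bgr32(const unsigned char * interleaved,
                                        unsigned char * b, unsigned char * g, unsigned char * r)
{
    __m128i v_b0 = _mm_loadu_si128((const __m128i *)(interleaved));      // bytes  0..15
    __m128i v_b1 = _mm_loadu_si128((const __m128i *)(interleaved + 16)); // bytes 16..31
    __m128i v_g0 = _mm_loadu_si128((const __m128i *)(interleaved + 32));
    __m128i v_g1 = _mm_loadu_si128((const __m128i *)(interleaved + 48));
    __m128i v_r0 = _mm_loadu_si128((const __m128i *)(interleaved + 64));
    __m128i v_r1 = _mm_loadu_si128((const __m128i *)(interleaved + 80));

    _mm_deinterleave_epi8(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);

    _mm_storeu_si128((__m128i *)(b), v_b0);       // 32 blue samples
    _mm_storeu_si128((__m128i *)(b + 16), v_b1);
    _mm_storeu_si128((__m128i *)(g), v_g0);       // 32 green samples
    _mm_storeu_si128((__m128i *)(g + 16), v_g1);
    _mm_storeu_si128((__m128i *)(r), v_r0);       // 32 red samples
    _mm_storeu_si128((__m128i *)(r + 16), v_r1);
}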

// Interleave 2-channel 8-bit data (exact inverse of the two-register
// _mm_deinterleave_epi8 above). Every 16-bit lane is first reduced to 0..255
// by the mask or the shift, so the unsigned saturation of _mm_packus_epi16
// never alters a value.
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
}

inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
}

inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
    v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
}
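
// Usage sketch (hypothetical helper, not part of this header): merging two
// 32-byte planes into one 64-byte interleaved buffer with the two-channel
// overload above. Each interleave function is the exact inverse of its
// deinterleave counterpart.
inline void sse_demo_interleave_2ch8(const unsigned char * c0, const unsigned char * c1,
                                     unsigned char * interleaved)
{
    __m128i v_c00 = _mm_loadu_si128((const __m128i *)(c0));
    __m128i v_c01 = _mm_loadu_si128((const __m128i *)(c0 + 16));
    __m128i v_c10 = _mm_loadu_si128((const __m128i *)(c1));
    __m128i v_c11 = _mm_loadu_si128((const __m128i *)(c1 + 16));

    _mm_interleave_epi8(v_c00, v_c01, v_c10, v_c11);

    // The registers now hold c0[0] c1[0] c0[1] c1[1] ... in argument order.
    _mm_storeu_si128((__m128i *)(interleaved), v_c00);
    _mm_storeu_si128((__m128i *)(interleaved + 16), v_c01);
    _mm_storeu_si128((__m128i *)(interleaved + 32), v_c10);
    _mm_storeu_si128((__m128i *)(interleaved + 48), v_c11);
}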

// 16-bit analogues of the byte routines above: same consecutive-chunk
// convention, eight 16-bit lanes per register, one unpack layer fewer.
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
}

inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
}

inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
    v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
    v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
}
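
// Usage sketch (hypothetical helper, not part of this header): splitting 16
// interleaved 16-bit pairs, e.g. a CbCr block, into two planar runs.
inline void sse_demo_deinterleave_2ch16(const unsigned short * interleaved,
                                        unsigned short * c0, unsigned short * c1)
{
    __m128i v_c00 = _mm_loadu_si128((const __m128i *)(interleaved));      // shorts  0..7
    __m128i v_c01 = _mm_loadu_si128((const __m128i *)(interleaved + 8));  // shorts  8..15
    __m128i v_c10 = _mm_loadu_si128((const __m128i *)(interleaved + 16));
    __m128i v_c11 = _mm_loadu_si128((const __m128i *)(interleaved + 24));

    _mm_deinterleave_epi16(v_c00, v_c01, v_c10, v_c11);

    _mm_storeu_si128((__m128i *)(c0), v_c00);     // 16 values of channel 0
    _mm_storeu_si128((__m128i *)(c0 + 8), v_c01);
    _mm_storeu_si128((__m128i *)(c1), v_c10);     // 16 values of channel 1
    _mm_storeu_si128((__m128i *)(c1 + 8), v_c11);
}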

#if CV_SSE4_1

// The 16-bit interleave routines need _mm_packus_epi32, an SSE4.1 instruction,
// hence the separate guard.
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
}

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
}

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
    v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
}
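
// Usage sketch (hypothetical helper). The 16-bit interleave overloads rely on
// _mm_packus_epi32, an SSE4.1 instruction, which is why this block is assumed
// to sit under the CV_SSE4_1 guard. Each 32-bit lane is masked or shifted into
// 0..65535 first, so the unsigned saturation of the pack never alters a value.
inline void sse_demo_interleave_2ch16(const unsigned short * c0, const unsigned short * c1,
                                      unsigned short * interleaved)
{
    __m128i v_c00 = _mm_loadu_si128((const __m128i *)(c0));
    __m128i v_c01 = _mm_loadu_si128((const __m128i *)(c0 + 8));
    __m128i v_c10 = _mm_loadu_si128((const __m128i *)(c1));
    __m128i v_c11 = _mm_loadu_si128((const __m128i *)(c1 + 8));

    _mm_interleave_epi16(v_c00, v_c01, v_c10, v_c11);

    // The registers now hold c0[0] c1[0] c0[1] c1[1] ... in argument order.
    _mm_storeu_si128((__m128i *)(interleaved), v_c00);
    _mm_storeu_si128((__m128i *)(interleaved + 8), v_c01);
    _mm_storeu_si128((__m128i *)(interleaved + 16), v_c10);
    _mm_storeu_si128((__m128i *)(interleaved + 24), v_c11);
}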

#endif // CV_SSE4_1

// Single-precision variants: same consecutive-chunk convention, four floats
// per register, built on _mm_unpacklo_ps/_mm_unpackhi_ps.
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
}

inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                                __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
}

inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                                __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
    __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
    __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
    __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
    v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
    v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
}
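
// Usage sketch (hypothetical helper, not part of this header): unpacking 8
// interleaved XYZ float triplets into three planar arrays with the
// three-channel overload above.
inline void sse_demo_deinterleave_xyz8(const float * interleaved,
                                       float * x, float * y, float * z)
{
    __m128 v_x0 = _mm_loadu_ps(interleaved);      // x0 y0 z0 x1
    __m128 v_x1 = _mm_loadu_ps(interleaved + 4);  // y1 z1 x2 y2
    __m128 v_y0 = _mm_loadu_ps(interleaved + 8);
    __m128 v_y1 = _mm_loadu_ps(interleaved + 12);
    __m128 v_z0 = _mm_loadu_ps(interleaved + 16);
    __m128 v_z1 = _mm_loadu_ps(interleaved + 20);

    _mm_deinterleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1);

    _mm_storeu_ps(x, v_x0);
    _mm_storeu_ps(x + 4, v_x1);
    _mm_storeu_ps(y, v_y0);
    _mm_storeu_ps(y + 4, v_y1);
    _mm_storeu_ps(z, v_z0);
    _mm_storeu_ps(z + 4, v_z1);
}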

// Float interleave via _mm_shuffle_ps: mask_lo gathers the even-indexed lanes
// of its two sources, mask_hi the odd-indexed ones.
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
}

inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                              __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
}
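
// Usage sketch (hypothetical helper). The float interleave overloads are built
// on _mm_shuffle_ps selectors rather than pack instructions. Merging 8 real
// and 8 imaginary values into interleaved complex pairs with the two-channel
// overload above:
inline void sse_demo_interleave_complex8(const float * re, const float * im,
                                         float * interleaved)
{
    __m128 v_re0 = _mm_loadu_ps(re);
    __m128 v_re1 = _mm_loadu_ps(re + 4);
    __m128 v_im0 = _mm_loadu_ps(im);
    __m128 v_im1 = _mm_loadu_ps(im + 4);

    _mm_interleave_ps(v_re0, v_re1, v_im0, v_im1);

    // The registers now hold re[0] im[0] re[1] im[1] ... in argument order.
    _mm_storeu_ps(interleaved, v_re0);
    _mm_storeu_ps(interleaved + 4, v_re1);
    _mm_storeu_ps(interleaved + 8, v_im0);
    _mm_storeu_ps(interleaved + 12, v_im1);
}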

inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                              __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
    __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
    __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
    v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
    v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
}

//! @}

#endif // CV_SSE2

#endif // OPENCV_CORE_SSE_UTILS_HPP