Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_sse_h
2 : #error "Do not include this directly; use fd_sse.h"
3 : #endif
4 :
5 : /* Vector float API ***************************************************/
6 :
7 : /* A vf_t is a vector where each 32-bit wide lane holds a single
8 : precision IEEE 754 floating point value (a "float").
9 :
10 : All operations assume that input values aren't exotic (no NaNs, no
11 : +/-Infs, no denorms) and, if an operation would produce an exotic
12 : value under the IEEE 754 standard, the results of that operation are
13 : undefined. Additionally, correct handling of signed zero is not
14 : guaranteed. Lastly, these will not raise floating point exceptions or
15 : set math errno's.
16 :
17 : Basically, handling of exotics and signed zero will generally be
18 : reasonable but most of that relies on the underlying compiler and
19 : hardware having conformant behavior and this is flaky at the best of
20 : times. So it is best for developers not to assume conformant
21 : behavior.
22 :
23 : These mirror the other APIs as much as possible. Macros are
24 : preferred over static inlines when it is possible to do it robustly
25 : to reduce the risk of the compiler mucking it up. */
26 :
27 18219059 : #define vf_t __m128
28 :
29 : /* Constructors */
30 :
31 : /* Given the float values, return ... */
32 :
33 1179648 : #define vf(f0,f1,f2,f3) _mm_setr_ps( (f0), (f1), (f2), (f3) ) /* [ f0 f1 f2 f3 ] */
34 :
35 262144 : #define vf_bcast(f0) _mm_set1_ps( (f0) ) /* [ f0 f0 f0 f0 ] */
36 :
37 : static inline vf_t /* [ f0 f1 f0 f1 ] */
38 196608 : vf_bcast_pair( float f0, float f1 ) {
39 196608 : return _mm_setr_ps( f0, f1, f0, f1 );
40 196608 : }
41 :
42 : static inline vf_t /* [ f0 f0 f1 f1 ] */
43 196608 : vf_bcast_wide( float f0, float f1 ) {
44 196608 : return _mm_setr_ps( f0, f0, f1, f1 );
45 196608 : }
46 :
47 : /* vf_permute returns [ f(imm_i0) f(imm_i1) f(imm_i2) f(imm_i3) ].
48 : imm_i* should be compile time constants in 0:3. */
49 :
50 : #define vf_permute(f,imm_i0,imm_i1,imm_i2,imm_i3) _mm_permute_ps( (f), _MM_SHUFFLE( (imm_i3), (imm_i2), (imm_i1), (imm_i0) ) )
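/* Example (illustrative sketch; the vf_example_* helpers shown in this
   file's examples are not part of the API): building a vector with vf
   and reversing its lanes with vf_permute. */

static inline vf_t
vf_example_reverse( void ) {
  vf_t x = vf( 1.f, 2.f, 3.f, 4.f ); /* [ 1 2 3 4 ] */
  return vf_permute( x, 3,2,1,0 );   /* [ 4 3 2 1 ] */
}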
51 :
52 : /* Predefined constants */
53 :
54 : #define vf_zero() _mm_setzero_ps() /* Return [ 0.f 0.f 0.f 0.f ] */
55 9830451 : #define vf_one() _mm_set1_ps( 1.f ) /* Return [ 1.f 1.f 1.f 1.f ] */
56 :
57 : /* Memory operations */
58 :
59 : /* vf_ld returns the 4 floats at the 16-byte aligned / 16-byte sized
60 : location p as a vector float. vf_ldu is the same but p does not have
61 : to be aligned. vf_st writes the vector float to the 16-byte aligned
62 : / 16-byte sized location p as 4 floats. vf_stu is the same but p
63 : does not have to be aligned. In all of these, lane l will be at p[l].
64 : FIXME: USE ATTRIBUTES ON P PASSED TO THESE? */
65 :
66 9830451 : #define vf_ld(p) _mm_load_ps( (p) )
67 39321804 : #define vf_ldu(p) _mm_loadu_ps( (p) )
68 9830451 : #define vf_st(p,x) _mm_store_ps( (p), (x) )
69 39321804 : #define vf_stu(p,x) _mm_storeu_ps( (p), (x) )
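/* Example (illustrative sketch, not part of the API): copying 4 floats
   from an arbitrarily aligned source to a 16-byte aligned destination
   using the unaligned load and aligned store variants. */

static inline void
vf_example_copy4( float       * dst,    /* 16-byte aligned */
                  float const * src ) { /* arbitrary alignment */
  vf_st( dst, vf_ldu( src ) );          /* dst[l] = src[l] for l in 0:3 */
}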
70 :
71 : /* vf_ldif is an optimized equivalent to vf_notczero(c,vf_ldu(p)) (may
72 : have different behavior if c is not a proper vector conditional). It
73 : is provided for symmetry with the vf_stif operation. vf_stif stores
74 : x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
75 : Undefined behavior if c is not a proper vector conditional. */
76 :
77 : #define vf_ldif(c,p) _mm_maskload_ps( (p),(c))
78 : #define vf_stif(c,p,x) _mm_maskstore_ps((p),(c),(x))
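/* Example (illustrative sketch, not part of the API): storing only the
   lanes selected by a vector conditional.  vc_t is assumed to already
   be declared at this point (it is used elsewhere in this file). */

static inline void
vf_example_store_where( vc_t c, float * p, vf_t x ) {
  vf_stif( c, p, x ); /* p[l] = x(l) where c(l) is true, p[l] unchanged o.w. */
}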
79 :
80 : /* Element operations */
81 :
82 : /* vf_extract extracts the float in lane imm from the vector float
83 : as a float. vf_insert returns the vector float formed by replacing
84 : the value in lane imm of a with the provided float. imm should be a
85 : compile time constant in 0:3. vf_extract_variable and
86 : vf_insert_variable are slower, but the lane n does not have to be
87 : known at compile time (n should still be in 0:3). */
88 :
89 : /* FIXME: ARE THESE BETTER IMPLEMENTED VIA BOUNCING OF THE STACK? (IT
90 : SEEMS PRETTY CLEAR THAT INTEL DIDN'T INTEND THIS TO BE POSSIBLE) */
91 :
92 39321804 : #define vf_extract(a,imm) _mm_cvtss_f32( _mm_permute_ps( (a), _MM_SHUFFLE(3,2,1,(imm)) ) )
93 :
94 : #define vf_insert(a,imm,v) \
95 40108236 : _mm_castsi128_ps( _mm_insert_epi32( _mm_castps_si128( (a) ), \
96 40108236 : _mm_extract_epi32( _mm_castps_si128( _mm_set_ss( (v) ) ), 0 ), (imm) ) )
97 :
98 : static inline float
99 39321804 : vf_extract_variable( vf_t a, int n ) {
100 39321804 : float f[4] V_ATTR;
101 39321804 : _mm_store_ps( f, a );
102 39321804 : return f[n];
103 39321804 : }
104 :
105 : static inline vf_t
106 39321804 : vf_insert_variable( vf_t a, int n, float v ) {
107 39321804 : float f[4] V_ATTR;
108 39321804 : _mm_store_ps( f, a );
109 39321804 : f[n] = v;
110 39321804 : return _mm_load_ps( f );
111 39321804 : }
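/* Example (illustrative sketch, not part of the API): swapping lanes 0
   and 3 with the compile time extract / insert variants, then reading a
   lane whose index n is only known at run time (n in 0:3). */

static inline float
vf_example_extract_insert( vf_t a, int n ) {
  float a0 = vf_extract( a, 0 );    /* lane 0 of a */
  float a3 = vf_extract( a, 3 );    /* lane 3 of a */
  vf_t  b  = vf_insert( a, 0, a3 ); /* a with lane 0 <- a3 ... */
  /**/  b  = vf_insert( b, 3, a0 ); /* ... and lane 3 <- a0 */
  return vf_extract_variable( b, n );
}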
112 :
113 : /* Given [a0 a1 a2 a3], [b0 b1 b2 b3] and/or [c0 c1 c2 c3], return ... */
114 :
115 : /* Arithmetic operations */
116 :
117 : /* vf_neg(a) returns [ -a0 -a1 ... -a3 ] (i.e. -a )
118 : vf_sign(a) returns [ signf(a0) signf(a1) ... signf(a3) ]
119 : vf_abs(a) returns [ fabsf(a0) fabsf(a1) ... fabsf(a3) ] (i.e. abs(a))
120 : vf_negabs(a) returns [ -fabsf(a0) -fabsf(a1) ... -fabsf(a3) ] (i.e. -abs(a))
121 : vf_ceil(a) returns [ ceilf(a0) ceilf(a1) ... ceilf(a3) ] (i.e. ceil(a))
122 : vf_floor(a) returns [ floorf(a0) floorf(a1) ... floorf(a3) ] (i.e. floor(a))
123 : vf_rint(a) returns [ rintf(a0) rintf(a1) ... rintf(a3) ] (i.e. roundb(a))
124 : vf_trunc(a) returns [ truncf(a0) truncf(a1) ... truncf(a3) ] (i.e. fix(a))
125 : vf_sqrt(a) returns [ sqrtf(a0) sqrtf(a1) ... sqrtf(a3) ] (i.e. sqrt(a))
126 : vf_rcp_fast(a) returns [ ~rcpf(a0) ~rcpf(a1) ... ~rcpf(a3) ]
127 : vf_rsqrt_fast(a) returns [ ~rsqrtf(a0) ~rsqrtf(a1) ... ~rsqrtf(a3) ]
128 :
129 : vf_add(a,b) returns [ a0+b0 a1+b1 ... a3+b3 ] (i.e. a +b)
130 : vf_sub(a,b) returns [ a0-b0 a1-b1 ... a3-b3 ] (i.e. a -b)
131 : vf_mul(a,b) returns [ a0*b0 a1*b1 ... a3*b3 ] (i.e. a.*b)
132 : vf_div(a,b) returns [ a0/b0 a1/b1 ... a3/b3 ] (i.e. a./b)
133 : vf_min(a,b) returns [ fminf(a0,b0) fminf(a1,b1) ... fminf(a3,b3) ] (i.e. min([a;b]) where a and b are 1x4)
134 : vf_max(a,b) returns [ fmaxf(a0,b0) fmaxf(a1,b1) ... fmaxf(a3,b3) ] (i.e. max([a;b]) where a and b are 1x4)
135 : vf_copysign(a,b) returns [ copysignf(a0,b0) copysignf(a1,b1) ... copysignf(a3,b3) ]
136 : vf_flipsign(a,b) returns [ flipsignf(a0,b0) flipsignf(a1,b1) ... flipsignf(a3,b3) ]
137 :
138 : vf_fma(a,b,c) returns [ fmaf(a0,b0, c0) fmaf(a1,b1, c1) ... fmaf(a3,b3, c3) ] (i.e. a.*b+c)
139 : vf_fms(a,b,c) returns [ fmaf(a0,b0,-c0) fmaf(a1,b1,-c1) ... fmaf(a3,b3,-c3) ] (i.e. a.*b-c)
140 : vf_fnma(a,b,c) returns [ -fmaf(a0,b0,-c0) -fmaf(a1,b1,-c1) ... -fmaf(a3,b3,-c3) ] (i.e. -a.*b+c)
141 :
142 : where sign(a) is -1. if a's sign bit is set and +1. otherwise, rcp(a)
143 : is 1./a and rsqrt(a) is 1./sqrt(a), and flipsign(a,b) returns -a if
144 : b's sign bit is set and a otherwise.
145 :
146 : rint is in round-to-nearest-even rounding mode (note rint and
147 : nearbyint are identical once floating point exceptions are ignored).
148 :
149 : sqrt should typically be full accuracy.
150 :
151 : rcp_fast and rsqrt_fast should typically be accurate to ~12 or more
152 : bits (~3 or more decimal digits) such that (nearly) full accuracy
153 : can be achieved with two to three rounds of Newton-Raphson polishing.
154 : Bit level replicable code should avoid rcp_fast and rsqrt_fast though
155 : as the approximations used can vary across generations / steppings /
156 : microcode updates of x86 processors (including Intel and
157 : AMD). */
158 :
159 : #define vf_neg(a) _mm_xor_ps( _mm_set1_ps( -0.f ), (a) )
160 : #define vf_sign(a) _mm_xor_ps( _mm_set1_ps( 1.f ), _mm_and_ps( _mm_set1_ps( -0.f ), (a) ) )
161 : #define vf_abs(a) _mm_andnot_ps( _mm_set1_ps( -0.f ), (a) )
162 : #define vf_negabs(a) _mm_or_ps( _mm_set1_ps( -0.f ), (a) )
163 : #define vf_ceil(a) _mm_ceil_ps( (a) )
164 : #define vf_floor(a) _mm_floor_ps( (a) )
165 : #define vf_rint(a) _mm_round_ps( (a), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC )
166 : #define vf_trunc(a) _mm_round_ps( (a), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC )
167 : #define vf_sqrt(a) _mm_sqrt_ps( (a) )
168 : #define vf_rcp_fast(a) _mm_rcp_ps( (a) )
169 : #define vf_rsqrt_fast(a) _mm_rsqrt_ps( (a) )
170 :
171 : #define vf_add(a,b) _mm_add_ps( (a), (b) )
172 262144 : #define vf_sub(a,b) _mm_sub_ps( (a), (b) )
173 : #define vf_mul(a,b) _mm_mul_ps( (a), (b) )
174 : #define vf_div(a,b) _mm_div_ps( (a), (b) )
175 : #define vf_min(a,b) _mm_min_ps( (a), (b) )
176 : #define vf_max(a,b) _mm_max_ps( (a), (b) )
177 : #define vf_copysign(a,b) _mm_or_ps( _mm_andnot_ps( _mm_set1_ps( -0.f ), (a) ), _mm_and_ps( _mm_set1_ps( -0.f ), (b) ) )
178 : #define vf_flipsign(a,b) _mm_xor_ps( (a), _mm_and_ps( _mm_set1_ps( -0.f ), (b) ) )
179 :
180 : #if defined(__FMA__)
181 : #define vf_fma(a,b,c) _mm_fmadd_ps( (a), (b), (c) )
182 : #define vf_fms(a,b,c) _mm_fmsub_ps( (a), (b), (c) )
183 : #define vf_fnma(a,b,c) _mm_fnmadd_ps( (a), (b), (c) )
184 : #endif
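/* Example (illustrative sketch, not part of the API): one
   Newton-Raphson polishing step applied to vf_rcp_fast, per the
   accuracy note above.  Each step roughly doubles the number of
   correct bits.  Written without FMA so it does not depend on the
   __FMA__ block above. */

static inline vf_t
vf_example_rcp( vf_t a ) {   /* ~1/a, a assumed non-exotic and nonzero */
  vf_t x = vf_rcp_fast( a ); /* ~12 bit estimate of 1/a */
  return vf_mul( x, vf_sub( vf_bcast( 2.f ), vf_mul( a, x ) ) ); /* x*(2-a*x) */
}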
185 :
186 : /* Binary operations */
187 :
188 : /* Note: binary operations are not well defined on vector floats.
189 : If doing tricks with floating point binary representations, the user
190 : should use vf_to_vi_raw as necessary. */
191 :
192 : /* Logical operations */
193 :
194 : /* These all return proper vector conditionals */
195 :
196 : #define vf_lnot(a) _mm_castps_si128( _mm_cmp_ps( (a), _mm_setzero_ps(), _CMP_EQ_OQ ) ) /* [ !a0 !a1 ... !a3 ] */
197 : #define vf_lnotnot(a) _mm_castps_si128( _mm_cmp_ps( (a), _mm_setzero_ps(), _CMP_NEQ_OQ ) ) /* [ !!a0 !!a1 ... !!a3 ] */
198 : #define vf_signbit(a) _mm_srai_epi32( _mm_castps_si128( (a) ), 31 ) /* [ signbit(a0) signbit(a1) ... signbit(a3) ] */
199 :
200 : #define vf_eq(a,b) _mm_castps_si128( _mm_cmp_ps( (a), (b), _CMP_EQ_OQ ) ) /* [ a0==b0 a1==b1 ... a3==b3 ] */
201 : #define vf_gt(a,b) _mm_castps_si128( _mm_cmp_ps( (a), (b), _CMP_GT_OQ ) ) /* [ a0> b0 a1> b1 ... a3> b3 ] */
202 262144 : #define vf_lt(a,b) _mm_castps_si128( _mm_cmp_ps( (a), (b), _CMP_LT_OQ ) ) /* [ a0< b0 a1< b1 ... a3< b3 ] */
203 : #define vf_ne(a,b) _mm_castps_si128( _mm_cmp_ps( (a), (b), _CMP_NEQ_OQ ) ) /* [ a0!=b0 a1!=b1 ... a3!=b3 ] */
204 : #define vf_ge(a,b) _mm_castps_si128( _mm_cmp_ps( (a), (b), _CMP_GE_OQ ) ) /* [ a0>=b0 a1>=b1 ... a3>=b3 ] */
205 : #define vf_le(a,b) _mm_castps_si128( _mm_cmp_ps( (a), (b), _CMP_LE_OQ ) ) /* [ a0<=b0 a1<=b1 ... a3<=b3 ] */
206 :
207 : /* Conditional operations */
208 :
209 : #define vf_czero(c,f) _mm_andnot_ps( _mm_castsi128_ps( (c) ), (f) ) /* [ c0?0.f:f0 c1?0.f:f1 ... c3?0.f:f3 ] */
210 : #define vf_notczero(c,f) _mm_and_ps( _mm_castsi128_ps( (c) ), (f) ) /* [ c0?f0:0.f c1?f1:0.f ... c3?f3:0.f ] */
211 :
212 262144 : #define vf_if(c,t,f) _mm_blendv_ps( (f), (t), _mm_castsi128_ps( (c) ) ) /* [ c0?t0:f0 c1?t1:f1 ... c3?t3:f3 ] */
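/* Example (illustrative sketch, not part of the API): clamping lanes
   into [lo,hi] by combining the comparison and conditional operations
   above (vf_min / vf_max would also work; this just shows the idiom). */

static inline vf_t
vf_example_clamp( vf_t x, vf_t lo, vf_t hi ) {
  x = vf_if( vf_lt( x, lo ), lo, x ); /* x(l) = lo(l) where x(l)<lo(l) */
  x = vf_if( vf_gt( x, hi ), hi, x ); /* x(l) = hi(l) where x(l)>hi(l) */
  return x;
}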
213 :
214 : /* Conversion operations */
215 :
216 : /* Summarizing:
217 :
218 : vf_to_vc(a) returns [ !!a0 !!a1 ... !!a3 ]
219 :
220 : vf_to_vi(a) returns [ (int)a0 (int)a1 ... (int)a3 ]
221 : vf_to_vi_fast(a) returns [ (int)rintf(a0) (int)rintf(a1) ... (int)rintf(a3) ]
222 :
223 : vf_to_vu(a) returns [ (uint)a0 (uint)a1 ... (uint)a3 ]
224 : vf_to_vu_fast(a) returns [ (uint)rintf(a0) (uint)rintf(a1) ... (uint)rintf(a3) ]
225 :
226 : vf_to_vd(a,imm_i0,imm_i1) returns [ (double)a(imm_i0) (double)a(imm_i1) ]
227 :
228 : vf_to_vl(a,imm_i0,imm_i1) returns [ (long)a(imm_i0) (long)a(imm_i1) ]
229 :
230 : vf_to_vv(a,imm_i0,imm_i1) returns [ (ulong)a(imm_i0) (ulong)a(imm_i1) ]
231 :
232 : where rintf is configured for round-to-nearest-even rounding (Intel
233 : architecture defaults to round-nearest-even here ... sigh, they still
234 : don't fully get it) and imm_i* should be a compile time constant in
235 : 0:3. That is, the fast variants assume that the floating point inputs
236 : are already integral values in the appropriate range for the output type.
237 :
238 : The raw variants return just raw bits as the corresponding vector
239 : type. vf_to_vi_raw in particular allows doing advanced bit tricks on
240 : a vector float. The others are probably dubious but are provided for
241 : completeness. */
242 :
243 : #define vf_to_vc(a) _mm_castps_si128( _mm_cmp_ps( (a), _mm_setzero_ps(), _CMP_NEQ_OQ ) )
244 : #define vf_to_vi(a) vf_to_vi_fast( _mm_round_ps( (a), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) )
245 : #define vf_to_vu(a) vf_to_vu_fast( _mm_round_ps( (a), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) )
246 : #define vf_to_vd(a,imm_i0,imm_i1) _mm_cvtps_pd( _mm_permute_ps( (a), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0)) ) )
247 :
248 : #define vf_to_vl(f,imm_i0,imm_i1) (__extension__({ \
249 : vf_t _vf_to_vl_tmp = (f); \
250 : _mm_set_epi64x( (long)vf_extract( _vf_to_vl_tmp, (imm_i1) ), (long)vf_extract( _vf_to_vl_tmp, (imm_i0) ) ); /* sigh */ \
251 : }))
252 :
253 : #define vf_to_vv(f,imm_i0,imm_i1) (__extension__({ \
254 : vf_t _vf_to_vv_tmp = (f); \
255 : _mm_set_epi64x( (long)(ulong)vf_extract( _vf_to_vv_tmp, (imm_i1) ), \
256 : (long)(ulong)vf_extract( _vf_to_vv_tmp, (imm_i0) ) ); /* sigh */ \
257 : }))
258 :
259 : #define vf_to_vi_fast(a) _mm_cvtps_epi32( (a) )
260 :
261 : /* Note: Given that _mm_cvtps_epi32 existed for a long time, Intel
262 : clearly had the hardware under the hood for _mm_cvtps_epu32 but
263 : didn't bother to expose it pre-Skylake-X ... sigh (all too typical
264 : unfortunately). We use _mm_cvtps_epu32 where supported because it
265 : is faster and it replicates the same IB behaviors as the compiler
266 : generated scalar ASM for float to uint casts on these targets.
267 :
268 : Pre-Skylake-X, we emulate it by noting that subtracting 2^31 from
269 : a float holding an integer in [2^31,2^32) is exact and the result
270 : can be exactly converted to a signed integer by _mm_cvtps_epi32.
271 : We then use two's complement hacks to add back any shift. This also
272 : replicates the compiler's IB behaviors on these ISAs for float to
273 : uint casts. */
274 :
275 : #if defined(__AVX512F__) && defined(__AVX512VL__)
276 : #define vf_to_vu_fast( a ) _mm_cvtps_epu32( (a) )
277 : #else
278 262144 : static inline __m128i vf_to_vu_fast( vf_t a ) { /* FIXME: workaround vu_t isn't declared at this point */
279 : /**/ /* Assumes a is integer in [0,2^32) */
280 262144 : vf_t s = vf_bcast( (float)(1U<<31) ); /* 2^31 */
281 262144 : vc_t c = vf_lt ( a, s ); /* -1 if a<2^31, 0 o.w. */
282 262144 : vf_t as = vf_sub( a, s ); /* a-2^31 */
283 262144 : __m128i u = _mm_cvtps_epi32( vf_if( c, a, as ) ); /* (uint)(a if a<2^31, a-2^31 o.w.) */
284 262144 : __m128i us = _mm_add_epi32( u, _mm_set1_epi32( (int)(1U<<31) ) ); /* (uint)(a+2^31 if a<2^31, a o.w.) */
285 262144 : return _mm_castps_si128( _mm_blendv_ps( _mm_castsi128_ps( us ), _mm_castsi128_ps( u ), _mm_castsi128_ps( c ) ) );
286 262144 : }
287 : #endif
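/* Example (illustrative sketch, not part of the API): vf_to_vi
   truncates toward zero while vf_to_vi_fast uses round-to-nearest-even,
   so the two differ on non-integral inputs. */

static inline void
vf_example_convert( void ) {
  vf_t    x = vf( 1.5f, 2.5f, -1.5f, -2.5f );
  __m128i t = vf_to_vi     ( x ); /* [ 1 2 -1 -2 ] (truncated)             */
  __m128i r = vf_to_vi_fast( x ); /* [ 2 2 -2 -2 ] (round-to-nearest-even) */
  (void)t; (void)r;
}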
288 :
289 : #define vf_to_vc_raw(a) _mm_castps_si128( (a) )
290 : #define vf_to_vi_raw(a) _mm_castps_si128( (a) )
291 : #define vf_to_vu_raw(a) _mm_castps_si128( (a) )
292 : #define vf_to_vd_raw(a) _mm_castps_pd( (a) )
293 : #define vf_to_vl_raw(a) _mm_castps_si128( (a) )
294 : #define vf_to_vv_raw(a) _mm_castps_si128( (a) )
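/* Example (illustrative sketch, not part of the API): a bit trick
   enabled by vf_to_vi_raw, extracting the biased IEEE 754 exponent
   field of each lane (inputs assumed non-exotic per the note at the
   top of this file). */

static inline __m128i
vf_example_biased_exp( vf_t a ) {
  __m128i b = vf_to_vi_raw( a );                  /* raw bits of a */
  return _mm_and_si128( _mm_srli_epi32( b, 23 ),  /* drop the 23 mantissa bits */
                        _mm_set1_epi32( 0xff ) ); /* keep the 8 exponent bits  */
}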
295 :
296 : /* Reduction operations */
297 :
298 : static inline vf_t
299 196608 : vf_sum_all( vf_t x ) { /* Returns vf_bcast( sum( x ) ) */
300 196608 : x = _mm_hadd_ps( x, x ); /* x01 x23 ... */
301 196608 : return _mm_hadd_ps( x, x ); /* xsum ... */
302 196608 : }
303 :
304 : static inline vf_t
305 196608 : vf_min_all( vf_t x ) { /* Returns vf_bcast( min( x ) ) */
306 196608 : __m128 y;
307 196608 : y = _mm_permute_ps( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x2 x3 x0 x1 */
308 196608 : x = _mm_min_ps( x, y ); /* x02 x13 ... */
309 196608 : y = _mm_permute_ps( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x13 x02 ... */
310 196608 : x = _mm_min_ps( x, y ); /* xmin ... */
311 196608 : return x;
312 196608 : }
313 :
314 : static inline vf_t
315 196608 : vf_max_all( vf_t x ) { /* Returns vf_bcast( max( x ) ) */
316 196608 : __m128 y;
317 196608 : y = _mm_permute_ps( x, _MM_SHUFFLE( 1, 0, 3, 2 ) ); /* x2 x3 x0 x1 */
318 196608 : x = _mm_max_ps( x, y ); /* x02 x13 ... */
319 196608 : y = _mm_permute_ps( x, _MM_SHUFFLE( 2, 3, 0, 1 ) ); /* x13 x02 ... */
320 196608 : x = _mm_max_ps( x, y ); /* xmax ... */
321 196608 : return x;
322 196608 : }
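/* Example (illustrative sketch, not part of the API): reducing to a
   scalar by broadcasting the reduction and extracting lane 0. */

static inline float
vf_example_hsum( vf_t x ) { /* sum of the 4 lanes of x */
  return vf_extract( vf_sum_all( x ), 0 );
}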
323 :
324 : /* Misc operations */
325 :
326 : /* vf_gather(b,i) returns [ b[i(0)] b[i(1)] ... b[i(3)] ] where b is a
327 : "float const *" and i is a vi_t. */
328 :
329 9830451 : #define vf_gather(b,i) _mm_i32gather_ps( (b), (i), 4 )
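/* Example (illustrative sketch, not part of the API): gathering 4
   strided elements.  The index vector is built with a raw intrinsic
   here rather than the vi API to keep the sketch self contained. */

static inline vf_t
vf_example_gather_stride( float const * b, int stride ) {
  __m128i i = _mm_setr_epi32( 0, stride, 2*stride, 3*stride );
  return vf_gather( b, i ); /* [ b[0] b[stride] b[2*stride] b[3*stride] ] */
}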
330 :
331 : /* vf_transpose_4x4 transposes the 4x4 matrix stored in vf_t r0,r1,r2,r3
332 : and stores the result in 4x4 matrix vf_t c0,c1,c2,c3. All
333 : c0,c1,c2,c3 should be different for a well defined result.
334 : Otherwise, in-place operation and/or using the same vf_t to specify
335 : multiple rows of r is fine. */
336 :
337 196608 : #define vf_transpose_4x4( r0,r1,r2,r3, c0,c1,c2,c3 ) do { \
338 196608 : vf_t _vf_transpose_r0 = (r0); vf_t _vf_transpose_r1 = (r1); vf_t _vf_transpose_r2 = (r2); vf_t _vf_transpose_r3 = (r3); \
339 196608 : vf_t _vf_transpose_t; \
340 196608 : /* Transpose 2x2 blocks */ \
341 196608 : _vf_transpose_t = _vf_transpose_r0; _vf_transpose_r0 = _mm_unpacklo_ps( _vf_transpose_t, _vf_transpose_r2 ); \
342 196608 : /**/ _vf_transpose_r2 = _mm_unpackhi_ps( _vf_transpose_t, _vf_transpose_r2 ); \
343 196608 : _vf_transpose_t = _vf_transpose_r1; _vf_transpose_r1 = _mm_unpacklo_ps( _vf_transpose_t, _vf_transpose_r3 ); \
344 196608 : /**/ _vf_transpose_r3 = _mm_unpackhi_ps( _vf_transpose_t, _vf_transpose_r3 ); \
345 196608 : /* Transpose 1x1 blocks */ \
346 196608 : /**/ (c0) = _mm_unpacklo_ps( _vf_transpose_r0, _vf_transpose_r1 ); \
347 196608 : /**/ (c1) = _mm_unpackhi_ps( _vf_transpose_r0, _vf_transpose_r1 ); \
348 196608 : /**/ (c2) = _mm_unpacklo_ps( _vf_transpose_r2, _vf_transpose_r3 ); \
349 196608 : /**/ (c3) = _mm_unpackhi_ps( _vf_transpose_r2, _vf_transpose_r3 ); \
350 196608 : } while(0)
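/* Example (illustrative sketch, not part of the API): using
   vf_transpose_4x4 to turn 4 contiguous xyzw points (16 floats,
   16-byte aligned) into one vector per component. */

static inline void
vf_example_aos_to_soa( float const * p,
                       vf_t * x, vf_t * y, vf_t * z, vf_t * w ) {
  vf_t r0 = vf_ld( p      ); /* [ x0 y0 z0 w0 ] */
  vf_t r1 = vf_ld( p +  4 ); /* [ x1 y1 z1 w1 ] */
  vf_t r2 = vf_ld( p +  8 ); /* [ x2 y2 z2 w2 ] */
  vf_t r3 = vf_ld( p + 12 ); /* [ x3 y3 z3 w3 ] */
  vf_transpose_4x4( r0,r1,r2,r3, *x,*y,*z,*w );
}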