Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_sse_h
2 : #error "Do not include this directly; use fd_sse.h"
3 : #endif
4 :
5 : /* Vector double API **************************************************/
6 :
7 : /* A vd_t is a vector where each adjacent pair of 32-bit wide lanes
8 : (e.g. 0-1 / 2-3) holds a double precision IEEE 754 floating point
9 : value (a "double").
10 :
11 : Inputs to all operations assume that the values aren't exotic (no
12 : NaNs, no +/-Infs, no denorms) and, if the output of an operation
13 : would produce an exotic value in the IEEE 754 standard, the results
14 : of that operation are undefined. Additionally, correct handling of
15 : signed zero is not guaranteed. Lastly, these will not raise floating
16 : point exceptions or set math errnos.
17 :
18 : Basically, handling of exotics and signed zero will generally be
19 : reasonable but most of that relies on the underlying compiler and
20 : hardware having conformant behavior and this is flaky at the best of
21 : times. So it is best for developers not to assume conformant
22 : behavior.
23 :
24 : Note that, when doing conditional operations on a vector double, the
25 : vector conditional should have identical values in adjacent pairs of
26 : lanes. Results are undefined otherwise.
27 :
28 : These mirror the other APIs as much as possible. Macros are
29 : preferred over static inlines when it is possible to do it robustly
30 : to reduce the risk of the compiler mucking it up. */
31 :
32 19922995 : #define vd_t __m128d
33 :
34 : /* Constructors */
35 :
36 : /* Given the double values, return ... */
37 :
38 1179648 : #define vd(d0,d1) _mm_setr_pd( (d0), (d1) ) /* [ d0 d1 ] */
39 :
40 524288 : #define vd_bcast(d0) _mm_set1_pd( (d0) ) /* [ d0 d0 ] */
41 :
42 : /* vd_permute returns [ d(imm_i0) d(imm_i1) ]. imm_i* should be compile
43 : time constants in 0:1. */
44 :
45 : #define vd_permute( d, imm_i0, imm_i1 ) _mm_permute_pd( (d), (imm_i0) + 2*(imm_i1) )
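
/* Illustrative usage sketch (not part of the API in this header; the
   helper name is hypothetical): build [ x y ], scale it by a broadcast
   constant with the raw intrinsic (vd_mul appears further below), then
   swap the lanes with vd_permute. */

static inline vd_t
vd_example_scale_and_swap( double x, double y, double s ) {
  vd_t a = _mm_mul_pd( vd( x, y ), vd_bcast( s ) ); /* [ s*x s*y ] */
  return vd_permute( a, 1, 0 );                     /* [ s*y s*x ] */
}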
46 :
47 : /* Predefined constants */
48 :
49 17104947 : #define vd_zero() _mm_setzero_pd() /* Return [ 0. 0. ] */
50 17104947 : #define vd_one() _mm_set1_pd(1.) /* Return [ 1. 1. ] */
51 :
52 : /* Memory operations */
53 :
54 : /* vd_ld returns the 2 doubles at the 16-byte aligned / 16-byte sized
55 : location p as a vector double. vd_ldu is the same but p does not have
56 : to be aligned. vd_st writes the vector double to the 16-byte aligned
57 : / 16-byte sized location p as 2 doubles. vd_stu is the same but p
58 : does not have to be aligned. In all these, 64-bit lane l will be at
59 : p[l]. FIXME: USE ATTRIBUTES ON P PASSED TO THESE? */
60 :
61 17104947 : #define vd_ld(p) _mm_load_pd( (p) )
62 34209894 : #define vd_ldu(p) _mm_loadu_pd( (p) )
63 17104947 : #define vd_st(p,d) _mm_store_pd( (p), (d) )
64 34209894 : #define vd_stu(p,d) _mm_storeu_pd( (p), (d) )
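
/* Illustrative usage sketch (hypothetical helper, not part of this
   header's API): accumulate 2 doubles from a possibly unaligned src into
   a 16-byte aligned dst.  The raw _mm_add_pd intrinsic is used because
   vd_add is defined further below. */

static inline void
vd_example_accum( double *       dst,    /* 16-byte aligned, 2 doubles     */
                  double const * src ) { /* arbitrary alignment, 2 doubles */
  vd_st( dst, _mm_add_pd( vd_ld( dst ), vd_ldu( src ) ) );
}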
65 :
66 : /* vd_ldif is an optimized equivalent to vd_notczero(c,vd_ldu(p)) (may
67 : have different behavior if c is not a proper vector conditional). It
68 : is provided for symmetry with the vd_stif operation. vd_stif stores
69 : x(n) to p[n] if c(n) is true and leaves p[n] unchanged otherwise.
70 : Undefined behavior if c(n) is not a proper paired lane vector
71 : conditional. */
72 :
73 : #define vd_ldif(c,p)   _mm_maskload_pd(  (p), (c) )
74 : #define vd_stif(c,p,x) _mm_maskstore_pd( (p), (c), (x) )
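
/* Illustrative usage sketch (hypothetical helper, not part of this
   header's API): increment only the 64-bit lanes of p selected by c.
   c is assumed to be a proper paired lane vector conditional (e.g.
   produced by vd_lt further below).  The raw _mm_add_pd intrinsic is
   used because vd_add is defined further below. */

static inline void
vd_example_masked_incr( vc_t c, double * p ) {
  vd_t x = vd_ldif( c, p );                   /* selected lanes of p, 0. elsewhere */
  vd_stif( c, p, _mm_add_pd( x, vd_one() ) ); /* write back selected lanes only    */
}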
75 :
76 : /* Element operations */
77 :
78 : /* vd_extract extracts the double in 64-bit lane imm (e.g. indexed
79 : 0 and 1 corresponding to 32-bit pairs of lanes 0-1 and 2-3
80 : respectively) from the vector double as a double. vd_insert returns
81 : the vector double formed by replacing the value in 64-bit lane imm of
82 : a with the provided double. imm should be a compile time constant in
83 : 0:1. vd_extract_variable and vd_insert_variable are slower but
84 : the 64-bit lane n does not have to be known at compile time (should
85 : still be in 0:1). */
86 :
87 : static inline double
88 34209894 : vd_extract( vd_t a, int imm ) { /* FIXME: USE EPI64 HACKS? */
89 34209894 : double d[2] V_ATTR;
90 34209894 : _mm_store_pd( d, a );
91 34209894 : return d[imm];
92 34209894 : }
93 :
94 : static inline vd_t
95 34209894 : vd_insert( vd_t a, int imm, double v ) { /* FIXME: USE EPI64 HACKS? */
96 34209894 : double d[2] V_ATTR;
97 34209894 : _mm_store_pd( d, a );
98 34209894 : d[imm] = v;
99 34209894 : return _mm_load_pd( d );
100 34209894 : }
101 :
102 : static inline double
103 34209894 : vd_extract_variable( vd_t a, int n ) {
104 34209894 : double d[2] V_ATTR;
105 34209894 : _mm_store_pd( d, a );
106 34209894 : return d[n];
107 34209894 : }
108 :
109 : static inline vd_t
110 34209894 : vd_insert_variable( vd_t a, int n, double v ) {
111 34209894 : double d[2] V_ATTR;
112 34209894 : _mm_store_pd( d, a );
113 34209894 : d[n] = v;
114 34209894 : return _mm_load_pd( d );
115 34209894 : }
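
/* Illustrative usage sketch (hypothetical helper, not part of this
   header's API): horizontal sum via lane extraction.  A vector variant
   (vd_sum_all) is provided near the end of this header. */

static inline double
vd_example_hsum( vd_t a ) {
  return vd_extract( a, 0 ) + vd_extract( a, 1 ); /* a0 + a1 */
}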
116 :
117 : /* Arithmetic operations */
118 :
119 : /* vd_neg(a) returns [ -a0 -a1 ] (i.e. -a )
120 : vd_sign(a) returns [ sign(a0) sign(a1) ]
121 : vd_abs(a) returns [ fabs(a0) fabs(a1) ] (i.e. abs(a))
122 : vd_negabs(a) returns [ -fabs(a0) -fabs(a1) ] (i.e. -abs(a))
123 : vd_ceil(a) returns [ ceil(a0) ceil(a1) ] (i.e. ceil(a))
124 : vd_floor(a) returns [ floor(a0) floor(a1) ] (i.e. floor(a))
125 : vd_rint(a) returns [ rint(a0) rint(a1) ] (i.e. roundb(a))
126 : vd_trunc(a) returns [ trunc(a0) trunc(a1) ] (i.e. fix(a))
127 : vd_sqrt(a) returns [ sqrt(a0) sqrt(a1) ] (i.e. sqrt(a))
128 : vd_rcp_fast(a) returns [ ~rcp(a0) ~rcp(a1) ]
129 : vd_rsqrt_fast(a) returns [ ~rsqrt(a0) ~rsqrt(a1) ]
130 :
131 : vd_add( a,b) returns [ a0+b0 a1+b1 ] (i.e. a +b)
132 : vd_sub( a,b) returns [ a0-b0 a1-b1 ] (i.e. a -b)
133 : vd_mul( a,b) returns [ a0*b0 a1*b1 ] (i.e. a.*b)
134 : vd_div( a,b) returns [ a0/b0 a1/b1 ] (i.e. a./b)
135 : vd_min( a,b) returns [ fmin(a0,b0) fmin(a1,b1) ] (i.e. min([a;b])) (a and b are 1x2)
136 : vd_max( a,b) returns [ fmax(a0,b0) fmax(a1,b1) ] (i.e. max([a;b])) (a and b are 1x2)
137 : vd_copysign(a,b) returns [ copysign(a0,b0) copysign(a1,b1) ]
138 : vd_flipsign(a,b) returns [ flipsign(a0,b0) flipsign(a1,b1) ]
139 :
140 : vd_fma( a,b,c) returns [ fma(a0,b0, c0) fma(a1,b1, c1) ] (i.e. a.*b+c)
141 : vd_fms( a,b,c) returns [ fma(a0,b0,-c0) fma(a1,b1,-c1) ] (i.e. a.*b-c)
142 : vd_fnma(a,b,c) returns [ -fma(a0,b0,-c0) -fma(a1,b1,-c1) ] (i.e. -a.*b+c)
143 :
144 : where sign(a) is -1. if a's sign bit is set and +1. otherwise, rcp(a)
145 : is 1./a, rsqrt(a) is 1./sqrt(a), and flipsign(a,b) returns -a if b's
146 : sign bit is set and a otherwise.
147 :
148 : rint is in round-to-nearest-even rounding mode (note rint and
149 : nearbyint are identical once floating point exceptions are ignored).
150 :
151 : sqrt should typically be full accuracy.
152 :
153 : rcp_fast and rsqrt_fast should typically be accurate to ~12 or more bits
154 : (~3 or more decimal digits) such that (nearly) full accuracy can be
155 : achieved with two to three rounds of Newton-Raphson polishing (see the sketch after the macros below).
156 : Bit level replicable code should avoid rcp_fast and rsqrt_fast though
157 : as the approximations used can vary between various generations /
158 : steppings / microcode updates of x86 processors (including Intel and
159 : AMD). */
160 :
161 : #define vd_neg(a) _mm_xor_pd( _mm_set1_pd( -0. ), (a) )
162 : #define vd_sign(a) _mm_xor_pd( _mm_set1_pd( 1. ), _mm_and_pd( _mm_set1_pd( -0. ), (a) ) )
163 : #define vd_abs(a) _mm_andnot_pd( _mm_set1_pd( -0. ), (a) )
164 : #define vd_negabs(a) _mm_or_pd( _mm_set1_pd( -0. ), (a) )
165 : #define vd_ceil(a) _mm_ceil_pd( (a) )
166 : #define vd_floor(a) _mm_floor_pd( (a) )
167 : #define vd_rint(a) _mm_round_pd( (a), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC )
168 : #define vd_trunc(a) _mm_round_pd( (a), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC )
169 : #define vd_sqrt(a) _mm_sqrt_pd( (a) )
170 : #define vd_rcp_fast(a) _mm_cvtps_pd( _mm_rcp_ps( _mm_cvtpd_ps( (a) ) ) )
171 : #define vd_rsqrt_fast(a) _mm_cvtps_pd( _mm_rsqrt_ps( _mm_cvtpd_ps( (a) ) ) )
172 :
173 : #define vd_add(a,b) _mm_add_pd( (a), (b) )
174 524288 : #define vd_sub(a,b) _mm_sub_pd( (a), (b) )
175 : #define vd_mul(a,b) _mm_mul_pd( (a), (b) )
176 : #define vd_div(a,b) _mm_div_pd( (a), (b) )
177 : #define vd_min(a,b) _mm_min_pd( (a), (b) )
178 : #define vd_max(a,b) _mm_max_pd( (a), (b) )
179 : #define vd_copysign(a,b) _mm_or_pd( _mm_andnot_pd( _mm_set1_pd( -0. ), (a) ), _mm_and_pd( _mm_set1_pd( -0. ), (b) ) )
180 : #define vd_flipsign(a,b) _mm_xor_pd( (a), _mm_and_pd( _mm_set1_pd( -0. ), (b) ) )
181 :
182 : #define vd_fma(a,b,c) _mm_fmadd_pd( (a), (b), (c) )
183 : #define vd_fms(a,b,c) _mm_fmsub_pd( (a), (b), (c) )
184 : #define vd_fnma(a,b,c) _mm_fnmadd_pd( (a), (b), (c) )
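
/* Illustrative sketch (hypothetical helper, not part of this header's
   API) of the Newton-Raphson polishing mentioned above: starting from
   the ~12 bit vd_rcp_fast seed, each iteration roughly doubles the
   number of accurate bits, so three iterations give (nearly) full double
   precision.  As noted above, results are not bit-level reproducible
   across x86 steppings. */

static inline vd_t
vd_example_rcp( vd_t a ) {
  vd_t r = vd_rcp_fast( a );            /* ~12 bit approximation of 1/a */
  for( int i=0; i<3; i++ ) {
    vd_t e = vd_fnma( a, r, vd_one() ); /* e = 1 - a*r                  */
    r = vd_fma( r, e, r );              /* r = r*(1+e) = r*(2 - a*r)    */
  }
  return r;
}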
185 :
186 : /* Binary operations */
187 :
188 : /* Note: binary operations are not well defined on vector doubles.
189 : If doing tricks with floating point binary representations, the user
190 : should use vd_to_vl_raw as necessary. */
191 :
192 : /* Logical operations */
193 :
194 : /* These all return proper paired lane vector conditionals */
195 :
196 : #define vd_lnot(a) /* [ !a0 !a0 !a1 !a1 ] */ \
197 : _mm_castpd_si128( _mm_cmp_pd( (a), _mm_setzero_pd(), _CMP_EQ_OQ ) )
198 : #define vd_lnotnot(a) /* [ !!a0 !!a0 !!a1 !!a1 ] */ \
199 : _mm_castpd_si128( _mm_cmp_pd( (a), _mm_setzero_pd(), _CMP_NEQ_OQ ) )
200 : #define vd_signbit(a) /* [ signbit(a0) signbit(a0) signbit(a1) signbit(a1) ] */ \
201 : _mm_castps_si128( _mm_permute_ps( _mm_castsi128_ps( _mm_srai_epi32( _mm_castpd_si128( (a) ), 31 ) ), _MM_SHUFFLE(3,3,1,1) ) )
202 :
203 : #define vd_eq(a,b) _mm_castpd_si128( _mm_cmp_pd( (a), (b), _CMP_EQ_OQ ) ) /* [ a0==b0 a0==b0 a1==b1 a1==b1 ] */
204 : #define vd_gt(a,b) _mm_castpd_si128( _mm_cmp_pd( (a), (b), _CMP_GT_OQ ) ) /* [ a0> b0 a0> b0 a1> b1 a1> b1 ] */
205 524288 : #define vd_lt(a,b) _mm_castpd_si128( _mm_cmp_pd( (a), (b), _CMP_LT_OQ ) ) /* [ a0< b0 a0< b0 a1< b1 a1< b1 ] */
206 : #define vd_ne(a,b) _mm_castpd_si128( _mm_cmp_pd( (a), (b), _CMP_NEQ_OQ ) ) /* [ a0!=b0 a0!=b0 a1!=b1 a1!=b1 ] */
207 : #define vd_ge(a,b) _mm_castpd_si128( _mm_cmp_pd( (a), (b), _CMP_GE_OQ ) ) /* [ a0>=b0 a0>=b0 a1>=b1 a1>=b1 ] */
208 : #define vd_le(a,b) _mm_castpd_si128( _mm_cmp_pd( (a), (b), _CMP_LE_OQ ) ) /* [ a0<=b0 a0<=b0 a1<=b1 a1<=b1 ] */
209 :
210 : /* Conditional operations */
211 :
212 : /* c should be a proper paired lane vector conditional for these */
213 :
214 : #define vd_czero(c,a) _mm_andnot_pd( _mm_castsi128_pd( (c) ), (a) ) /* [ c01?0.:a0 c23?0.:a1 ] */
215 : #define vd_notczero(c,a) _mm_and_pd( _mm_castsi128_pd( (c) ), (a) ) /* [ c01?a0:0. c23?a1:0. ] */
216 :
217 524288 : #define vd_if(c,t,f) _mm_blendv_pd( (f), (t), _mm_castsi128_pd( (c) ) ) /* [ c01?t0:f0 c23?t1:f1 ] */
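
/* Illustrative usage sketch (hypothetical helper, not part of this
   header's API): branchless per lane selection of the value with the
   smaller magnitude.  vd_lt yields the proper paired lane conditional
   that vd_if requires. */

static inline vd_t
vd_example_min_magnitude( vd_t a, vd_t b ) {
  vc_t c = vd_lt( vd_abs( a ), vd_abs( b ) ); /* lanewise |a|<|b| */
  return vd_if( c, a, b );                    /* |a|<|b| ? a : b  */
}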
218 :
219 : /* Conversion operations */
220 :
221 : /* Summarizing:
222 :
223 : vd_to_vc(d) returns [ !!d0 !!d0 !!d1 !!d1 ] ... proper paired lane
224 :
225 : vd_to_vf(d,f,0) returns [ (float)d0 (float)d1 f2 f3 ]
226 : vd_to_vf(d,f,1) returns [ f0 f1 (float)d0 (float)d1 ]
227 :
228 : vd_to_vi(d,i,0) returns [ (int)d0 (int)d1 i2 i3 ]
229 : vd_to_vi(d,i,1) returns [ i0 i1 (int)d0 (int)d1 ]
230 :
231 : vd_to_vi_fast(d,i,0) returns [ (int)rint(d0) (int)rint(d1) i2 i3 ]
232 : vd_to_vi_fast(d,i,1) returns [ i0 i1 (int)rint(d0) (int)rint(d1) ]
233 :
234 : vd_to_vu(d,u,0) returns [ (uint)d0 (uint)d1 u2 u3 ]
235 : vd_to_vu(d,u,1) returns [ u0 u1 (uint)d0 (uint)d1 ]
236 :
237 : vd_to_vu_fast(d,u,0) returns [ (uint)rint(d0) (uint)rint(d1) u2 u3 ]
238 : vd_to_vu_fast(d,u,1) returns [ u0 u1 (uint)rint(d0) (uint)rint(d1) ]
239 :
240 : vd_to_vl(d) returns [ (long)d0 (long)d1 ]
241 :
242 : vd_to_vv(d) returns [ (ulong)d0 (ulong)d1 ]
243 :
244 : where rint is configured for round-to-nearest-even rounding (Intel
245 : architecture defaults to round-nearest-even here ... sigh, they still
246 : don't fully get it) and imm_hi should be a compile time constant.
247 : That is, the fast variants assume that floating point inputs are
248 : already integral values in the appropriate range for the output type.
249 :
250 : Note that vd_to_{vf,vi,vi_fast,vu,vu_fast} insert the converted values
251 : into lanes 0:1 (imm_hi==0) or 2:3 (imm_hi!=0) of the provided vector.
252 :
253 : The raw variants return just the raw bits as the corresponding vector
254 : type. vd_to_vl_raw allows doing advanced bit tricks on a vector
255 : double. The others are probably dubious but are provided for
256 : completeness. */
257 :
258 : #define vd_to_vc(d) _mm_castpd_si128( _mm_cmp_pd( (d), _mm_setzero_pd(), _CMP_NEQ_OQ ) )
259 :
260 393216 : static inline vf_t vd_to_vf( vd_t d, vf_t f, int imm_hi ) {
261 393216 : vf_t _d = _mm_cvtpd_ps( d ); /* [ d0 d1 0 0 ] */
262 393216 : if( imm_hi ) _d = _mm_shuffle_ps( f, _d, _MM_SHUFFLE(1,0,1,0) ); /* Compile time */ /* mm_movelh_ps? */
263 196608 : else _d = _mm_shuffle_ps( _d, f, _MM_SHUFFLE(3,2,1,0) );
264 393216 : return _d;
265 393216 : }
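
/* Illustrative usage sketch (hypothetical helper, not part of this
   header's API): narrow two vector doubles into one vector float using
   the imm_hi convention summarized above (lanes 0:1 first, then lanes
   2:3). */

static inline vf_t
vd_example_pack_to_vf( vd_t lo, vd_t hi ) {
  vf_t f = _mm_setzero_ps();
  f = vd_to_vf( lo, f, 0 ); /* [ (float)lo0 (float)lo1 0 0 ]                   */
  f = vd_to_vf( hi, f, 1 ); /* [ (float)lo0 (float)lo1 (float)hi0 (float)hi1 ] */
  return f;
}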
266 :
267 : #define vd_to_vi(d,i,imm_hi) vd_to_vi_fast( _mm_round_pd( (d), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ), (i), (imm_hi) )
268 : #define vd_to_vu(d,i,imm_hi) vd_to_vu_fast( _mm_round_pd( (d), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ), (i), (imm_hi) )
269 :
270 786432 : static inline vi_t vd_to_vi_fast( vd_t d, vi_t i, int imm_hi ) {
271 786432 : vf_t _d = _mm_castsi128_ps( _mm_cvtpd_epi32( d ) ); /* [ d0 d1 0 0 ] */
272 786432 : vf_t _i = _mm_castsi128_ps( i );
273 786432 : if( imm_hi ) _d = _mm_shuffle_ps( _i, _d, _MM_SHUFFLE(1,0,1,0) ); /* Compile time */
274 393216 : else _d = _mm_shuffle_ps( _d, _i, _MM_SHUFFLE(3,2,1,0) );
275 786432 : return _mm_castps_si128( _d );
276 786432 : }
277 :
278 786432 : static inline vu_t vd_to_vu_fast( vd_t d, vu_t u, int imm_hi ) {
279 :
280 : /* Note: Given that _mm_cvtpd_epi32 has existed for a long time, Intel
281 : clearly had the hardware under the hood for _mm_cvtpd_epu32 but
282 : didn't bother to expose it pre-Skylake-X ... sigh (all too typical
283 : unfortunately). We use _mm_cvtpd_epu32 where supported because it
284 : is faster and it replicates the same IB behaviors as the compiler
285 : generated scalar ASM for double to uint casts on these targets.
286 :
287 : Pre-Skylake-X, we emulate it by noting that subtracting 2^31
288 : from a double holding an integer in [0,2^32) is exact and the
289 : result can be exactly converted to a signed integer by
290 : _mm_cvtpd_epi32. We then use twos complement hacks to add back
291 : any shift. This also replicates the compiler's IB behaviors on
292 : these ISAs for double to uint casts. */
293 :
294 262144 : # if defined(__AVX512F__) && defined(__AVX512VL__)
295 262144 : vu_t v = _mm_cvtpd_epu32( d );
296 : # else
297 : /**/ // Assumes d is integer in [0,2^32)
298 524288 : vd_t s = vd_bcast( (double)(1UL<<31) ); // (double)2^31
299 524288 : vc_t c = vd_lt ( d, s ); // -1L if d<2^31, 0L o.w.
300 524288 : vd_t ds = vd_sub( d, s ); // (double)(d-2^31)
301 524288 : vu_t v0 = _mm_cvtpd_epi32( vd_if( c, d, ds ) ); // (uint)(d if d<2^31, d-2^31 o.w.), d/c lanes 2,3
302 524288 : vu_t v1 = vu_add( v0, vu_bcast( 1U<<31) ); // (uint)(d+2^31 if d<2^31, d o.w.), d/c lanes 2,3
303 524288 : vu_t v = vu_if( vc_permute( c, 0,2,0,2 ), v0, v1 ); // (uint)d, d/c lanes 2,3
304 524288 : # endif
305 : /* Compile time */
306 786432 : return imm_hi ? _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( u ), _mm_castsi128_ps( v ), _MM_SHUFFLE(1,0,1,0) ) )
307 786432 : : _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v ), _mm_castsi128_ps( u ), _MM_SHUFFLE(3,2,1,0) ) );
308 786432 : }
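
/* Illustrative scalar sketch (hypothetical helper, not used by this
   header) of the pre-Skylake-X trick above: convert a double holding an
   integer in [0,2^32) to a uint using only a signed conversion plus a
   twos complement shift. */

static inline uint
vd_example_dtou_scalar( double d ) {
  double s = (double)(1UL<<31);       /* 2^31, exactly representable as a double */
  if( d<s ) return (uint)(int)d;      /* already fits in a signed int            */
  return (uint)(int)(d-s) + (1U<<31); /* shift down, convert, add 2^31 back
                                         (wraps mod 2^32)                        */
}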
309 :
310 : /* FIXME: IS IT FASTER TO USE INSERT / EXTRACT FOR THESE? */
311 :
312 196608 : static inline __m128i vd_to_vl( vd_t d ) { /* FIXME: workaround vl_t isn't declared at this point */
313 196608 : union { double d[2]; __m128d v[1]; } t[1];
314 196608 : union { long l[2]; __m128i v[1]; } u[1];
315 196608 : _mm_store_pd( t->d, d );
316 196608 : u->l[0] = (long)t->d[0];
317 196608 : u->l[1] = (long)t->d[1];
318 196608 : return _mm_load_si128( u->v );
319 196608 : }
320 :
321 196608 : static inline __m128i vd_to_vv( vd_t d ) { /* FIXME: workaround vv_t isn't declared at this point */
322 196608 : union { double d[2]; __m128d v[1]; } t[1];
323 196608 : union { ulong u[2]; __m128i v[1]; } u[1];
324 196608 : _mm_store_pd( t->d, d );
325 196608 : u->u[0] = (ulong)t->d[0];
326 196608 : u->u[1] = (ulong)t->d[1];
327 196608 : return _mm_load_si128( u->v );
328 196608 : }
329 :
330 : #define vd_to_vc_raw(a) _mm_castpd_si128( (a) )
331 : #define vd_to_vf_raw(a) _mm_castpd_ps( (a) )
332 : #define vd_to_vi_raw(a) _mm_castpd_si128( (a) )
333 : #define vd_to_vu_raw(a) _mm_castpd_si128( (a) )
334 : #define vd_to_vl_raw(a) _mm_castpd_si128( (a) )
335 : #define vd_to_vv_raw(a) _mm_castpd_si128( (a) )
336 :
337 : /* Reduction operations */
338 :
339 : static inline vd_t
340 196608 : vd_sum_all( vd_t x ) { /* Returns vd_bcast( sum( x ) ) */
341 196608 : return _mm_hadd_pd( x, x ); /* xsum ... */
342 196608 : }
343 :
344 : static inline vd_t
345 196608 : vd_min_all( vd_t a ) { /* Returns vd_bcast( min( x ) ) */
346 196608 : return _mm_min_pd( a, _mm_permute_pd( a, 1 ) );
347 196608 : }
348 :
349 : static inline vd_t
350 196608 : vd_max_all( vd_t a ) { /* Returns vd_bcast( max( x ) ) */
351 196608 : return _mm_max_pd( a, _mm_permute_pd( a, 1 ) );
352 196608 : }
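
/* Illustrative usage sketch (hypothetical helper, not part of this
   header's API): dot product of two 16-byte aligned arrays holding 2*n
   doubles each, accumulated with vd_fma and reduced with vd_sum_all. */

static inline double
vd_example_dot( double const * x,   /* 16-byte aligned, 2*n doubles */
                double const * y,   /* 16-byte aligned, 2*n doubles */
                ulong          n ) {
  vd_t acc = vd_zero();
  for( ulong i=0UL; i<n; i++ ) acc = vd_fma( vd_ld( x+2UL*i ), vd_ld( y+2UL*i ), acc );
  return vd_extract( vd_sum_all( acc ), 0 );
}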
353 :
354 : /* Misc operations */
355 :
356 : /* vd_gather(b,i,imm_i0,imm_i1) returns [ b[i(imm_i0)] b[i(imm_i1)] ]
357 : where b is a "double const *" and i is a vi_t and imm_i0,imm_i1 are
358 : compile time constants in 0:3. */
359 :
360 68419788 : #define vd_gather(b,i,imm_i0,imm_i1) _mm_i32gather_pd( (b), _mm_shuffle_epi32( (i), _MM_SHUFFLE(3,2,(imm_i1),(imm_i0))), 8 )
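
/* Illustrative usage sketch (hypothetical helper, not part of this
   header's API): gather two table entries in reversed index order. */

static inline vd_t
vd_example_gather_rev( double const * b, vi_t i ) {
  return vd_gather( b, i, 1, 0 ); /* [ b[i1] b[i0] ] */
}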
361 :
362 : /* vd_transpose_2x2 transposes the 2x2 matrix stored in vd_t r0,r1
363 : and stores the result in 2x2 matrix vd_t c0,c1. All c0,c1 should be
364 : different for a well defined result. Otherwise, in-place operation
365 : and/or using the same vd_t to specify multiple rows of r is fine. */
366 :
367 196608 : #define vd_transpose_2x2( r0,r1, c0,c1 ) do { \
368 196608 : vd_t _vd_transpose_r0 = (r0); vd_t _vd_transpose_r1 = (r1); \
369 196608 : (c0) = _mm_unpacklo_pd( _vd_transpose_r0, _vd_transpose_r1 ); \
370 196608 : (c1) = _mm_unpackhi_pd( _vd_transpose_r0, _vd_transpose_r1 ); \
371 196608 : } while(0)
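
/* Illustrative usage sketch (hypothetical helper, not part of this
   header's API): transpose a 2x2 row-major matrix at 16-byte aligned m
   into 16-byte aligned mt. */

static inline void
vd_example_transpose( double const * m,
                      double *       mt ) {
  vd_t c0, c1;
  vd_transpose_2x2( vd_ld( m ), vd_ld( m+2 ), c0, c1 );
  vd_st( mt,   c0 );
  vd_st( mt+2, c1 );
}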