shufps on Core2(including Penryn) brings data into the integer domain, causing a bypass delay to get it back to the FP execution units for addps, but movhlps is entirely in the FP domain. shufpd also runs in the float domain.
// Use dummy = a recently-dead variable that vec depends on,
// so it doesn't introduce a false dependency,
// and the compiler probably still has it in a register
__m128d highhalf_pd(__m128d dummy, __m128d vec) {
#ifdef __AVX__
// With 3-operand AVX instructions, don't create an extra dependency on something we don't need anymore.
(void)dummy;
return _mm_unpackhi_pd(vec, vec);
#else
// Without AVX, we can save a MOVAPS with MOVHLPS into a dead register
__m128 tmp = _mm_castpd_ps(dummy);
__m128d high = _mm_castps_pd(_mm_movehl_ps(tmp, _mm_castpd_ps(vec)));
return high;
#endif
}
SSE1(又名 SSE) :
float hsum_ps_sse1(__m128 v) { // v = [ D C | B A ]
__m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); // [ C D | A B ]
__m128 sums = _mm_add_ps(v, shuf); // sums = [ D+C C+D | B+A A+B ]
shuf = _mm_movehl_ps(shuf, sums); // [ C D | D+C C+D ] // let the compiler avoid a mov by reusing shuf
sums = _mm_add_ss(sums, shuf);
return _mm_cvtss_f32(sums);
}
# gcc 5.3 -O3: looks optimal
movaps xmm1, xmm0 # I think one movaps is unavoidable, unless we have a 2nd register with known-safe floats in the upper 2 elements
shufps xmm1, xmm0, 177
addps xmm0, xmm1
movhlps xmm1, xmm0 # note the reuse of shuf, avoiding a movaps
addss xmm0, xmm1
# clang 3.7.1 -O3:
movaps xmm1, xmm0
shufps xmm1, xmm1, 177
addps xmm1, xmm0
movaps xmm0, xmm1
shufpd xmm0, xmm0, 1
addss xmm0, xmm1
I reported a 关于对洗牌感到悲观的唠叨. It has its own internal representation for shuffling, and turns that back into shuffles. gcc more often uses the instructions that directly match the intrinsic you used.
通常 clang 比 gcc 做得更好,在指令选择不是手动调优的代码中,或者即使对于非常数情况,内部函数是最优的,常数传播也可以简化事情。总的来说,编译器工作起来像一个适合内部特性的编译器,而不仅仅是一个汇编器,这是一件好事。编译器通常可以从标量 C 中生成好的结果,而这些结果甚至不会像好的结果那样工作。最终,编译器将把内部函数视为优化器的另一个 C 运算符。
If code-size is your major concern, two haddps (_mm_hadd_ps) instructions will do the trick (Paul R's answer). This is also the easiest to type and remember. It is 不是很快, though. Even Intel Skylake still decodes each haddps to 3 uops, with 6 cycle latency. So even though it saves machine-code bytes (L1 I-cache), it takes up more space in the more-valuable uop-cache. Real use-cases for haddps: 一个移位和求和问题, or doing some scaling at an intermediate step 在这个 SSE atoi()实施.
#ifdef __AVX__
float hsum256_ps_avx(__m256 v) {
__m128 vlow = _mm256_castps256_ps128(v);
__m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128
vlow = _mm_add_ps(vlow, vhigh); // add the low 128
return hsum_ps_sse3(vlow); // and inline the sse3 version, which is optimal for AVX
// (no wasted instructions, and all of them are the 4B minimum)
}
#endif
vmovaps xmm1,xmm0 # huh, what the heck gcc? Just extract to xmm1
vextractf128 xmm0,ymm0,0x1
vaddps xmm0,xmm1,xmm0
vmovshdup xmm1,xmm0
vaddps xmm0,xmm1,xmm0
vmovhlps xmm1,xmm1,xmm0
vaddss xmm0,xmm0,xmm1
vzeroupper
ret
Double-precision:
double hsum_pd_sse2(__m128d vd) { // v = [ B | A ]
__m128 undef = _mm_undefined_ps(); // don't worry, we only use addSD, never touching the garbage bits with an FP add
__m128 shuftmp= _mm_movehl_ps(undef, _mm_castpd_ps(vd)); // there is no movhlpd
__m128d shuf = _mm_castps_pd(shuftmp);
return _mm_cvtsd_f64(_mm_add_sd(vd, shuf));
}
# gcc 5.3.0 -O3
pxor xmm1, xmm1 # hopefully when inlined, gcc could pick a register it knew wouldn't cause a false dep problem, and avoid the zeroing
movhlps xmm1, xmm0
addsd xmm0, xmm1
# clang 3.7.1 -O3 again doesn't use movhlps:
xorpd xmm2, xmm2 # with #define _mm_undefined_ps _mm_setzero_ps
movapd xmm1, xmm0
unpckhpd xmm1, xmm2
addsd xmm1, xmm0
movapd xmm0, xmm1 # another clang bug: wrong choice of operand order
// This doesn't compile the way it's written
double hsum_pd_scalar_sse2(__m128d vd) {
double tmp;
_mm_storeh_pd(&tmp, vd); // store the high half
double lo = _mm_cvtsd_f64(vd); // cast the low half
return lo+tmp;
}
# gcc 5.3 -O3
haddpd xmm0, xmm0 # Lower latency but less throughput than storing to memory
# ICC13
movhpd QWORD PTR [-8+rsp], xmm0 # only needs the store port, not the shuffle unit
addsd xmm0, QWORD PTR [-8+rsp]
存储到内存和返回避免了 ALU 操作。如果洗牌端口压力,或者一般的 ALU 上升,是一个瓶颈,这是很好的。(注意,它不需要 sub rsp, 8或任何东西,因为 x86-64 SysV ABI 提供了一个信号处理程序不会涉足的红色区域。)
I have doubts, if this would prove to be faster for a vector length of 3 or 4 than presented by Mr Cordes, however for 16 or 8 bit data this method should prove to be worthwhile. Then of course one needs to perform 3 or 4 rounds respectively before the result can be acquired.