假设a1
、b1
、c1
和d1
指向堆内存,我的数字代码有以下核心循环。
const int n = 100000;
for (int j = 0; j < n; j++) {a1[j] += b1[j];c1[j] += d1[j];}
这个循环通过另一个外部for
循环执行10,000次。为了加快速度,我将代码更改为:
for (int j = 0; j < n; j++) {a1[j] += b1[j];}
for (int j = 0; j < n; j++) {c1[j] += d1[j];}
在Microsoft VisualC++10.0上编译,完全优化,在英特尔酷睿2 Duo(x64)上为32位启用SSE2,第一个示例需要5.5秒,双循环示例仅需要1.9秒。
第一个循环的反汇编基本上看起来像这样(这个块在整个程序中重复了大约五次):
movsd xmm0,mmword ptr [edx+18h]addsd xmm0,mmword ptr [ecx+20h]movsd mmword ptr [ecx+20h],xmm0movsd xmm0,mmword ptr [esi+10h]addsd xmm0,mmword ptr [eax+30h]movsd mmword ptr [eax+30h],xmm0movsd xmm0,mmword ptr [edx+20h]addsd xmm0,mmword ptr [ecx+28h]movsd mmword ptr [ecx+28h],xmm0movsd xmm0,mmword ptr [esi+18h]addsd xmm0,mmword ptr [eax+38h]
双循环示例的每个循环都会生成以下代码(以下块重复大约三次):
addsd xmm0,mmword ptr [eax+28h]movsd mmword ptr [eax+28h],xmm0movsd xmm0,mmword ptr [ecx+20h]addsd xmm0,mmword ptr [eax+30h]movsd mmword ptr [eax+30h],xmm0movsd xmm0,mmword ptr [ecx+28h]addsd xmm0,mmword ptr [eax+38h]movsd mmword ptr [eax+38h],xmm0movsd xmm0,mmword ptr [ecx+30h]addsd xmm0,mmword ptr [eax+40h]movsd mmword ptr [eax+40h],xmm0
这个问题被证明是无关紧要的,因为行为严重依赖于数组(n)和CPU缓存的大小。因此,如果有进一步的兴趣,我重新表述这个问题:
您能否提供一些可靠的见解,以了解导致不同缓存行为的细节,如下图所示的五个区域?
通过为这些CPU提供类似的图表,指出CPU/缓存架构之间的差异可能也很有趣。
这是完整的代码。它使用TBBTick_Count
进行更高分辨率的计时,可以通过不定义TBB_TIMING
宏来禁用:
#include <iostream>#include <iomanip>#include <cmath>#include <string>
//#define TBB_TIMING
#ifdef TBB_TIMING#include <tbb/tick_count.h>using tbb::tick_count;#else#include <time.h>#endif
using namespace std;
//#define preallocate_memory new_cont
enum { new_cont, new_sep };
double *a1, *b1, *c1, *d1;
void allo(int cont, int n){switch(cont) {case new_cont:a1 = new double[n*4];b1 = a1 + n;c1 = b1 + n;d1 = c1 + n;break;case new_sep:a1 = new double[n];b1 = new double[n];c1 = new double[n];d1 = new double[n];break;}
for (int i = 0; i < n; i++) {a1[i] = 1.0;d1[i] = 1.0;c1[i] = 1.0;b1[i] = 1.0;}}
void ff(int cont){switch(cont){case new_sep:delete[] b1;delete[] c1;delete[] d1;case new_cont:delete[] a1;}}
double plain(int n, int m, int cont, int loops){#ifndef preallocate_memoryallo(cont,n);#endif
#ifdef TBB_TIMINGtick_count t0 = tick_count::now();#elseclock_t start = clock();#endif
if (loops == 1) {for (int i = 0; i < m; i++) {for (int j = 0; j < n; j++){a1[j] += b1[j];c1[j] += d1[j];}}} else {for (int i = 0; i < m; i++) {for (int j = 0; j < n; j++) {a1[j] += b1[j];}for (int j = 0; j < n; j++) {c1[j] += d1[j];}}}double ret;
#ifdef TBB_TIMINGtick_count t1 = tick_count::now();ret = 2.0*double(n)*double(m)/(t1-t0).seconds();#elseclock_t end = clock();ret = 2.0*double(n)*double(m)/(double)(end - start) *double(CLOCKS_PER_SEC);#endif
#ifndef preallocate_memoryff(cont);#endif
return ret;}
void main(){freopen("C:\\test.csv", "w", stdout);
char *s = " ";
string na[2] ={"new_cont", "new_sep"};
cout << "n";
for (int j = 0; j < 2; j++)for (int i = 1; i <= 2; i++)#ifdef preallocate_memorycout << s << i << "_loops_" << na[preallocate_memory];#elsecout << s << i << "_loops_" << na[j];#endif
cout << endl;
long long nmax = 1000000;
#ifdef preallocate_memoryallo(preallocate_memory, nmax);#endif
for (long long n = 1L; n < nmax; n = max(n+1, long long(n*1.2))){const long long m = 10000000/n;cout << n;
for (int j = 0; j < 2; j++)for (int i = 1; i <= 2; i++)cout << s << plain(n, m, j, i);cout << endl;}}
它显示了n
的不同值的FLOP/s。