for (int l = 0; l < loops;l++) {
for (int l = 0; l != loops;l++) {
如果你运行它(Windows 10、Visual Studio 2017、Release 配置),你会发现第一个比第二个快两倍以上。
/*
 * Benchmark kernel whose loop conditions use '<'.
 *
 * Performs `loops` outer passes of `iterations` inner steps. Every inner
 * step increments a running counter and adds the new counter value to the
 * accumulator, which is returned at the end. A non-positive `loops` or
 * `iterations` makes the corresponding loop body run zero times.
 *
 * NOTE: both the counter and the accumulator are plain int, so very large
 * loops * iterations products overflow them (signed overflow is undefined
 * behavior in C).
 */
int forloop_inf(int loops, int iterations)
{
    int counter = 0;
    int total = counter;

    for (int outer = 0; outer < loops; outer++) {
        for (int inner = 0; inner < iterations; inner++) {
            total += ++counter;
        }
    }

    return total;
}
/*
 * Benchmark kernel whose loop conditions use '!='.
 *
 * Performs `loops` outer passes of `iterations` inner steps. Every inner
 * step increments a running counter and adds the new counter value to the
 * accumulator, which is returned at the end.
 *
 * Because the loops stop only when the index is exactly equal to the bound,
 * callers must pass non-negative counts: an index counting up from zero
 * never equals a negative bound.
 *
 * NOTE: both the counter and the accumulator are plain int, so very large
 * loops * iterations products overflow them (signed overflow is undefined
 * behavior in C).
 */
int forloop_diff(int loops, int iterations)
{
    int counter = 0;
    int total = counter;

    for (int outer = 0; outer != loops; outer++) {
        for (int inner = 0; inner != iterations; inner++) {
            total += ++counter;
        }
    }

    return total;
}
/* Time the '<' variant; print the elapsed seconds, then the value it computed. */
printf("for loop inf %f\n", monitor_int(loops, iterations, forloop_inf, &result));
printf("%d\n", result);
/*
 * Same measurement for the '!=' variant. The %f conversion needs the double
 * returned by monitor_int(); passing the function itself is a format/argument
 * mismatch.
 */
printf("for loop diff %f\n", monitor_int(loops, iterations, forloop_diff, &result));
printf("%d\n", result);
其中 loops = 10 * 1000,iterations = 1000 * 1000。
/*
 * Times one call of func(loops, iterations) using clock().
 *
 * The value returned by `func` is stored in *result. The return value is
 * the difference between the clock() readings taken immediately before and
 * after the call, converted to seconds via CLOCKS_PER_SEC.
 */
double monitor_int(int loops, int iterations, int (*func)(int, int), int *result)
{
    const clock_t begin = clock();
    *result = func(loops, iterations);
    const clock_t end = clock();

    const clock_t ticks = end - begin;
    return (double)ticks / CLOCKS_PER_SEC;
}
for loop inf 2.227 seconds for loop diff 4.558 seconds
相应的反汇编在这里(使用 dumpbin /DISASM CPerf2.exe 获得)。
> '<' 0.031010(507 次运行的平均值)
我不知道如何在Visual Studio中设置O3,编译命令行如下:
/permissive- /Yu"stdafx.h" /GS /GL /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /sdl /Fd"x64\Release\vc141.pdb" /Zc:inline /fp:precise /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /Gd /Oi /MD /FC /Fa"x64\Release\" /EHsc /nologo /Fo"x64\Release\" /Ot /Fp"x64\Release\CPerf2.pch" /diagnostics:classic
/* Signature shared by the two benchmark kernels: (loops, iterations) -> int. */
typedef int(loop_signature)(int, int);

/*
 * Runs the '!=' and '<' kernels in random order and prints the mean time of
 * each.
 *
 * On every one of the n_rand rounds, rand() % 2 selects a kernel, the kernel
 * is timed once, and the time is added to that kernel's running sum. The
 * final line prints, per kernel, the mean time followed by the number of
 * rounds in which it was picked.
 */
void loops_compare() {
    int loops = 1 * 100;
    int iterations = 1000 * 1000;
    int result;

    /* Index 0 is the '!=' kernel and index 1 the '<' kernel; the printf below relies on this order. */
    loop_signature *functions[2] = { forloop_diff, forloop_inf };

    int n_rand = 1000;
    int n[2] = { 0, 0 };          /* rounds run per kernel */
    double cum[2] = { 0.0, 0.0 }; /* summed time per kernel */

    for (int i = 0; i < n_rand; i++) {
        int pick = rand() % 2;
        loop_signature *fun = functions[pick];

        /* Pass `iterations` through as well, so the kernels run the configured inner count. */
        double elapsed = monitor_int(loops, iterations, fun, &result);
        n[pick]++;
        cum[pick] += elapsed;
    }

    printf("'!=' %f (%d) / '<' %f (%d)\n",
           cum[0] / (double)n[0], n[0],
           cum[1] / (double)n[1], n[1]);
}
?forloop_inf@@YAHHH@Z: 0000000140001000: 48 83 EC 08 sub rsp,8 0000000140001004: 45 33 C0 xor r8d,r8d 0000000140001007: 45 33 D2 xor r10d,r10d 000000014000100A: 44 8B DA mov r11d,edx 000000014000100D: 85 C9 test ecx,ecx 000000014000100F: 7E 6F jle 0000000140001080 0000000140001011: 48 89 1C 24 mov qword ptr [rsp],rbx 0000000140001015: 8B D9 mov ebx,ecx 0000000140001017: 66 0F 1F 84 00 00 nop word ptr [rax+rax] 00 00 00 0000000140001020: 45 33 C9 xor r9d,r9d 0000000140001023: 33 D2 xor edx,edx 0000000140001025: 33 C0 xor eax,eax 0000000140001027: 41 83 FB 02 cmp r11d,2 000000014000102B: 7C 29 jl 0000000140001056 000000014000102D: 41 8D 43 FE lea eax,[r11-2] 0000000140001031: D1 E8 shr eax,1 0000000140001033: FF C0 inc eax 0000000140001035: 8B C8 mov ecx,eax 0000000140001037: 03 C0 add eax,eax 0000000140001039: 0F 1F 80 00 00 00 nop dword ptr [rax] 00 0000000140001040: 41 FF C1 inc r9d 0000000140001043: 83 C2 02 add edx,2 0000000140001046: 45 03 C8 add r9d,r8d 0000000140001049: 41 03 D0 add edx,r8d 000000014000104C: 41 83 C0 02 add r8d,2 0000000140001050: 48 83 E9 01 sub rcx,1 0000000140001054: 75 EA jne 0000000140001040 0000000140001056: 41 3B C3 cmp eax,r11d 0000000140001059: 7D 06 jge 0000000140001061 000000014000105B: 41 FF C2 inc r10d 000000014000105E: 45 03 D0 add r10d,r8d 0000000140001061: 42 8D 0C 0A lea ecx,[rdx+r9] 0000000140001065: 44 03 D1 add r10d,ecx 0000000140001068: 41 8D 48 01 lea ecx,[r8+1] 000000014000106C: 41 3B C3 cmp eax,r11d 000000014000106F: 41 0F 4D C8 cmovge ecx,r8d 0000000140001073: 44 8B C1 mov r8d,ecx 0000000140001076: 48 83 EB 01 sub rbx,1 000000014000107A: 75 A4 jne 0000000140001020 000000014000107C: 48 8B 1C 24 mov rbx,qword ptr [rsp] 0000000140001080: 41 8B C2 mov eax,r10d 0000000140001083: 48 83 C4 08 add rsp,8 0000000140001087: C3 ret 0000000140001088: CC CC CC CC CC CC CC CC ÌÌÌÌÌÌÌÌ ?forloop_diff@@YAHHH@Z: 0000000140001090: 45 33 C0 xor r8d,r8d 0000000140001093: 41 8B C0 mov eax,r8d 0000000140001096: 85 C9 test ecx,ecx 
0000000140001098: 74 28 je 00000001400010C2 000000014000109A: 44 8B C9 mov r9d,ecx 000000014000109D: 0F 1F 00 nop dword ptr [rax] 00000001400010A0: 85 D2 test edx,edx 00000001400010A2: 74 18 je 00000001400010BC 00000001400010A4: 8B CA mov ecx,edx 00000001400010A6: 66 66 0F 1F 84 00 nop word ptr [rax+rax] 00 00 00 00 00000001400010B0: 41 FF C0 inc r8d 00000001400010B3: 41 03 C0 add eax,r8d 00000001400010B6: 48 83 E9 01 sub rcx,1 00000001400010BA: 75 F4 jne 00000001400010B0 00000001400010BC: 49 83 E9 01 sub r9,1 00000001400010C0: 75 DE jne 00000001400010A0 00000001400010C2: C3 ret 00000001400010C3: CC CC CC CC CC CC CC CC CC CC CC CC CC ÌÌÌÌÌÌÌÌÌÌÌÌÌ
> 那么,既然会出现这样的差异,今后该如何对自己写的代码有信心呢?(假设我没有在某处犯错)
typedef int(signature)(int,int); ... int main() { int loops,runs; fprintf(stderr,"Loops: "); scanf("%d",&loops); fprintf(stderr,"Iterations: "); scanf("%d",&iterations); fprintf(stderr,"Runs: "); scanf("%d",&runs); fprintf(stderr,"Running for %d loops and %d iterations %d times.\n",loops,runs); signature *functions[2] = { forloop_inf,forloop_diff }; int result = functions[0](loops,iterations); for( int i = 0; i < runs; i++ ) { int pick = rand() % 2; signature *function = functions[pick]; int new_result; printf("%d %f\n",pick,function,&new_result)); if( result != new_result ) { fprintf(stderr,"got %d expected %d\n",new_result,result); } } }
我的测试结果表明,使用 Apple LLVM 8.0.0(clang-800.0.42.1)并以 -O2 编译时,在 10000 次循环、1000000 次迭代的条件下,forloop_inf 确实比 forloop_diff 快约 50%。
forloop_inf: 0.000009 forloop_diff: 0.000014
查看用 clang -O2 -S -mllvm --x86-asm-syntax=intel test.c 生成的 -O2 汇编代码,我可以看到两种实现之间有许多差异。也许懂汇编的人可以告诉我们原因。
forloop_inf: 0.000002 forloop_diff: 0.000002
这是因为在 -O3 下,两者生成的汇编几乎完全相同:一个使用 je,另一个使用 jl。
i < max 比 i != max 更安全,因为即使 i 以某种方式跳过了 max,循环仍然会终止。 如上所示,开启优化后两者都非常快;即使没有完全优化,它们也能在 0.000009 秒内完成 10,000,000 次迭代。i < max 或 i != max 都不太可能成为性能瓶颈,真正的瓶颈更可能是你在循环中重复执行 100 亿次的那些操作。 但是 i != max 可能会导致 bug。