c++ - Why is the compiler optimizing these cases differently? -
the snippet of code found below demonstrates situation in calling crc32 compiler intrinsic on 7 bytes of data in 2 different ways (e.g. case0() & case1()) results in different compiler optimizations. these differences in compiler optimizations produce vastly different execution times (e.g. [test_construction, case: 0, bytes: 7]).
for reference, have included logic calling crc32 on 6 bytes of data in same fashion. however, can see generated output, resulting execution times not suffer same performance hit experienced when working 7 bytes of data.
generated output of single pass - 4 unique tests each data size of interest (6 & 7 bytes):
test_construction <case: 0, bytes: 7>: 139.5543 ms test_construction <case: 1, bytes: 7>: 38.6545 ms test_reference <case: 0, bytes: 7>: 26.2616 ms test_reference <case: 1, bytes: 7>: 38.8118 ms test_construction <case: 0, bytes: 6>: 26.2925 ms test_construction <case: 1, bytes: 6>: 29.5819 ms test_reference <case: 0, bytes: 6>: 25.3754 ms test_reference <case: 1, bytes: 6>: 28.7829 ms
i have 2 questions:
- why compiler producing different optimizations (e.g. in case of [test_construction, case: 0, bytes: 7]?
- it looks when [test_construction, case: 0, bytes: 7] translated machine code contains additional instructions move data stack registers , out on stack. not seem occur in other scenario. crc called once on data found within register , once on data on stack. why this?
- why performance dropping in first place?
- is due additional stack logic (memory operations) found in [test_construction, case: 0, bytes: 7] machine code?
- could order of operations contribute?
- is there way stop optimizer producing suboptimal machine code?
update 1 - 4/7/17:
- @1201programalarm, johnnycrash
- i want clarify optimize/reduce generated machine code. purposefully overlapped 4th byte in [case: 0, bytes: 7] in order call crc32_u32 twice avoid having make following 3 calls: crc32_u32 + crc32_u16 + crc32_u8.
- as follow suggestion, johnnycrash, attempted remove call memcpy in cfunc's constructor, in case data 7 bytes in size. see code directly below. however, had no effect on execution time.
.
template<int n> void memcpy(char* szdst, const char* szsrc) { memcpy(szdst, szsrc, n); } // tried both of these alternatives memcpy, no luck. template<> void memcpy<7>(char* szdst, const char* szsrc) { //as4(szdst) = as4(szsrc), as2(szdst+4) = as2(szsrc+4), as1(szdst+6) = as1(szsrc+6); as4(szdst) = as4(szsrc), as4(szdst+3) = as4(szsrc+3); }
environment details:
windows server 2012 r2 x64 intel xeon x5670
assembly reference:
------------------------------------------------------- test_construction <case: 0, bytes: 7>: 139.5543 ms ------------------------------------------------------- 00007ff62d7911cc call cbench::cbench (07ff62d791000h) 00007ff62d7911d1 xor r8d,r8d 00007ff62d7911d4 lea r10,[_a (07ff62d794630h)] 00007ff62d7911db mov r9d,1312d00h (int ipass = 0; ipass < passes; ++ipass) { int = ipass & (dimensionof(_a) - 1); 00007ff62d7911e1 mov rax,r8 00007ff62d7911e4 inc r8 00007ff62d7911e7 , eax,3ffh auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d7911ec lea rcx,[rax+rax*2] 00007ff62d7911f0 movzx eax,word ptr [r10+rcx*8+4] 00007ff62d7911f6 mov edx,dword ptr [r10+rcx*8] 00007ff62d7911fa mov word ptr [rsp+44h],ax 00007ff62d7911ff movzx eax,byte ptr [r10+rcx*8+6] 00007ff62d791205 mov byte ptr [rsp+46h],al ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d791209 mov eax,7 00007ff62d79120e crc32 eax,edx auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d791213 mov dword ptr [buf],edx ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d791217 crc32 eax,dword ptr [rsp+43h] 00007ff62d79121e add ebx,eax 00007ff62d791220 sub r9,1 00007ff62d791224 jne test_func<0,7,0>+71h (07ff62d7911e1h) } return ii; 00007ff62d791226 lea rcx,[bench] 00007ff62d79122b call cbench::~cbench (07ff62d791030h) ------------------------------------------------------- test_construction <case: 1, bytes: 7>: 38.6545 ms ------------------------------------------------------- 00007ff62d7912a9 call cbench::cbench (07ff62d791000h) 00007ff62d7912ae xor r8d,r8d 00007ff62d7912b1 lea r10,[_a (07ff62d794630h)] 00007ff62d7912b8 mov r9d,1312d00h 00007ff62d7912be xchg ax,ax (int ipass = 0; ipass < passes; ++ipass) { int = ipass & (dimensionof(_a) - 1); 00007ff62d7912c0 mov rax,r8 00007ff62d7912c3 inc r8 00007ff62d7912c6 , eax,3ffh auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d7912cb lea rcx,[rax+rax*2] ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d7912cf movzx eax,word ptr [r10+rcx*8+4] 00007ff62d7912d5 movzx edx,byte ptr [r10+rcx*8+6] 00007ff62d7912db shl rdx,10h 00007ff62d7912df or rdx,rax 00007ff62d7912e2 mov eax,dword ptr [r10+rcx*8] 00007ff62d7912e6 shl rdx,20h 00007ff62d7912ea or rdx,rax 00007ff62d7912ed mov eax,7 00007ff62d7912f2 crc32 rax,rdx 00007ff62d7912f8 add ebx,eax 00007ff62d7912fa sub r9,1 00007ff62d7912fe jne test_func<1,7,0>+70h (07ff62d7912c0h) } return ii; 00007ff62d791300 lea rcx,[bench] 00007ff62d791305 call cbench::~cbench (07ff62d791030h) ------------------------------------------------------- test_reference <case: 0, bytes: 7>: 26.2616 ms ------------------------------------------------------- 00007ff62d791386 call cbench::cbench (07ff62d791000h) 00007ff62d79138b xor edx,edx 00007ff62d79138d lea r9,[_a (07ff62d794630h)] 00007ff62d791394 mov r8d,1312d00h 00007ff62d79139a nop word ptr [rax+rax] (int ipass = 0; ipass < passes; ++ipass) { int = ipass & (dimensionof(_a) - 1); 00007ff62d7913a0 mov rax,rdx ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d7913a3 mov ecx,7 (int ipass = 0; ipass < passes; ++ipass) { int = ipass & (dimensionof(_a) - 1); 00007ff62d7913a8 , eax,3ffh 00007ff62d7913ad inc rdx auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d7913b0 lea rax,[rax+rax*2] ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d7913b4 crc32 ecx,dword ptr [r9+rax*8] 00007ff62d7913bb crc32 ecx,dword ptr [r9+rax*8+3] 00007ff62d7913c3 add ebx,ecx 00007ff62d7913c5 sub r8,1 00007ff62d7913c9 jne test_func<0,7,1>+70h (07ff62d7913a0h) } return ii; 00007ff62d7913cb lea rcx,[bench] 00007ff62d7913d0 call cbench::~cbench (07ff62d791030h) ------------------------------------------------------- test_reference <case: 1, bytes: 7>: 38.8118 ms ------------------------------------------------------- 00007ff62d791449 call cbench::cbench (07ff62d791000h) 00007ff62d79144e xor r8d,r8d 00007ff62d791451 lea r10,[_a (07ff62d794630h)] 00007ff62d791458 mov r9d,1312d00h 00007ff62d79145e xchg ax,ax (int ipass = 0; ipass < passes; ++ipass) { int = ipass & (dimensionof(_a) - 1); 00007ff62d791460 mov rax,r8 00007ff62d791463 inc r8 00007ff62d791466 , eax,3ffh auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d79146b lea rax,[rax+rax*2] ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d79146f movzx edx,byte ptr [r10+rax*8+6] 00007ff62d791475 lea rcx,[r10+rax*8] 00007ff62d791479 movzx eax,word ptr [r10+rax*8+4] 00007ff62d79147f shl rdx,10h 00007ff62d791483 or rdx,rax 00007ff62d791486 mov eax,dword ptr [rcx] 00007ff62d791488 shl rdx,20h 00007ff62d79148c or rdx,rax 00007ff62d79148f mov eax,7 00007ff62d791494 crc32 rax,rdx 00007ff62d79149a add ebx,eax 00007ff62d79149c sub r9,1 00007ff62d7914a0 jne test_func<1,7,1>+70h (07ff62d791460h) } return ii; 00007ff62d7914a2 lea rcx,[bench] 00007ff62d7914a7 call cbench::~cbench (07ff62d791030h) ------------------------------------------------------- test_construction <case: 0, bytes: 6>: 26.2925 ms ------------------------------------------------------- 00007ff62d791526 call cbench::cbench (07ff62d791000h) 00007ff62d79152b xor r8d,r8d 00007ff62d79152e lea r10,[_a (07ff62d794630h)] 00007ff62d791535 mov r9d,1312d00h 00007ff62d79153b nop dword ptr [rax+rax] (int ipass = 0; ipass < passes; ++ipass) { int = ipass & (dimensionof(_a) - 1); 00007ff62d791540 mov rax,r8 00007ff62d791543 inc r8 00007ff62d791546 , eax,3ffh auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d79154b lea rcx,[rax+rax*2] ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d79154f mov eax,6 00007ff62d791554 crc32 eax,dword ptr [r10+rcx*8] auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d79155b movzx edx,word ptr [r10+rcx*8+4] ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d791561 crc32 eax,dx 00007ff62d791567 add ebx,eax 00007ff62d791569 sub r9,1 00007ff62d79156d jne test_func<0,6,0>+70h (07ff62d791540h) } return ii; 00007ff62d79156f lea rcx,[bench] 00007ff62d791574 call cbench::~cbench (07ff62d791030h) ------------------------------------------------------- test_construction <case: 1, bytes: 6>: 29.5819 ms ------------------------------------------------------- 00007ff62d7915f9 call cbench::cbench (07ff62d791000h) 00007ff62d7915fe xor r8d,r8d 00007ff62d791601 lea r10,[_a (07ff62d794630h)] 00007ff62d791608 mov r9d,1312d00h 00007ff62d79160e xchg ax,ax (int ipass = 0; ipass < passes; ++ipass) { int = ipass & (dimensionof(_a) - 1); 00007ff62d791610 mov rax,r8 00007ff62d791613 inc r8 00007ff62d791616 , eax,3ffh auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d79161b lea rcx,[rax+rax*2] ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d79161f mov eax,dword ptr [r10+rcx*8] 00007ff62d791623 movzx edx,word ptr [r10+rcx*8+4] 00007ff62d791629 shl rdx,20h 00007ff62d79162d or rdx,rax ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d791630 mov eax,6 00007ff62d791635 crc32 rax,rdx 00007ff62d79163b add ebx,eax 00007ff62d79163d sub r9,1 00007ff62d791641 jne test_func<1,6,0>+70h (07ff62d791610h) } return ii; 00007ff62d791643 lea rcx,[bench] 00007ff62d791648 call cbench::~cbench (07ff62d791030h) ------------------------------------------------------- test_reference <case: 0, bytes: 6>: 25.3754 ms ------------------------------------------------------- 00007ff62d7916c6 call cbench::cbench (07ff62d791000h) 00007ff62d7916cb xor edx,edx 00007ff62d7916cd lea r9,[_a (07ff62d794630h)] 00007ff62d7916d4 mov r8d,1312d00h 00007ff62d7916da nop word ptr [rax+rax] (int ipass = 0; ipass < passes; ++ipass) { int = ipass & (dimensionof(_a) - 1); 00007ff62d7916e0 mov rax,rdx ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d7916e3 mov ecx,6 (int ipass = 0; ipass < passes; ++ipass) { int = ipass & (dimensionof(_a) - 1); 00007ff62d7916e8 , eax,3ffh 00007ff62d7916ed inc rdx auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d7916f0 lea rax,[rax+rax*2] ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d7916f4 crc32 ecx,dword ptr [r9+rax*8] ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d7916fb crc32 ecx,word ptr [r9+rax*8+4] 00007ff62d791704 add ebx,ecx 00007ff62d791706 sub r8,1 00007ff62d79170a jne test_func<0,6,1>+70h (07ff62d7916e0h) } return ii; 00007ff62d79170c lea rcx,[bench] 00007ff62d791711 call cbench::~cbench (07ff62d791030h) ------------------------------------------------------- test_reference <case: 1, bytes: 6>: 28.7829 ms ------------------------------------------------------- 00007ff62d791799 call cbench::cbench (07ff62d791000h) 00007ff62d79179e xor edx,edx 00007ff62d7917a0 lea r9,[_a (07ff62d794630h)] 00007ff62d7917a7 mov r8d,1312d00h 00007ff62d7917ad nop dword ptr [rax] (int ipass = 0; ipass < passes; ++ipass) { int = ipass & (dimensionof(_a) - 1); 00007ff62d7917b0 mov rax,rdx 00007ff62d7917b3 inc rdx 00007ff62d7917b6 , eax,3ffh auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d7917bb lea rax,[rax+rax*2] ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d7917bf movzx ecx,word ptr [r9+rax*8+4] 00007ff62d7917c5 mov eax,dword ptr [r9+rax*8] 00007ff62d7917c9 shl rcx,20h 00007ff62d7917cd or rcx,rax 00007ff62d7917d0 mov eax,6 00007ff62d7917d5 crc32 rax,rcx 00007ff62d7917db add ebx,eax 00007ff62d7917dd sub r8,1 00007ff62d7917e1 jne test_func<1,6,1>+70h (07ff62d7917b0h) } return ii; 00007ff62d7917e3 lea rcx,[bench] 00007ff62d7917e8 call cbench::~cbench (07ff62d791030h)
source code:
#include <windows.h> #include "new" #include <cstdio> #include <intrin.h> #define dimensionof(x) (sizeof(x)/sizeof(*(x))) #define inl __forceinline #define noinl __declspec(noinline) #define passes 20000000 #define as1(a_) (*(u1*)(a_)) #define as2(a_) (*(u2*)(a_)) #define as3(a_) ((u4(as1((char*)(a_) + 2))<<16) | as2(a_)) #define as4(a_) (*(u4*)(a_)) #define as6(a_) ((u8(as2((char*)(a_) + 4))<<32) | as4(a_)) #define as7(a_) ((u8(as3((char*)(a_) + 4))<<32) | as4(a_)) typedef unsigned char u1; typedef unsigned short u2; typedef unsigned int u4; typedef unsigned long long u8; typedef char tdata[24]; tdata _a[0x400]; // cbench benchmarking code class cbench { __int64 m_nstart; const char* m_desc; public: // no inline declared // reasoning: simplifies assembly code. // easier see how optimizer optimizes different variations of algorithm. noinl cbench(const char *szdesc) : m_desc(szdesc), m_nstart(getbenchmark()) { } noinl ~cbench() { __int64 cpufreq, deltatime(getbenchmark() - m_nstart); queryperformancefrequency((large_integer*) &cpufreq); double exectimeinms = ((double) deltatime * 1000) / cpufreq; printf("%s:\t%10.4f ms\n", m_desc, exectimeinms); } noinl static __int64 getbenchmark(void) { __int64 nbenchmark; queryperformancecounter((large_integer*) &nbenchmark); return nbenchmark; } }; // cfunc executes crc32 intrinsics on 6 & 7 bytes in 2 different ways template <int n> struct cfunc { char m_ach[n]; inl cfunc(const char* sz) { memcpy(m_ach, sz, n); } inl u4 case0() { return (n == 7) ? _mm_crc32_u32(_mm_crc32_u32(n, as4(m_ach)), as4(m_ach + 3)) : _mm_crc32_u16(_mm_crc32_u32(n, as4(m_ach)), as2(m_ach + 4)); } inl u4 case1() { return (n == 7) ? (u4) _mm_crc32_u64(n, as7(m_ach)) : (u4) _mm_crc32_u64(n, as6(m_ach)); } }; // evaluates performance dependent on: // - case : crc procedure // - n : number of bytes // - useref : true, reference pre-existing cfunc object // false, constructing new cfunc object template<u4 case, int n, bool useref> noinl int test_func(int ii) { char szdesc[64], buf[64]; (useref) ? sprintf(szdesc, "%-18s<case: %d, bytes: %d>", "test_reference", case, n) : sprintf(szdesc, "%-18s<case: %d, bytes: %d>", "test_construction", case, n); cbench bench(szdesc); (int ipass = 0; ipass < passes; ++ipass) { int = ipass & (dimensionof(_a) - 1); auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); ii += (case == 1) ? x.case1() : x.case0(); } return ii; } int main(int argc, char* argv[]) { (int = 0; < 10; ++i) { printf("\n>>>>\tpass %d:\n", i); // execute crc on 7 bytes // construct cfunc object argc = test_func<0, 7, false>(argc); argc = test_func<1, 7, false>(argc); // reference pre-existing cfunc object argc = test_func<0, 7, true>(argc); argc = test_func<1, 7, true>(argc); // execute crc on 6 bytes // construct cfunc object argc = test_func<0, 6, false>(argc); argc = test_func<1, 6, false>(argc); // reference pre-existing cfunc object argc = test_func<0, 6, true>(argc); argc = test_func<1, 6, true>(argc); } printf("\n\ndone\n"); return argc; }
the operations compiler uses copy data 7 byte buffer populate registers differently crc32 call(s) require(s). compiler has go stack registers needed crc32 call. there no combination of 1,2,4 byte reads , writes doesn't require full write stack. when copied 7 bytes 8 byte buffer, duplicating middle byte second unaligned 4 byte mov, compiler saw 2 registers populated crc32 calls , eliminated stack read/write.
125.997 ms: use memcpy, aligned copying, , unaligned crc32:
memcpy(buf, _a[i], 7); ii += _mm_crc32_u32(_mm_crc32_u32(0, as4(buf)), as4(buf + 3)); movzx eax,word ptr [_a[i]+4] mov edx,dword ptr [_a[i]] mov word ptr [buf+4],ax movzx eax,byte ptr [_a[i]+6] mov byte ptr [buf+6],al xor eax,eax crc32 eax,edx mov dword ptr [buf],edx crc32 eax,dword ptr [buf+3]
the first call crc32 can use register edx copy, second call has no register ready. needs result of dword, word, , byte movs buf. on top of suspect compiler sees bunch of aliasing going on here , gets conservative. compiler has no choice build buf on stack , access it.
137.044 ms: memcpy<7>, unaligned overlapped copy 7 char buf, suffers same problem. registers involved in copy step not registers needed crc32 step. has bit more unaligned access, slows down bit:
as4(buf) = as4(_a[i]), as4(buf + 3) = as4(_a[i] + 3); ii += _mm_crc32_u32(_mm_crc32_u32(0, as4(buf)), as4(buf + 3)); mov eax,dword ptr [_a[i]] mov ecx,dword ptr [_a[i]+3] mov dword ptr [buf],eax xor eax,eax mov dword ptr [buf+3],ecx crc32 eax,dword ptr [buf] crc32 eax,ecx
16.733 ms: unaligned overlapped access source not overlapped 8 byte dest buf, sees massive improvement! in case, copy middle byte twice, never alias dwords in buf. if _a[i] = "1234567", buf "12344567":
as4(buf) = as4(_a[i]), as4(buf + 4) = as4(_a[i] + 3); ii += _mm_crc32_u32(_mm_crc32_u32(0, as4(buf)), as4(buf + 4)); xor eax,eax crc32 eax,dword ptr [_a[i]] crc32 eax,dword ptr [_a[i]+3]
the call copy first dword buf , call copy second dword buf + 4 use 2 separate registers, can passed crc32 directly, no need use buf. optimizer on subsequent pass notices unused data moved stack , removes related operations.
121.500 ms: tried 64 bit crc on 8 char buf built same way above , lost big. compiler not using single 8 byte register move buf.
as4(buf) = as4(_a[i]), as4(buf + 4) = as4(_a[i] + 3); ii += _mm_crc32_u64(0, as8(buf)); mov eax,dword ptr [_a[i]] mov dword ptr [buf],eax mov eax,dword ptr [_a[i]+3] mov dword ptr [buf+3],eax xor eax,eax crc32 rax,qword ptr [buf]
20.799 ms: changed move buf 8 bytes instead of 2 x 4 bytes. stopped using stack, still underperformed 3rd method above:
as8(buf) = as4(_a[i]) | ((u8)as4(_a[i] + 3) << 32); ii += _mm_crc32_u64(0, as8(buf)); mov ecx,dword ptr [_a[i]+3] mov eax,dword ptr [_a[i]] shl rcx,20h or rcx,rax xor eax,eax crc32 rax,rcx
1 took: 125.997 ms 2 took: 137.044 ms 3 took: 16.733 ms 4 took: 121.500 ms 5 took: 20.799 ms
Comments
Post a Comment