c++ - Why is the compiler optimizing these cases differently? -

the snippet of code found below demonstrates situation in calling crc32 compiler intrinsic on 7 bytes of data in 2 different ways (e.g. case0() & case1()) results in different compiler optimizations. these differences in compiler optimizations produce vastly different execution times (e.g. [test_construction, case: 0, bytes: 7]).

for reference, have included logic calling crc32 on 6 bytes of data in same fashion. however, can see generated output, resulting execution times not suffer same performance hit experienced when working 7 bytes of data.

generated output of single pass - 4 unique tests each data size of interest (6 & 7 bytes):

test_construction <case: 0, bytes: 7>:    139.5543 ms test_construction <case: 1, bytes: 7>:     38.6545 ms test_reference    <case: 0, bytes: 7>:     26.2616 ms test_reference    <case: 1, bytes: 7>:     38.8118 ms test_construction <case: 0, bytes: 6>:     26.2925 ms test_construction <case: 1, bytes: 6>:     29.5819 ms test_reference    <case: 0, bytes: 6>:     25.3754 ms test_reference    <case: 1, bytes: 6>:     28.7829 ms

i have 2 questions:

why compiler producing different optimizations (e.g. in case of [test_construction, case: 0, bytes: 7]?

it looks when [test_construction, case: 0, bytes: 7] translated machine code contains additional instructions move data stack registers , out on stack. not seem occur in other scenario. crc called once on data found within register , once on data on stack. why this?

why performance dropping in first place?

is due additional stack logic (memory operations) found in [test_construction, case: 0, bytes: 7] machine code?

could order of operations contribute?

is there way stop optimizer producing suboptimal machine code?

update 1 - 4/7/17:

@1201programalarm, johnnycrash
- i want clarify optimize/reduce generated machine code. purposefully overlapped 4th byte in [case: 0, bytes: 7] in order call crc32_u32 twice avoid having make following 3 calls: crc32_u32 + crc32_u16 + crc32_u8.
- as follow suggestion, johnnycrash, attempted remove call memcpy in cfunc's constructor, in case data 7 bytes in size. see code directly below. however, had no effect on execution time.

template<int n> void memcpy(char* szdst, const char* szsrc) {     memcpy(szdst, szsrc, n); }  // tried both of these alternatives memcpy, no luck. template<> void memcpy<7>(char* szdst, const char* szsrc) {     //as4(szdst) = as4(szsrc), as2(szdst+4) = as2(szsrc+4),  as1(szdst+6) = as1(szsrc+6);     as4(szdst) = as4(szsrc), as4(szdst+3) = as4(szsrc+3); }

environment details:

windows server 2012 r2 x64 intel xeon x5670

assembly reference:

------------------------------------------------------- test_construction <case: 0, bytes: 7>:    139.5543 ms ------------------------------------------------------- 00007ff62d7911cc  call        cbench::cbench (07ff62d791000h)   00007ff62d7911d1  xor         r8d,r8d   00007ff62d7911d4  lea         r10,[_a (07ff62d794630h)]   00007ff62d7911db  mov         r9d,1312d00h                   (int ipass = 0; ipass < passes; ++ipass) {                                 int = ipass & (dimensionof(_a) - 1); 00007ff62d7911e1  mov         rax,r8   00007ff62d7911e4  inc         r8   00007ff62d7911e7  ,         eax,3ffh                                   auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d7911ec  lea         rcx,[rax+rax*2]   00007ff62d7911f0  movzx       eax,word ptr [r10+rcx*8+4]   00007ff62d7911f6  mov         edx,dword ptr [r10+rcx*8]   00007ff62d7911fa  mov         word ptr [rsp+44h],ax   00007ff62d7911ff  movzx       eax,byte ptr [r10+rcx*8+6]   00007ff62d791205  mov         byte ptr [rsp+46h],al                                   ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d791209  mov         eax,7   00007ff62d79120e  crc32       eax,edx                                   auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d791213  mov         dword ptr [buf],edx                                   ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d791217  crc32       eax,dword ptr [rsp+43h]   00007ff62d79121e  add         ebx,eax   00007ff62d791220  sub         r9,1   00007ff62d791224  jne         test_func<0,7,0>+71h (07ff62d7911e1h)                   }                 return ii; 00007ff62d791226  lea         rcx,[bench]   00007ff62d79122b  call        cbench::~cbench (07ff62d791030h)   ------------------------------------------------------- test_construction <case: 1, bytes: 7>:     38.6545 ms ------------------------------------------------------- 00007ff62d7912a9  call        cbench::cbench (07ff62d791000h)   00007ff62d7912ae  xor         r8d,r8d   00007ff62d7912b1  lea         r10,[_a (07ff62d794630h)]   00007ff62d7912b8  mov         r9d,1312d00h   00007ff62d7912be  xchg        ax,ax                   (int ipass = 0; ipass < passes; ++ipass) {                                 int = ipass & (dimensionof(_a) - 1); 00007ff62d7912c0  mov         rax,r8   00007ff62d7912c3  inc         r8   00007ff62d7912c6  ,         eax,3ffh                                   auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d7912cb  lea         rcx,[rax+rax*2]                                   ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d7912cf  movzx       eax,word ptr [r10+rcx*8+4]   00007ff62d7912d5  movzx       edx,byte ptr [r10+rcx*8+6]   00007ff62d7912db  shl         rdx,10h   00007ff62d7912df  or          rdx,rax   00007ff62d7912e2  mov         eax,dword ptr [r10+rcx*8]   00007ff62d7912e6  shl         rdx,20h   00007ff62d7912ea  or          rdx,rax   00007ff62d7912ed  mov         eax,7   00007ff62d7912f2  crc32       rax,rdx   00007ff62d7912f8  add         ebx,eax   00007ff62d7912fa  sub         r9,1   00007ff62d7912fe  jne         test_func<1,7,0>+70h (07ff62d7912c0h)                   }                 return ii; 00007ff62d791300  lea         rcx,[bench]   00007ff62d791305  call        cbench::~cbench (07ff62d791030h)   ------------------------------------------------------- test_reference    <case: 0, bytes: 7>:     26.2616 ms ------------------------------------------------------- 00007ff62d791386  call        cbench::cbench (07ff62d791000h)   00007ff62d79138b  xor         edx,edx   00007ff62d79138d  lea         r9,[_a (07ff62d794630h)]   00007ff62d791394  mov         r8d,1312d00h   00007ff62d79139a  nop         word ptr [rax+rax]                   (int ipass = 0; ipass < passes; ++ipass) {                                 int = ipass & (dimensionof(_a) - 1); 00007ff62d7913a0  mov         rax,rdx                                   ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d7913a3  mov         ecx,7                   (int ipass = 0; ipass < passes; ++ipass) {                                 int = ipass & (dimensionof(_a) - 1); 00007ff62d7913a8  ,         eax,3ffh   00007ff62d7913ad  inc         rdx                                   auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d7913b0  lea         rax,[rax+rax*2]                                   ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d7913b4  crc32       ecx,dword ptr [r9+rax*8]   00007ff62d7913bb  crc32       ecx,dword ptr [r9+rax*8+3]   00007ff62d7913c3  add         ebx,ecx   00007ff62d7913c5  sub         r8,1   00007ff62d7913c9  jne         test_func<0,7,1>+70h (07ff62d7913a0h)                   }                 return ii; 00007ff62d7913cb  lea         rcx,[bench]   00007ff62d7913d0  call        cbench::~cbench (07ff62d791030h)     -------------------------------------------------------     test_reference    <case: 1, bytes: 7>:     38.8118 ms ------------------------------------------------------- 00007ff62d791449  call        cbench::cbench (07ff62d791000h)   00007ff62d79144e  xor         r8d,r8d   00007ff62d791451  lea         r10,[_a (07ff62d794630h)]   00007ff62d791458  mov         r9d,1312d00h   00007ff62d79145e  xchg        ax,ax                   (int ipass = 0; ipass < passes; ++ipass) {                                 int = ipass & (dimensionof(_a) - 1); 00007ff62d791460  mov         rax,r8   00007ff62d791463  inc         r8   00007ff62d791466  ,         eax,3ffh                                   auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d79146b  lea         rax,[rax+rax*2]                                   ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d79146f  movzx       edx,byte ptr [r10+rax*8+6]   00007ff62d791475  lea         rcx,[r10+rax*8]   00007ff62d791479  movzx       eax,word ptr [r10+rax*8+4]   00007ff62d79147f  shl         rdx,10h   00007ff62d791483  or          rdx,rax   00007ff62d791486  mov         eax,dword ptr [rcx]   00007ff62d791488  shl         rdx,20h   00007ff62d79148c  or          rdx,rax   00007ff62d79148f  mov         eax,7   00007ff62d791494  crc32       rax,rdx   00007ff62d79149a  add         ebx,eax   00007ff62d79149c  sub         r9,1   00007ff62d7914a0  jne         test_func<1,7,1>+70h (07ff62d791460h)                   }                 return ii; 00007ff62d7914a2  lea         rcx,[bench]   00007ff62d7914a7  call        cbench::~cbench (07ff62d791030h)    ------------------------------------------------------- test_construction <case: 0, bytes: 6>:     26.2925 ms ------------------------------------------------------- 00007ff62d791526  call        cbench::cbench (07ff62d791000h)   00007ff62d79152b  xor         r8d,r8d   00007ff62d79152e  lea         r10,[_a (07ff62d794630h)]   00007ff62d791535  mov         r9d,1312d00h   00007ff62d79153b  nop         dword ptr [rax+rax]                   (int ipass = 0; ipass < passes; ++ipass) {                                 int = ipass & (dimensionof(_a) - 1); 00007ff62d791540  mov         rax,r8   00007ff62d791543  inc         r8   00007ff62d791546  ,         eax,3ffh                                   auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d79154b  lea         rcx,[rax+rax*2]                                   ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d79154f  mov         eax,6   00007ff62d791554  crc32       eax,dword ptr [r10+rcx*8]                                   auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d79155b  movzx       edx,word ptr [r10+rcx*8+4]                                   ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d791561  crc32       eax,dx   00007ff62d791567  add         ebx,eax   00007ff62d791569  sub         r9,1   00007ff62d79156d  jne         test_func<0,6,0>+70h (07ff62d791540h)                   }                 return ii; 00007ff62d79156f  lea         rcx,[bench]   00007ff62d791574  call        cbench::~cbench (07ff62d791030h)   ------------------------------------------------------- test_construction <case: 1, bytes: 6>:     29.5819 ms ------------------------------------------------------- 00007ff62d7915f9  call        cbench::cbench (07ff62d791000h)   00007ff62d7915fe  xor         r8d,r8d   00007ff62d791601  lea         r10,[_a (07ff62d794630h)]   00007ff62d791608  mov         r9d,1312d00h   00007ff62d79160e  xchg        ax,ax                   (int ipass = 0; ipass < passes; ++ipass) {                                 int = ipass & (dimensionof(_a) - 1); 00007ff62d791610  mov         rax,r8   00007ff62d791613  inc         r8   00007ff62d791616  ,         eax,3ffh                                   auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d79161b  lea         rcx,[rax+rax*2]                                   ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d79161f  mov         eax,dword ptr [r10+rcx*8]   00007ff62d791623  movzx       edx,word ptr [r10+rcx*8+4]   00007ff62d791629  shl         rdx,20h   00007ff62d79162d  or          rdx,rax                                   ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d791630  mov         eax,6   00007ff62d791635  crc32       rax,rdx   00007ff62d79163b  add         ebx,eax   00007ff62d79163d  sub         r9,1   00007ff62d791641  jne         test_func<1,6,0>+70h (07ff62d791610h)                   }                 return ii; 00007ff62d791643  lea         rcx,[bench]   00007ff62d791648  call        cbench::~cbench (07ff62d791030h)    ------------------------------------------------------- test_reference    <case: 0, bytes: 6>:     25.3754 ms ------------------------------------------------------- 00007ff62d7916c6  call        cbench::cbench (07ff62d791000h)   00007ff62d7916cb  xor         edx,edx   00007ff62d7916cd  lea         r9,[_a (07ff62d794630h)]   00007ff62d7916d4  mov         r8d,1312d00h   00007ff62d7916da  nop         word ptr [rax+rax]                   (int ipass = 0; ipass < passes; ++ipass) {                                 int = ipass & (dimensionof(_a) - 1); 00007ff62d7916e0  mov         rax,rdx                                   ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d7916e3  mov         ecx,6                   (int ipass = 0; ipass < passes; ++ipass) {                                 int = ipass & (dimensionof(_a) - 1); 00007ff62d7916e8  ,         eax,3ffh   00007ff62d7916ed  inc         rdx                                   auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d7916f0  lea         rax,[rax+rax*2]                                   ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d7916f4  crc32       ecx,dword ptr [r9+rax*8]                                   ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d7916fb  crc32       ecx,word ptr [r9+rax*8+4]   00007ff62d791704  add         ebx,ecx   00007ff62d791706  sub         r8,1   00007ff62d79170a  jne         test_func<0,6,1>+70h (07ff62d7916e0h)                   }                 return ii; 00007ff62d79170c  lea         rcx,[bench]   00007ff62d791711  call        cbench::~cbench (07ff62d791030h)   ------------------------------------------------------- test_reference    <case: 1, bytes: 6>:     28.7829 ms ------------------------------------------------------- 00007ff62d791799  call        cbench::cbench (07ff62d791000h)   00007ff62d79179e  xor         edx,edx   00007ff62d7917a0  lea         r9,[_a (07ff62d794630h)]   00007ff62d7917a7  mov         r8d,1312d00h   00007ff62d7917ad  nop         dword ptr [rax]                   (int ipass = 0; ipass < passes; ++ipass) {                                 int = ipass & (dimensionof(_a) - 1); 00007ff62d7917b0  mov         rax,rdx   00007ff62d7917b3  inc         rdx   00007ff62d7917b6  ,         eax,3ffh                                   auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]); 00007ff62d7917bb  lea         rax,[rax+rax*2]                                   ii += (case == 1) ? x.case1() : x.case0(); 00007ff62d7917bf  movzx       ecx,word ptr [r9+rax*8+4]   00007ff62d7917c5  mov         eax,dword ptr [r9+rax*8]   00007ff62d7917c9  shl         rcx,20h   00007ff62d7917cd  or          rcx,rax   00007ff62d7917d0  mov         eax,6   00007ff62d7917d5  crc32       rax,rcx   00007ff62d7917db  add         ebx,eax   00007ff62d7917dd  sub         r8,1   00007ff62d7917e1  jne         test_func<1,6,1>+70h (07ff62d7917b0h)                   }                 return ii; 00007ff62d7917e3  lea         rcx,[bench]   00007ff62d7917e8  call        cbench::~cbench (07ff62d791030h)

source code:

#include <windows.h> #include "new" #include <cstdio> #include <intrin.h>  #define dimensionof(x)      (sizeof(x)/sizeof(*(x))) #define inl                 __forceinline #define noinl               __declspec(noinline) #define passes              20000000 #define as1(a_)             (*(u1*)(a_)) #define as2(a_)             (*(u2*)(a_)) #define as3(a_)             ((u4(as1((char*)(a_) + 2))<<16) | as2(a_)) #define as4(a_)             (*(u4*)(a_)) #define as6(a_)             ((u8(as2((char*)(a_) + 4))<<32) | as4(a_)) #define as7(a_)             ((u8(as3((char*)(a_) + 4))<<32) | as4(a_))  typedef unsigned char       u1; typedef unsigned short      u2; typedef unsigned int        u4; typedef unsigned long long  u8;  typedef char tdata[24]; tdata _a[0x400];  // cbench benchmarking code class cbench {     __int64     m_nstart;     const char* m_desc; public:     // no inline declared      // reasoning:   simplifies assembly code.      //              easier see how optimizer optimizes different variations of algorithm.     noinl cbench(const char *szdesc)          : m_desc(szdesc), m_nstart(getbenchmark()) { }      noinl ~cbench() {         __int64 cpufreq, deltatime(getbenchmark() - m_nstart);         queryperformancefrequency((large_integer*) &cpufreq);         double exectimeinms = ((double) deltatime * 1000) / cpufreq;         printf("%s:\t%10.4f ms\n", m_desc, exectimeinms);     }      noinl static __int64 getbenchmark(void) {         __int64 nbenchmark;         queryperformancecounter((large_integer*) &nbenchmark);         return nbenchmark;     } };  // cfunc executes crc32 intrinsics on 6 & 7 bytes in 2 different ways template <int n> struct cfunc {     char m_ach[n];     inl cfunc(const char* sz) {         memcpy(m_ach, sz, n);     }     inl u4 case0() {         return (n == 7) ? _mm_crc32_u32(_mm_crc32_u32(n, as4(m_ach)), as4(m_ach + 3))                         : _mm_crc32_u16(_mm_crc32_u32(n, as4(m_ach)), as2(m_ach + 4));     }     inl u4 case1() {         return (n == 7) ? (u4) _mm_crc32_u64(n, as7(m_ach))                         : (u4) _mm_crc32_u64(n, as6(m_ach));     }  };  // evaluates performance dependent on: //  -   case    :   crc procedure //  -   n       :   number of bytes //  -   useref  :   true,   reference pre-existing cfunc object //                  false,  constructing new cfunc object template<u4 case, int n, bool useref> noinl int test_func(int ii) {     char szdesc[64], buf[64];     (useref) ? sprintf(szdesc, "%-18s<case: %d, bytes: %d>", "test_reference", case, n)               : sprintf(szdesc, "%-18s<case: %d, bytes: %d>", "test_construction", case, n);     cbench bench(szdesc);     (int ipass = 0; ipass < passes; ++ipass) {         int = ipass & (dimensionof(_a) - 1);         auto& x = (useref) ? *(cfunc<n>*)_a[i] : *new(buf) cfunc<n>(_a[i]);         ii += (case == 1) ? x.case1() : x.case0();     }     return ii; }  int main(int argc, char* argv[]) {     (int = 0; < 10; ++i) {         printf("\n>>>>\tpass %d:\n", i);         // execute crc on 7 bytes         // construct cfunc object         argc = test_func<0, 7, false>(argc);         argc = test_func<1, 7, false>(argc);         // reference pre-existing cfunc object         argc = test_func<0, 7, true>(argc);         argc = test_func<1, 7, true>(argc);          // execute crc on 6 bytes         // construct cfunc object         argc = test_func<0, 6, false>(argc);         argc = test_func<1, 6, false>(argc);         // reference pre-existing cfunc object         argc = test_func<0, 6, true>(argc);         argc = test_func<1, 6, true>(argc);     }     printf("\n\ndone\n");     return argc; }

the operations compiler uses copy data 7 byte buffer populate registers differently crc32 call(s) require(s). compiler has go stack registers needed crc32 call. there no combination of 1,2,4 byte reads , writes doesn't require full write stack. when copied 7 bytes 8 byte buffer, duplicating middle byte second unaligned 4 byte mov, compiler saw 2 registers populated crc32 calls , eliminated stack read/write.

125.997 ms: use memcpy, aligned copying, , unaligned crc32:

memcpy(buf, _a[i], 7); ii += _mm_crc32_u32(_mm_crc32_u32(0, as4(buf)), as4(buf + 3));     movzx       eax,word ptr [_a[i]+4]       mov         edx,dword ptr [_a[i]]       mov         word ptr [buf+4],ax       movzx       eax,byte ptr [_a[i]+6]       mov         byte ptr [buf+6],al       xor         eax,eax       crc32       eax,edx       mov         dword ptr [buf],edx       crc32       eax,dword ptr [buf+3]

the first call crc32 can use register edx copy, second call has no register ready. needs result of dword, word, , byte movs buf. on top of suspect compiler sees bunch of aliasing going on here , gets conservative. compiler has no choice build buf on stack , access it.

137.044 ms: memcpy<7>, unaligned overlapped copy 7 char buf, suffers same problem. registers involved in copy step not registers needed crc32 step. has bit more unaligned access, slows down bit:

as4(buf) = as4(_a[i]), as4(buf + 3) = as4(_a[i] + 3); ii += _mm_crc32_u32(_mm_crc32_u32(0, as4(buf)), as4(buf + 3));     mov         eax,dword ptr [_a[i]]       mov         ecx,dword ptr [_a[i]+3]       mov         dword ptr [buf],eax       xor         eax,eax       mov         dword ptr [buf+3],ecx       crc32       eax,dword ptr [buf]       crc32       eax,ecx

16.733 ms: unaligned overlapped access source not overlapped 8 byte dest buf, sees massive improvement! in case, copy middle byte twice, never alias dwords in buf. if _a[i] = "1234567", buf "12344567":

as4(buf) = as4(_a[i]), as4(buf + 4) = as4(_a[i] + 3); ii += _mm_crc32_u32(_mm_crc32_u32(0, as4(buf)), as4(buf + 4));     xor         eax,eax       crc32       eax,dword ptr [_a[i]]       crc32       eax,dword ptr [_a[i]+3]

the call copy first dword buf , call copy second dword buf + 4 use 2 separate registers, can passed crc32 directly, no need use buf. optimizer on subsequent pass notices unused data moved stack , removes related operations.

121.500 ms: tried 64 bit crc on 8 char buf built same way above , lost big. compiler not using single 8 byte register move buf.

as4(buf) = as4(_a[i]), as4(buf + 4) = as4(_a[i] + 3); ii += _mm_crc32_u64(0, as8(buf));     mov         eax,dword ptr [_a[i]]       mov         dword ptr [buf],eax       mov         eax,dword ptr [_a[i]+3]       mov         dword ptr [buf+3],eax       xor         eax,eax       crc32       rax,qword ptr [buf]

20.799 ms: changed move buf 8 bytes instead of 2 x 4 bytes. stopped using stack, still underperformed 3rd method above:

as8(buf) = as4(_a[i]) | ((u8)as4(_a[i] + 3) << 32); ii += _mm_crc32_u64(0, as8(buf));     mov         ecx,dword ptr [_a[i]+3]       mov         eax,dword ptr [_a[i]]       shl         rcx,20h       or          rcx,rax       xor         eax,eax       crc32       rax,rcx

1 took: 125.997 ms 2 took: 137.044 ms 3 took: 16.733 ms 4 took: 121.500 ms 5 took: 20.799 ms

Search This Blog

MOno

c++ - Why is the compiler optimizing these cases differently? -

Comments

Post a Comment

Popular posts from this blog

'hasOwnProperty' in javascript -

python - ValueError: No axis named 1 for object type <class 'pandas.core.series.Series'> -

java - How to implement an entity bound odata action in olingo v4.3 -