Why does the compiler optimize these cases differently?
Below is a code snippet demonstrating a situation where a call to the CRC32 compiler embedded in 7 bytes of data in two different ways (for example, case0 () and case1 ()) results in different compiler optimizations, These differences in compiler optimizations lead to significantly different run times (for example, [Test_Construction, Case: 0, Bytes: 7]).
For reference, I have included logic to call CRC32 on 6 bytes of data in the same way. However, as you can see from the generated output, the resulting runtime does not suffer from the same performance as when dealing with 7 bytes of data.
Generated single pass output - 4 unique tests for each data size of interest (6 and 7 bytes):
Test_Construction <Case: 0, Bytes: 7>: 139.5543 ms
Test_Construction <Case: 1, Bytes: 7>: 38.6545 ms
Test_Reference <Case: 0, Bytes: 7>: 26.2616 ms
Test_Reference <Case: 1, Bytes: 7>: 38.8118 ms
Test_Construction <Case: 0, Bytes: 6>: 26.2925 ms
Test_Construction <Case: 1, Bytes: 6>: 29.5819 ms
Test_Reference <Case: 0, Bytes: 6>: 25.3754 ms
Test_Reference <Case: 1, Bytes: 6>: 28.7829 ms
I have two questions:
- Why does the compiler generate various optimizations (for example, in the case of [Test_Construction, Case: 0, Bytes: 7]?
- It looks like when [Test_Construction, Case: 0, Bytes: 7] is translated to machine code, it contains additional instructions that move data from the stack to registers and then popped onto the stack. This doesn't seem to happen in any other scenario. The CRC is then called once on the data found in the register and once on the data on the stack. Why do this?
- Why is productivity dropping in the first place?
- Is this due to the additional stack logic (memory operations) found in [Test_Construction, Case: 0, Bytes: 7] machine code?
- Can the order of transactions contribute?
- Is there a way to stop the optimizer from generating this suboptimal machine code?
UPDATE 1 - 4/7/17:
- @ 1201ProgramAlarm, johnnycrash
- I just want to clarify that I would like to optimize / reduce the generated machine code. I purposefully overlapped the 4th byte in [Case: 0, Bytes: 7] to call CRC32_u32 twice to avoid having to make the following 3 calls: CRC32_u32 + CRC32_u16 + CRC32_u8.
- As a follow-up to your suggestion, johnnycrash, I tried to completely remove the memcpy call in the CFunc constructor, especially if the data is 7 bytes in size. See the code directly below. However, this did not affect the runtime.
...
template<int N>
void MemCpy(char* szDst, const char* szSrc) {
memcpy(szDst, szSrc, N);
}
// I tried both of these alternatives to memcpy, no luck.
template<> void MemCpy<7>(char* szDst, const char* szSrc) {
//AS4(szDst) = AS4(szSrc), AS2(szDst+4) = AS2(szSrc+4), AS1(szDst+6) = AS1(szSrc+6);
AS4(szDst) = AS4(szSrc), AS4(szDst+3) = AS4(szSrc+3);
}
Environment Details:
Windows Server 2012 R2 x64
Intel Xeon X5670
Assembly link:
-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 7>: 139.5543 ms
-------------------------------------------------------
00007FF62D7911CC call CBench::CBench (07FF62D791000h)
00007FF62D7911D1 xor r8d,r8d
00007FF62D7911D4 lea r10,[_a (07FF62D794630h)]
00007FF62D7911DB mov r9d,1312D00h
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7911E1 mov rax,r8
00007FF62D7911E4 inc r8
00007FF62D7911E7 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7911EC lea rcx,[rax+rax*2]
00007FF62D7911F0 movzx eax,word ptr [r10+rcx*8+4]
00007FF62D7911F6 mov edx,dword ptr [r10+rcx*8]
00007FF62D7911FA mov word ptr [rsp+44h],ax
00007FF62D7911FF movzx eax,byte ptr [r10+rcx*8+6]
00007FF62D791205 mov byte ptr [rsp+46h],al
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791209 mov eax,7
00007FF62D79120E crc32 eax,edx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D791213 mov dword ptr [buf],edx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791217 crc32 eax,dword ptr [rsp+43h]
00007FF62D79121E add ebx,eax
00007FF62D791220 sub r9,1
00007FF62D791224 jne Test_Func<0,7,0>+71h (07FF62D7911E1h)
}
return ii;
00007FF62D791226 lea rcx,[Bench]
00007FF62D79122B call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 7>: 38.6545 ms
-------------------------------------------------------
00007FF62D7912A9 call CBench::CBench (07FF62D791000h)
00007FF62D7912AE xor r8d,r8d
00007FF62D7912B1 lea r10,[_a (07FF62D794630h)]
00007FF62D7912B8 mov r9d,1312D00h
00007FF62D7912BE xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7912C0 mov rax,r8
00007FF62D7912C3 inc r8
00007FF62D7912C6 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7912CB lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7912CF movzx eax,word ptr [r10+rcx*8+4]
00007FF62D7912D5 movzx edx,byte ptr [r10+rcx*8+6]
00007FF62D7912DB shl rdx,10h
00007FF62D7912DF or rdx,rax
00007FF62D7912E2 mov eax,dword ptr [r10+rcx*8]
00007FF62D7912E6 shl rdx,20h
00007FF62D7912EA or rdx,rax
00007FF62D7912ED mov eax,7
00007FF62D7912F2 crc32 rax,rdx
00007FF62D7912F8 add ebx,eax
00007FF62D7912FA sub r9,1
00007FF62D7912FE jne Test_Func<1,7,0>+70h (07FF62D7912C0h)
}
return ii;
00007FF62D791300 lea rcx,[Bench]
00007FF62D791305 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 0, Bytes: 7>: 26.2616 ms
-------------------------------------------------------
00007FF62D791386 call CBench::CBench (07FF62D791000h)
00007FF62D79138B xor edx,edx
00007FF62D79138D lea r9,[_a (07FF62D794630h)]
00007FF62D791394 mov r8d,1312D00h
00007FF62D79139A nop word ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A0 mov rax,rdx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913A3 mov ecx,7
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A8 and eax,3FFh
00007FF62D7913AD inc rdx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7913B0 lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913B4 crc32 ecx,dword ptr [r9+rax*8]
00007FF62D7913BB crc32 ecx,dword ptr [r9+rax*8+3]
00007FF62D7913C3 add ebx,ecx
00007FF62D7913C5 sub r8,1
00007FF62D7913C9 jne Test_Func<0,7,1>+70h (07FF62D7913A0h)
}
return ii;
00007FF62D7913CB lea rcx,[Bench]
00007FF62D7913D0 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 1, Bytes: 7>: 38.8118 ms
-------------------------------------------------------
00007FF62D791449 call CBench::CBench (07FF62D791000h)
00007FF62D79144E xor r8d,r8d
00007FF62D791451 lea r10,[_a (07FF62D794630h)]
00007FF62D791458 mov r9d,1312D00h
00007FF62D79145E xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791460 mov rax,r8
00007FF62D791463 inc r8
00007FF62D791466 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79146B lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79146F movzx edx,byte ptr [r10+rax*8+6]
00007FF62D791475 lea rcx,[r10+rax*8]
00007FF62D791479 movzx eax,word ptr [r10+rax*8+4]
00007FF62D79147F shl rdx,10h
00007FF62D791483 or rdx,rax
00007FF62D791486 mov eax,dword ptr [rcx]
00007FF62D791488 shl rdx,20h
00007FF62D79148C or rdx,rax
00007FF62D79148F mov eax,7
00007FF62D791494 crc32 rax,rdx
00007FF62D79149A add ebx,eax
00007FF62D79149C sub r9,1
00007FF62D7914A0 jne Test_Func<1,7,1>+70h (07FF62D791460h)
}
return ii;
00007FF62D7914A2 lea rcx,[Bench]
00007FF62D7914A7 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 6>: 26.2925 ms
-------------------------------------------------------
00007FF62D791526 call CBench::CBench (07FF62D791000h)
00007FF62D79152B xor r8d,r8d
00007FF62D79152E lea r10,[_a (07FF62D794630h)]
00007FF62D791535 mov r9d,1312D00h
00007FF62D79153B nop dword ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791540 mov rax,r8
00007FF62D791543 inc r8
00007FF62D791546 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79154B lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79154F mov eax,6
00007FF62D791554 crc32 eax,dword ptr [r10+rcx*8]
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79155B movzx edx,word ptr [r10+rcx*8+4]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791561 crc32 eax,dx
00007FF62D791567 add ebx,eax
00007FF62D791569 sub r9,1
00007FF62D79156D jne Test_Func<0,6,0>+70h (07FF62D791540h)
}
return ii;
00007FF62D79156F lea rcx,[Bench]
00007FF62D791574 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 6>: 29.5819 ms
-------------------------------------------------------
00007FF62D7915F9 call CBench::CBench (07FF62D791000h)
00007FF62D7915FE xor r8d,r8d
00007FF62D791601 lea r10,[_a (07FF62D794630h)]
00007FF62D791608 mov r9d,1312D00h
00007FF62D79160E xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791610 mov rax,r8
00007FF62D791613 inc r8
00007FF62D791616 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79161B lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79161F mov eax,dword ptr [r10+rcx*8]
00007FF62D791623 movzx edx,word ptr [r10+rcx*8+4]
00007FF62D791629 shl rdx,20h
00007FF62D79162D or rdx,rax
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791630 mov eax,6
00007FF62D791635 crc32 rax,rdx
00007FF62D79163B add ebx,eax
00007FF62D79163D sub r9,1
00007FF62D791641 jne Test_Func<1,6,0>+70h (07FF62D791610h)
}
return ii;
00007FF62D791643 lea rcx,[Bench]
00007FF62D791648 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 0, Bytes: 6>: 25.3754 ms
-------------------------------------------------------
00007FF62D7916C6 call CBench::CBench (07FF62D791000h)
00007FF62D7916CB xor edx,edx
00007FF62D7916CD lea r9,[_a (07FF62D794630h)]
00007FF62D7916D4 mov r8d,1312D00h
00007FF62D7916DA nop word ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E0 mov rax,rdx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916E3 mov ecx,6
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E8 and eax,3FFh
00007FF62D7916ED inc rdx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7916F0 lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916F4 crc32 ecx,dword ptr [r9+rax*8]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916FB crc32 ecx,word ptr [r9+rax*8+4]
00007FF62D791704 add ebx,ecx
00007FF62D791706 sub r8,1
00007FF62D79170A jne Test_Func<0,6,1>+70h (07FF62D7916E0h)
}
return ii;
00007FF62D79170C lea rcx,[Bench]
00007FF62D791711 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 1, Bytes: 6>: 28.7829 ms
-------------------------------------------------------
00007FF62D791799 call CBench::CBench (07FF62D791000h)
00007FF62D79179E xor edx,edx
00007FF62D7917A0 lea r9,[_a (07FF62D794630h)]
00007FF62D7917A7 mov r8d,1312D00h
00007FF62D7917AD nop dword ptr [rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7917B0 mov rax,rdx
00007FF62D7917B3 inc rdx
00007FF62D7917B6 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7917BB lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7917BF movzx ecx,word ptr [r9+rax*8+4]
00007FF62D7917C5 mov eax,dword ptr [r9+rax*8]
00007FF62D7917C9 shl rcx,20h
00007FF62D7917CD or rcx,rax
00007FF62D7917D0 mov eax,6
00007FF62D7917D5 crc32 rax,rcx
00007FF62D7917DB add ebx,eax
00007FF62D7917DD sub r8,1
00007FF62D7917E1 jne Test_Func<1,6,1>+70h (07FF62D7917B0h)
}
return ii;
00007FF62D7917E3 lea rcx,[Bench]
00007FF62D7917E8 call CBench::~CBench (07FF62D791030h)
Source:
#include <Windows.h>
#include "new"
#include <cstdio>
#include <intrin.h>
#define DimensionOf(x) (sizeof(x)/sizeof(*(x)))
#define INL __forceinline
#define NOINL __declspec(noinline)
#define PASSES 20000000
#define AS1(a_) (*(U1*)(a_))
#define AS2(a_) (*(U2*)(a_))
#define AS3(a_) ((U4(AS1((char*)(a_) + 2))<<16) | AS2(a_))
#define AS4(a_) (*(U4*)(a_))
#define AS6(a_) ((U8(AS2((char*)(a_) + 4))<<32) | AS4(a_))
#define AS7(a_) ((U8(AS3((char*)(a_) + 4))<<32) | AS4(a_))
typedef unsigned char U1;
typedef unsigned short U2;
typedef unsigned int U4;
typedef unsigned long long U8;
typedef char TData[24];
TData _a[0x400];
// CBench is for benchmarking code
class CBench {
__int64 m_nStart;
const char* m_desc;
public:
// No inline declared
// Reasoning: Simplifies the assembly code.
// Easier to see how the optimizer optimizes different variations of an algorithm.
NOINL CBench(const char *szDesc)
: m_desc(szDesc), m_nStart(GetBenchMark()) { }
NOINL ~CBench() {
__int64 cpuFreq, deltaTime(GetBenchMark() - m_nStart);
QueryPerformanceFrequency((LARGE_INTEGER*) &cpuFreq);
double execTimeInMS = ((double) deltaTime * 1000) / cpuFreq;
printf("%s:\t%10.4f ms\n", m_desc, execTimeInMS);
}
NOINL static __int64 GetBenchMark(void) {
__int64 nBenchMark;
QueryPerformanceCounter((LARGE_INTEGER*) &nBenchMark);
return nBenchMark;
}
};
// CFunc executes CRC32 intrinsics on 6 & 7 bytes in two different ways
template <int N>
struct CFunc {
char m_ach[N];
INL CFunc(const char* sz) {
memcpy(m_ach, sz, N);
}
INL U4 Case0() {
return (N == 7) ? _mm_crc32_u32(_mm_crc32_u32(N, AS4(m_ach)), AS4(m_ach + 3))
: _mm_crc32_u16(_mm_crc32_u32(N, AS4(m_ach)), AS2(m_ach + 4));
}
INL U4 Case1() {
return (N == 7) ? (U4) _mm_crc32_u64(N, AS7(m_ach))
: (U4) _mm_crc32_u64(N, AS6(m_ach));
}
};
// Evaluates performance dependent on:
// - CASE : CRC procedure
// - N : Number of bytes
// - USEREF : True, reference to pre-existing CFunc object
// False, constructing new CFunc object
template<U4 CASE, int N, bool USEREF>
NOINL int Test_Func(int ii) {
char szDesc[64], buf[64];
(USEREF) ? sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>", "Test_Reference", CASE, N)
: sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>", "Test_Construction", CASE, N);
CBench Bench(szDesc);
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
ii += (CASE == 1) ? x.Case1() : x.Case0();
}
return ii;
}
int main(int argc, char* argv[]) {
for (int i = 0; i < 10; ++i) {
printf("\n>>>>\tPass %d:\n", i);
// Execute CRC on 7 bytes
// Construct CFunc Object
argc = Test_Func<0, 7, false>(argc);
argc = Test_Func<1, 7, false>(argc);
// Reference pre-existing CFunc Object
argc = Test_Func<0, 7, true>(argc);
argc = Test_Func<1, 7, true>(argc);
// Execute CRC on 6 bytes
// Construct CFunc Object
argc = Test_Func<0, 6, false>(argc);
argc = Test_Func<1, 6, false>(argc);
// Reference pre-existing CFunc Object
argc = Test_Func<0, 6, true>(argc);
argc = Test_Func<1, 6, true>(argc);
}
printf("\n\nDone\n");
return argc;
}
source to share
The operations used by the compiler to copy data into a 7-byte buffer fill registers differently than crc32 calls. The compiler has to go onto the stack to get the registers needed to call crc32. There is no combination of 1,2,4 bytes read and write that does not require a full stack write. When I copied 7 bytes into an 8 byte buffer , duplicating the middle byte with the second unaligned 4 byte mov, the compiler saw 2 registers already filled for crc32 calls and eliminated the stack read / write.
125.997ms: Use memcpy, which does an aligned copy, and unligned crc32:
memcpy(buf, _a[i], 7);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
movzx eax,word ptr [_a[i]+4]
mov edx,dword ptr [_a[i]]
mov word ptr [buf+4],ax
movzx eax,byte ptr [_a[i]+6]
mov byte ptr [buf+6],al
xor eax,eax
crc32 eax,edx
mov dword ptr [buf],edx
crc32 eax,dword ptr [buf+3]
The first call to crc32 can use the edx register from the copy, but the second call is caseless. It needs the result of moving DWORD, WORD and BYTE to buf. Also, I suspect the compiler is seeing a bunch of aliases going on here and getting conservative. The compiler has no choice but to build buf on the stack and then access it.
137.044ms: memcpy <7>, an inconsistent superimposed copy at 7 char buf, suffers from the same problem. The registers involved in the copy step are not the registers required for the crc32 step. It has slightly more uniform access, so it slows down a bit:
AS4(buf) = AS4(_a[i]), AS4(buf + 3) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
mov eax,dword ptr [_a[i]]
mov ecx,dword ptr [_a[i]+3]
mov dword ptr [buf],eax
xor eax,eax
mov dword ptr [buf+3],ecx
crc32 eax,dword ptr [buf]
crc32 eax,ecx
16.733ms: Unaligned overlapped source access, but not overlapped in 8-byte dest buf, sees a significant improvement! In this case, we are copying the middle byte twice, but we will never use DWORDS in buf. If _a [i] = "1234567" then buf will be "12344567":
AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 4));
xor eax,eax
crc32 eax,dword ptr [_a[i]]
crc32 eax,dword ptr [_a[i]+3]
The call to copy the first DWORD to buf and the call to copy the second DWORD to buf + 4 use 2 separate registers that can be directly passed to crc32, so there is no need to use buf. The optimizer on the subsequent pass notifies unused data pushed onto the stack and discards associated operations.
121,500ms: Then I tried a 64bit crc on 8 char buf built the same as above and lost the big one. The compiler does not use a single 8-byte register to jump to buf.
AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u64(0, AS8(buf));
mov eax,dword ptr [_a[i]]
mov dword ptr [buf],eax
mov eax,dword ptr [_a[i]+3]
mov dword ptr [buf+3],eax
xor eax,eax
crc32 rax,qword ptr [buf]
20.799ms: I changed the jump to buf to 8 bytes instead of 2 x 4 bytes. This ended up using the stack, but was still worse than the previous 3rd method:
AS8(buf) = AS4(_a[i]) | ((U8)AS4(_a[i] + 3) << 32);
ii += _mm_crc32_u64(0, AS8(buf));
mov ecx,dword ptr [_a[i]+3]
mov eax,dword ptr [_a[i]]
shl rcx,20h
or rcx,rax
xor eax,eax
crc32 rax,rcx
1 taken: 125.997 ms 2 taken: 137.044 ms 3 taken: 16.733 ms 4: 121.500 ms 5 taken: 20.799 ms
source to share