Why is the "switch" so slow?
After several tests on my bytecode interpretation loop, I am surprised to see that dispatching with a switch is the worst choice. Making calls through an array of function pointers, or using GCC's computed goto extension, is always 10-20% faster, with the computed goto version being the fastest. I tested this both with my real toy VM with 97 instructions and with the mini fake VM pasted below.
Why is switch the slowest?
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <inttypes.h>
#include <time.h>
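/* Opcodes; 0 is reserved to mean "halt", so numbering starts at 1. */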
enum {
ADD1 = 1,
ADD2,
SUB3,
SUB4,
MUL5,
MUL6,
};
static unsigned int number;
static void add1(void) {
number += 1;
}
static void add2(void) {
number += 2;
}
static void sub3(void) {
number -= 3;
}
static void sub4(void) {
number -= 4;
}
static void mul5(void) {
number *= 5;
}
static void mul6(void) {
number *= 6;
}
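/* Interpreter 1: dispatch with a switch statement in a loop. */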
static void interpret_bytecodes_switch(uint8_t *bcs) {
while (true) {
switch (*bcs++) {
case 0:
return;
case ADD1:
add1();
break;
case ADD2:
add2();
break;
case SUB3:
sub3();
break;
case SUB4:
sub4();
break;
case MUL5:
mul5();
break;
case MUL6:
mul6();
break;
}
}
}
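/* Interpreter 2: dispatch through a table of function pointers. */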
static void interpret_bytecodes_function_pointer(uint8_t *bcs) {
void (*fs[])(void) = {
NULL,
add1,
add2,
sub3,
sub4,
mul5,
mul6,
};
while (*bcs) {
fs[*bcs++]();
}
}
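/* Interpreter 3: dispatch with GCC's computed goto extension. */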
static void interpret_bytecodes_goto(uint8_t *bcs) {
void *labels[] = {
&&l_exit,
&&l_add1,
&&l_add2,
&&l_sub3,
&&l_sub4,
&&l_mul5,
&&l_mul6,
};
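/* JUMP fetches the next opcode and jumps directly to its handler. */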
#define JUMP goto *labels[*bcs++]
JUMP;
l_exit:
return;
l_add1:
add1();
JUMP;
l_add2:
add2();
JUMP;
l_sub3:
sub3();
JUMP;
l_sub4:
sub4();
JUMP;
l_mul5:
mul5();
JUMP;
l_mul6:
mul6();
JUMP;
#undef JUMP
}
struct timer {
clock_t start, end;
};
static void timer_start(struct timer *self) {
self->start = clock();
}
static void timer_end(struct timer *self) {
self->end = clock();
}
static double timer_measure(struct timer *self) {
return (double)(self->end - self->start) / CLOCKS_PER_SEC;
}
static void test(void (*f)(uint8_t *), uint8_t *bcs) {
number = 0;
struct timer timer;
timer_start(&timer);
f(bcs);
timer_end(&timer);
printf("%u %.3fs\n", number, timer_measure(&timer));
}
int main(void) {
const int N = 300000000;
srand((unsigned)time(NULL));
uint8_t *bcs = malloc(N + 1);
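/* Fill with random opcodes 1..5 (rand() % 5 + 1 never produces MUL6). */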
for (int i = 0; i < N; ++i) {
bcs[i] = rand() % 5 + 1;
}
bcs[N] = 0;
for (int i = 0; i < 10; ++i) {
printf("%d ", bcs[i]);
}
printf("\nswitch\n");
test(interpret_bytecodes_switch, bcs);
printf("function pointer\n");
test(interpret_bytecodes_function_pointer, bcs);
printf("goto\n");
test(interpret_bytecodes_goto, bcs);
return 0;
}
Result:
~$ gcc vm.c -ovm -std=gnu11 -O3
~$ ./vm
3 4 5 3 3 5 3 3 1 2
switch
3050839589 2.866s
function pointer
3050839589 2.573s
goto
3050839589 2.433s
~$ ./vm
3 1 1 3 5 5 2 4 5 1
switch
3898179583 2.871s
function pointer
3898179583 2.573s
goto
3898179583 2.431s
~$ ./vm
5 5 1 2 3 3 1 2 2 4
switch
954521520 2.869s
function pointer
954521520 2.574s
goto
954521520 2.432s
Below is the corresponding disassembly of the code posted above, compiled with -O3.
interpret_bytecodes_switch:
.L8:
        addq    $1, %rdi
        cmpb    $6, -1(%rdi)
        ja      .L8
        movzbl  -1(%rdi), %edx
        jmp     *.L11(,%rdx,8)
.L11:
        .quad   .L10
        .quad   .L12
        .quad   .L13
        .quad   .L14
        .quad   .L15
        .quad   .L16
        .quad   .L17
.L16:
        leal    (%rax,%rax,4), %eax
        jmp     .L8
.L15:
        subl    $4, %eax
        jmp     .L8
.L14:
        subl    $3, %eax
        jmp     .L8
.L13:
        addl    $2, %eax
        jmp     .L8
.L12:
        addl    $1, %eax
        jmp     .L8
.L10:
        movl    %eax, number(%rip)
        ret
.L17:
        leal    (%rax,%rax,2), %eax
        addl    %eax, %eax
        jmp     .L8

interpret_bytecodes_function_pointer:
        pushq   %rbx
        movq    %rdi, %rbx
        subq    $64, %rsp
        movzbl  (%rdi), %eax
        movq    $0, (%rsp)
        movq    $add1, 8(%rsp)
        movq    $add2, 16(%rsp)
        movq    $sub3, 24(%rsp)
        movq    $sub4, 32(%rsp)
        movq    $mul5, 40(%rsp)
        testb   %al, %al
        movq    $mul6, 48(%rsp)
        je      .L19
.L23:
        addq    $1, %rbx
        call    *(%rsp,%rax,8)
        movzbl  (%rbx), %eax
        testb   %al, %al
        jne     .L23
.L19:
        addq    $64, %rsp
        popq    %rbx
        ret

interpret_bytecodes_goto:
        movzbl  (%rdi), %eax
        movq    $.L27, -72(%rsp)
        addq    $2, %rdi
        movq    $.L28, -64(%rsp)
        movq    $.L29, -56(%rsp)
        movq    $.L30, -48(%rsp)
        movq    $.L31, -40(%rsp)
        movq    $.L32, -32(%rsp)
        movq    $.L33, -24(%rsp)
        movq    -72(%rsp,%rax,8), %rax
        jmp     *%rax
.L33:
        movl    number(%rip), %eax
        leal    (%rax,%rax,2), %eax
        addl    %eax, %eax
        movl    %eax, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
.L35:
        addq    $1, %rdi
        jmp     *%rax
.L28:
        addl    $1, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L30:
        subl    $3, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L31:
        subl    $4, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L32:
        movl    number(%rip), %eax
        leal    (%rax,%rax,4), %eax
        movl    %eax, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L29:
        addl    $2, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L27:
        rep ret
I was in the middle of writing a long answer when you posted the assembly code ...
Basically, the goto version uses more code to avoid an extra jump on every iteration: in the switch version each case has to branch back to the single shared dispatch point before the next indirect jump is taken, while the computed-goto version ends every handler with its own copy of the dispatch. It is essentially a size-versus-speed trade-off.
Since your "real work" is negligible, those extra jumps make a measurable difference in this benchmark, but in a real-world scenario the dispatch instructions would become insignificant.
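To make that concrete, here is a minimal sketch (a toy with made-up opcodes, not your VM) of the structural difference: every case in the switch form ends with a jump back to the shared dispatch site, while the computed-goto form pastes the dispatch into every handler, so each iteration takes one indirect jump instead of a jump plus an indirect jump.

#include <stdint.h>
#include <stdio.h>

static unsigned int acc;

/* One shared dispatch site: every case branches back to the loop head,
   which then takes the single indirect jump for the next opcode. */
static void run_switch(const uint8_t *pc) {
    for (;;) {
        switch (*pc++) {
        case 0: return;
        case 1: acc += 1; break;   /* break = jump back to dispatch */
        case 2: acc -= 1; break;
        }
    }
}

/* Replicated dispatch: every handler ends with its own indirect jump
   straight to the next handler (more code, fewer jumps per opcode). */
static void run_goto(const uint8_t *pc) {
    static void *labels[] = { &&l_halt, &&l_inc, &&l_dec };
    goto *labels[*pc++];
l_inc:  acc += 1; goto *labels[*pc++];
l_dec:  acc -= 1; goto *labels[*pc++];
l_halt: return;
}

int main(void) {
    static const uint8_t prog[] = { 1, 1, 2, 0 };  /* +1 +1 -1 halt */
    run_switch(prog);
    run_goto(prog);
    printf("%u\n", acc);  /* prints 2: each run adds a net +1 */
    return 0;
}

(Compile with gcc; computed goto is a GNU extension.)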
You are doing a micro-benchmark. Micro-benchmarks on modern processors can be affected by all sorts of random or unpredictable effects, and the actual runtime differences here are quite small. Also, to keep the variants comparable, you made the switch version call functions too, which you would not do in time-critical code in real life.
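For illustration, here is a sketch of the inlined form (it reuses the enum and the number variable from the question's code; with bodies this trivial, GCC already performs the same inlining at -O3, as the disassembly above shows):

/* The switch interpreter with the work written inline instead of
   routed through function calls. */
static void interpret_bytecodes_switch_inline(uint8_t *bcs) {
    while (true) {
        switch (*bcs++) {
        case 0:    return;
        case ADD1: number += 1; break;
        case ADD2: number += 2; break;
        case SUB3: number -= 3; break;
        case SUB4: number -= 4; break;
        case MUL5: number *= 5; break;
        case MUL6: number *= 6; break;
        }
    }
}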