Why is the "switch" so slow?

In my bytecode interpretation loop, after several tests I was surprised to find that using `switch` is the worst choice. Calling through an array of function pointers, or using gcc's computed-goto extension, is always 10–20% faster, with the computed-goto version being the fastest. I tested this both with my toy VM of 97 instructions and with the minimal fake VM pasted below.

Why is `switch` the slowest?

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <inttypes.h>
#include <time.h>

/* Bytecode opcodes. Value 0 is reserved as the halt/terminator instruction,
 * which is why the first real opcode starts at 1. */
enum {
    ADD1 = 1,
    ADD2,
    SUB3,
    SUB4,
    MUL5,
    MUL6,
};

/* Accumulator mutated by every instruction handler; unsigned so that
 * overflow wraps with defined behavior. */
static unsigned int number;

/* Opcode ADD1: add 1 to the accumulator. */
static void add1(void) {
    number = number + 1;
}

/* Opcode ADD2: add 2 to the accumulator. */
static void add2(void) {
    number = number + 2;
}

/* Opcode SUB3: subtract 3 from the accumulator. */
static void sub3(void) {
    number = number - 3;
}

/* Opcode SUB4: subtract 4 from the accumulator. */
static void sub4(void) {
    number = number - 4;
}

/* Opcode MUL5: multiply the accumulator by 5. */
static void mul5(void) {
    number = number * 5;
}

/* Opcode MUL6: multiply the accumulator by 6. */
static void mul6(void) {
    number = number * 6;
}

/* Interpreter variant 1: classic switch dispatch.
 * Executes opcodes from bcs until the halt opcode (0) is read.
 * Unknown opcodes (> MUL6) are silently skipped, matching the
 * fall-out-of-switch behavior. */
static void interpret_bytecodes_switch(uint8_t *bcs) {
    for (;;) {
        const uint8_t op = *bcs++;
        switch (op) {
        case 0:
            return;             /* halt */
        case ADD1:
            add1();
            break;
        case ADD2:
            add2();
            break;
        case SUB3:
            sub3();
            break;
        case SUB4:
            sub4();
            break;
        case MUL5:
            mul5();
            break;
        case MUL6:
            mul6();
            break;
        }
    }
}

/* Interpreter variant 2: indirect calls through a dispatch table.
 * Executes opcodes from bcs until the halt opcode (0) is read.
 *
 * Fix: the table is now static const. The original built the array on the
 * stack on every call (seven movq stores visible in the generated assembly),
 * which taxes this variant in the benchmark. Function addresses are address
 * constants, so a static initializer is valid standard C and the table is
 * materialized once, in read-only data. */
static void interpret_bytecodes_function_pointer(uint8_t *bcs) {
    static void (*const fs[])(void) = {
        NULL,   /* index 0 = halt; never called because the loop stops first */
        add1,
        add2,
        sub3,
        sub4,
        mul5,
        mul6,
    };
    while (*bcs) {
        fs[*bcs++]();
    }
}

/* Interpreter variant 3: GNU C computed goto (labels-as-values extension).
 * Executes opcodes from bcs until the halt opcode (0) is read; each handler
 * dispatches directly to the next one, with no shared loop branch.
 *
 * Fix: the label table is now static. The original initialized it on the
 * stack on every call (seven movq stores visible in the generated assembly);
 * GCC explicitly supports static arrays of &&label values, so the table is
 * built once. */
static void interpret_bytecodes_goto(uint8_t *bcs) {
    static void *const labels[] = {
        &&l_exit,
        &&l_add1,
        &&l_add2,
        &&l_sub3,
        &&l_sub4,
        &&l_mul5,
        &&l_mul6,
    };
    /* Fetch the next opcode and jump straight to its handler. */
    #define JUMP goto *labels[*bcs++]
    JUMP;
l_exit:
    return;
l_add1:
    add1();
    JUMP;
l_add2:
    add2();
    JUMP;
l_sub3:
    sub3();
    JUMP;
l_sub4:
    sub4();
    JUMP;
l_mul5:
    mul5();
    JUMP;
l_mul6:
    mul6();
    JUMP;
    #undef JUMP
}

/* Simple CPU-time stopwatch built on clock(). */
struct timer {
    clock_t start, end;   /* clock() samples taken at start/end of a run */
};

/* Records the starting CPU-time sample. */
static void timer_start(struct timer *t) {
    t->start = clock();
}

/* Records the ending CPU-time sample. */
static void timer_end(struct timer *t) {
    t->end = clock();
}

/* Returns the elapsed CPU time between start and end, in seconds. */
static double timer_measure(struct timer *t) {
    const clock_t elapsed = t->end - t->start;
    return (double)elapsed / CLOCKS_PER_SEC;
}

static void test(void (*f)(uint8_t *), uint8_t *bcs) {
    number = 0;
    struct timer timer;
    timer_start(&timer);
    f(bcs);
    timer_end(&timer);
    printf("%u %.3fs\n", number, timer_measure(&timer));
}

/* Builds a random bytecode program of N instructions plus a halt opcode,
 * prints the first few opcodes, then benchmarks all three interpreter
 * variants on the same stream (so the final accumulator values match).
 *
 * Fixes: the 300 MB malloc result is now checked before use (dereferencing
 * NULL is UB), and the buffer is freed before exit. */
int main(void) {
    const int N = 300000000;
    srand((unsigned)time(NULL));
    uint8_t *bcs = malloc((size_t)N + 1);
    if (bcs == NULL) {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }
    /* NOTE(review): % 5 + 1 yields opcodes 1..5 only, so MUL6 is never
     * generated — presumably intentional, but confirm; change to % 6 + 1
     * to exercise all six handlers. */
    for (int i = 0; i < N; ++i) {
        bcs[i] = (uint8_t)(rand() % 5 + 1);
    }
    bcs[N] = 0;   /* halt opcode terminates the stream */
    for (int i = 0; i < 10; ++i) {
        printf("%d ", bcs[i]);
    }
    printf("\nswitch\n");
    test(interpret_bytecodes_switch, bcs);
    printf("function pointer\n");
    test(interpret_bytecodes_function_pointer, bcs);
    printf("goto\n");
    test(interpret_bytecodes_goto, bcs);
    free(bcs);
    return 0;
}

      

result

~$ gcc vm.c -ovm -std=gnu11 -O3
~$ ./vm
3 4 5 3 3 5 3 3 1 2 
switch
3050839589 2.866s
function pointer
3050839589 2.573s
goto
3050839589 2.433s
~$ ./vm
3 1 1 3 5 5 2 4 5 1 
switch
3898179583 2.871s
function pointer
3898179583 2.573s
goto
3898179583 2.431s
~$ ./vm
5 5 1 2 3 3 1 2 2 4 
switch
954521520 2.869s
function pointer
954521520 2.574s
goto
954521520 2.432s

      

Below is a corresponding breakdown of the code posted here after optimization -O3

.

interpret_bytecodes_switch:
.L8:
    addq    $1, %rdi
    cmpb    $6, -1(%rdi)
    ja  .L8
    movzbl  -1(%rdi), %edx
    jmp *.L11(,%rdx,8)
.L11:
    .quad   .L10
    .quad   .L12
    .quad   .L13
    .quad   .L14
    .quad   .L15
    .quad   .L16
    .quad   .L17
.L16:
    leal    (%rax,%rax,4), %eax
    jmp .L8
.L15:
    subl    $4, %eax
    jmp .L8
.L14:
    subl    $3, %eax
    jmp .L8
.L13:
    addl    $2, %eax
    jmp .L8
.L12:
    addl    $1, %eax
    jmp .L8
.L10:
    movl    %eax, number(%rip)
    ret
.L17:
    leal    (%rax,%rax,2), %eax
    addl    %eax, %eax
    jmp .L8

interpret_bytecodes_function_pointer:
    pushq   %rbx
    movq    %rdi, %rbx
    subq    $64, %rsp
    movzbl  (%rdi), %eax
    movq    $0, (%rsp)
    movq    $add1, 8(%rsp)
    movq    $add2, 16(%rsp)
    movq    $sub3, 24(%rsp)
    movq    $sub4, 32(%rsp)
    movq    $mul5, 40(%rsp)
    testb   %al, %al
    movq    $mul6, 48(%rsp)
    je  .L19
.L23:
    addq    $1, %rbx
    call    *(%rsp,%rax,8)
    movzbl  (%rbx), %eax
    testb   %al, %al
    jne .L23
.L19:
    addq    $64, %rsp
    popq    %rbx
    ret

interpret_bytecodes_goto:
    movzbl  (%rdi), %eax
    movq    $.L27, -72(%rsp)
    addq    $2, %rdi
    movq    $.L28, -64(%rsp)
    movq    $.L29, -56(%rsp)
    movq    $.L30, -48(%rsp)
    movq    $.L31, -40(%rsp)
    movq    $.L32, -32(%rsp)
    movq    $.L33, -24(%rsp)
    movq    -72(%rsp,%rax,8), %rax
    jmp *%rax
.L33:
    movl    number(%rip), %eax
    leal    (%rax,%rax,2), %eax
    addl    %eax, %eax
    movl    %eax, number(%rip)
    movzbl  -1(%rdi), %eax
    movq    -72(%rsp,%rax,8), %rax
.L35:
    addq    $1, %rdi
    jmp *%rax
.L28:
    addl    $1, number(%rip)
    movzbl  -1(%rdi), %eax
    movq    -72(%rsp,%rax,8), %rax
    jmp .L35
.L30:
    subl    $3, number(%rip)
    movzbl  -1(%rdi), %eax
    movq    -72(%rsp,%rax,8), %rax
    jmp .L35
.L31:
    subl    $4, number(%rip)
    movzbl  -1(%rdi), %eax
    movq    -72(%rsp,%rax,8), %rax
    jmp .L35
.L32:
    movl    number(%rip), %eax
    leal    (%rax,%rax,4), %eax
    movl    %eax, number(%rip)
    movzbl  -1(%rdi), %eax
    movq    -72(%rsp,%rax,8), %rax
    jmp .L35
.L29:
    addl    $2, number(%rip)
    movzbl  -1(%rdi), %eax
    movq    -72(%rsp,%rax,8), %rax
    jmp .L35
.L27:
    rep ret

      

+3


source to share


3 answers


`switch` is the slowest because it has to handle the `default` case, which adds an extra bounds check that you didn't write yourself (the `cmpb $6 / ja` pair in the assembly).

`switch` also handles the more general case where the case values are not a simple consecutive sequence, which may require additional computation.

+3


source


I was in the middle of writing a long answer when you posted the assembly code...
Basically, the computed-goto version duplicates the dispatch code at the end of every handler instead of jumping back to a single shared dispatch point, saving a branch per iteration. It trades code size for speed.

Since your "real work" per instruction is negligible, that saving shows up clearly in the benchmark; in a real-world scenario with substantial work per instruction, the dispatch overhead would become insignificant.

+1


source


You are running a micro-benchmark. On modern processors, tiny timing differences can be caused by all sorts of random or unpredictable effects, and the actual runtime differences here are very small. Also, to make the code comparable, you combined the switch dispatch with function calls — something real time-critical code would not do (it would inline the work).

0


source







All Articles