The sum of the float array in the assembly

Question

The sum of the float array in the assembly

I am implementing a function in x86 assembly, called from a C program, that sums an array of floats. The first argument to the function is a pointer to the array, and the second is the number of elements. When I run the code on Linux, I get a segmentation fault. What have I done wrong?

# floatsum: SSE routine meant to be called from C as
#     float floatsum(float *array, size_t number_of_items);
# Both arguments are read from the stack, at 8(%ebp) and 12(%ebp).
#
# NOTE(review): only number_of_items / 4 complete groups of four floats are
# accumulated; a remainder of 1-3 trailing elements is never read.
.text
.globl floatsum

floatsum:
# Frame setup: save the caller's %ebp and anchor a new frame at %esp.
push %ebp
movl %esp, %ebp

# %eax = array pointer (first argument), %edx = element count (second).
movl  8(%ebp), %eax
movl 12(%ebp), %edx
# Turn the element count into a count of 4-float (16-byte) groups.
shrl $2, %edx

# Clear the four-lane accumulator.
xorps %xmm0, %xmm0
loop:
# Stop once no groups remain (also taken immediately for a count below 4).
testl %edx, %edx
je end  
# Aligned 16-byte load: movaps raises #GP unless the address in %eax is
# 16-byte aligned, so the caller's buffer must satisfy that requirement.
movaps (%eax), %xmm1
# Lane-wise add of the four loaded floats into the accumulator.
addps %xmm1, %xmm0
# Advance to the next group and count it off.
addl $16, %eax
decl %edx
jmp loop 

end:
# Horizontal reduction: fold the four partial sums in %xmm0 so that every
# lane ends up holding the total. The diagrams list lanes 3..0, left to
# right; the two-bit fields under each shufps are its lane selectors.
                            #         3       2      1       0
movaps %xmm0, %xmm1         # xmm0:   w       z      y       x 
                            # xmm1:   z       w      x       y
# 0xb1 swaps the two lanes inside each adjacent pair.
shufps $0xb1, %xmm1, %xmm1  #        10      11      00      01  = 0xb1
addps  %xmm1, %xmm0         # xmm0:  w+z     z+w     y+x     x+y
movaps %xmm0, %xmm1         # xmm1:  w+z     z+w     y+x     x+y
                            # xmm1:  x+y     y+x     z+w     w+z
# 0x1b reverses the order of the four lanes.
shufps $0x1b, %xmm1, %xmm1  #        00      01      10      11  = 0x1b
addps  %xmm1, %xmm0         # xmm0:  w+z+x+y z+w+y+x y+x+z+w x+y+w+z
                            #
# The two disabled instructions below would copy lane 0 of %xmm0 onto the
# stack. While they stay commented out, nothing has been pushed since the
# prologue, so (%esp) still holds the saved %ebp.
#movd %xmm0, %eax
#pushl %eax

# NOTE(review): this label is not referenced anywhere in the visible source.
finst:

# Load the 32-bit value at the top of the stack into x87 st(0) as a float,
# then drop that stack slot. Presumably st(0) is where the C caller reads
# the float return value (i386 cdecl) - confirm for the target ABI.
# With the movd/pushl pair disabled, the value loaded here is the saved
# %ebp, not the computed sum.
flds (%esp)
popl %eax

# Epilogue: %esp is reset from %ebp, so the frame is restored correctly
# whether or not the extra slot above was pushed.
movl %ebp, %esp
popl %ebp
ret

// Code C

#include <stdio.h>
#include <stdlib.h>


/*
 * Intended to return the sum of number_of_items floats starting at array.
 * Only declared here: the definition is the assembly routine exported
 * with `.globl floatsum`, which loads its input with movaps.
 * NOTE(review): movaps needs a 16-byte aligned address, so array is
 * presumably expected to be 16-byte aligned - confirm against callers.
 */
float
floatsum(float *array, size_t number_of_items);

/*
 * Plain C reference implementation: adds the first number_of_items
 * elements of array, in index order, and returns the total.
 * A count of zero yields 0 without touching array.
 */
float
floatsum_c(float *array, size_t number_of_items){
    float total = 0.0;
    const float *cursor = array;
    size_t remaining = number_of_items;

    /* Walk the array front to back so the additions happen in index order. */
    while (remaining > 0) {
        total += *cursor;
        cursor++;
        remaining--;
    }
    return total;
}

/*
 * Allocates number_of_items floats with calloc and fills them with the
 * ascending sequence 1, 2, 3, ... (element i holds 1 + i).
 * Returns the new array, or NULL when calloc returns NULL.
 * The caller releases the array with free().
 */
float *
create_array(size_t number_of_items){
    float *values = calloc(number_of_items, sizeof(float));
    size_t idx = 0;

    /* Nothing to fill when the allocation did not succeed. */
    if (values == NULL) {
        return values;
    }

    while (idx < number_of_items) {
        values[idx] = 1.0 + (float)idx;
        idx++;
    }
    return values;
}

/*
 * Test driver: builds an 8-element array with create_array and prints its
 * sum twice, first from the C reference floatsum_c and then from the
 * external floatsum routine, so the two results can be compared.
 *
 * Returns 0 on success, or EXIT_FAILURE when the array cannot be allocated.
 * The command-line arguments are not used.
 */
int
main(int argc, char **argv){
    const size_t number_of_items = 8;
    float *a;
    float result;

    /* Arguments are accepted for the conventional signature only. */
    (void)argc;
    (void)argv;

    a = create_array(number_of_items);
    if (!a) {
        /* Report the failure instead of silently exiting with success. */
        fprintf(stderr, "create_array: allocation failed\n");
        return EXIT_FAILURE;
    }

    result = floatsum_c(a, number_of_items);
    printf("Sum (c version): %f\n", result);
    result = floatsum(a, number_of_items);
    printf("Sum (asm version): %f\n", result);
    free(a);

    return 0;
}

+3

c assembly x86 sse simd

Daniele Apr 12 17 at 14:41

source to share

1 answer

Michael petch · Accepted Answer · 2017-04-12T16:20:51+0000

As Paul said, this is most likely an alignment issue. It is clear from your C code that your float array is not guaranteed to be aligned on a 16-byte boundary. The faulting instruction is this line:

movaps (%eax), %xmm1

The reason is that MOVAPS has this requirement:

When the source or destination operand is a memory operand, the operand must be 16-byte (128-bit version) or 32-byte (VEX.256 encoded version) boundary aligned, or a general protection exception (#GP) will be thrown.

Since you are working with 128 bit vector registers, you need 16 byte alignment. You have two options:

Change MOVAPS to MOVUPS so that unaligned memory accesses can be performed
Modify your C code to create an array of floats aligned on a 16 byte boundary.

The first solution requires changing:

movaps (%eax), %xmm1

to:

movups (%eax), %xmm1

The second solution is to avoid calloc and instead use a function that lets you allocate objects with 16-byte alignment. If you are using C11, you can use aligned_alloc, together with memset to zero out the array. Your create_array might look like this:

/*
 * Allocates an array of number_of_items floats at a 16-byte aligned
 * address (as MOVAPS requires) and fills element i with 1 + i.
 *
 * C11 requires the size passed to aligned_alloc to be a multiple of the
 * alignment, so the byte count is rounded up to the next multiple of 16.
 * The whole allocation, padding included, is zeroed before the fill.
 *
 * Returns NULL when the byte count would overflow size_t or when
 * aligned_alloc fails. The caller releases the array with free().
 */
float *
create_array(size_t number_of_items)
{
    const size_t alignment = 16;
    float *array = NULL;
    size_t bytes;
    size_t i;

    /* Reject counts whose padded byte size does not fit in size_t. */
    if (number_of_items > ((size_t)-1 - (alignment - 1)) / sizeof(*array)) {
        return NULL;
    }

    /* Round the request up to a whole number of 16-byte units. */
    bytes = number_of_items * sizeof(*array);
    bytes = (bytes + alignment - 1) / alignment * alignment;

    array = aligned_alloc(alignment, bytes);
    if(array){
        memset (array, 0x00, bytes);
        for(i=0; i<number_of_items; i++){
            array[i]=1.0+(float)i;
        }
    }
    return array;
}

If you are not using C11, you can use the POSIX function posix_memalign, together with memset, on Linux. The code might look something like this:

/*
 * Allocates an array of number_of_items floats at a 16-byte aligned
 * address using POSIX posix_memalign, zeroes it, and fills element i
 * with 1 + i.
 *
 * Returns NULL when the byte count would overflow size_t, when
 * posix_memalign reports an error, or when it hands back a null pointer
 * (permitted for a zero-size request). The caller releases the array
 * with free().
 */
float *
create_array(size_t number_of_items)
{
    void *block = NULL;
    float *array;
    size_t bytes;
    size_t i;

    /* Reject counts whose byte size does not fit in size_t. */
    if (number_of_items > (size_t)-1 / sizeof(*array)) {
        return NULL;
    }
    bytes = number_of_items * sizeof(*array);

    /*
     * posix_memalign stores through a void **; giving it a real void *
     * avoids writing a float * object through an incompatible lvalue.
     */
    if (posix_memalign(&block, 16, bytes) != 0 || block == NULL) {
        return NULL;
    }

    array = block;
    memset (array, 0x00, bytes);
    for(i=0; i<number_of_items; i++){
        array[i]=1.0+(float)i;
    }
    return array;
}

You will also have to uncomment these lines:

#movd %xmm0, %eax
#pushl %eax

so they look like this:

movd %xmm0, %eax
pushl %eax

Note: Although I use memset to zero out the float array the way calloc would, it is not really needed in your code, since you initialize every element to a specific value afterwards. In your case, the memset call could be removed.

The sum of the float array in the assembly

More articles: