Summing a float array in x86 assembly
I am implementing a function in x86 assembly, called from a C program, that sums an array of floats. The first argument to the function is a pointer to the array, and the second is the number of elements. When I run the code on Linux, I get a segmentation fault. What have I done wrong?
# floatsum: adds up an array of single-precision floats with SSE, four at a time.
# Declared on the C side as: float floatsum(float *array, size_t number_of_items);
# Stack arguments: 8(%ebp) = pointer to the array, 12(%ebp) = number of elements.
.text
.globl floatsum
floatsum:
push %ebp
movl %esp, %ebp
movl 8(%ebp), %eax # eax = address of the first float
movl 12(%ebp), %edx # edx = number of elements
shrl $2, %edx # edx = count / 4 = number of 16-byte groups; leftover elements (count % 4) are never added
xorps %xmm0, %xmm0 # xmm0 = four partial sums, all starting at zero
loop:
testl %edx, %edx
je end # no groups left to add
movaps (%eax), %xmm1 # load 4 floats; NOTE(review): movaps raises #GP unless the address is 16-byte aligned
addps %xmm1, %xmm0 # accumulate the 4 lanes
addl $16, %eax # advance to the next group of 4 floats
decl %edx
jmp loop
end:
# Horizontal add: combine the four lanes of xmm0 so every lane holds the total.
# Lane numbering used in the comments below (high to low):
# 3 2 1 0
movaps %xmm0, %xmm1 # xmm0: w z y x
# xmm1: z w x y
shufps $0xb1, %xmm1, %xmm1 # 10 11 00 01 = 0xb1
addps %xmm1, %xmm0 # xmm0: w+z z+w y+x x+y
movaps %xmm0, %xmm1 # xmm1: w+z z+w y+x x+y
# xmm1: x+y y+x z+w w+z
shufps $0x1b, %xmm1, %xmm1 # 00 01 10 11 = 0x1b
addps %xmm1, %xmm0 # xmm0: w+z+x+y z+w+y+x y+x+z+w x+y+w+z
#
# The two instructions below are commented out. When enabled, they copy the
# low lane of xmm0 (the total) into eax and push it, so that the flds that
# follows finds the sum at (%esp).
#movd %xmm0, %eax
#pushl %eax
finst:
# NOTE(review): with the push above disabled, (%esp) still holds the %ebp value
# saved at function entry, so flds loads those bits as a float and the popl
# below removes the saved %ebp from the stack instead of the pushed sum.
flds (%esp) # load the 32-bit float at the top of the stack into st(0), used as the return value
popl %eax # discard the stack slot that was just read
movl %ebp, %esp # reset esp to the frame base (the slot holding the saved %ebp)
popl %ebp
ret
// Code C
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Assembly implementation of the array sum (exported by the assembly code
 * with .globl floatsum). array points to the first element and
 * number_of_items is the element count.
 */
float
floatsum(float *array, size_t number_of_items);
/*
 * Reference C implementation of the sum: adds the first number_of_items
 * elements of array in index order and returns the total.
 * Returns 0 when number_of_items is 0 (array is not read in that case).
 */
float
floatsum_c(float *array, size_t number_of_items){
    float total = 0.0;
    float *cursor = array;
    size_t remaining = number_of_items;

    while (remaining > 0) {
        total += *cursor;
        cursor++;
        remaining--;
    }
    return total;
}
/*
 * Allocates number_of_items floats with calloc and fills them with
 * 1.0, 2.0, ..., number_of_items.
 * Returns the new array, or NULL if calloc fails. The caller frees it.
 */
float *
create_array(size_t number_of_items){
    float *values = calloc(number_of_items, sizeof(float));
    size_t idx = 0;

    if (!values) {
        return values;
    }
    while (idx < number_of_items) {
        values[idx] = 1.0 + (float)idx;
        idx++;
    }
    return values;
}
/*
 * Test driver: builds an 8-element array with create_array, prints its sum
 * as computed by the C reference (floatsum_c) and by the assembly routine
 * (floatsum), then frees the array.
 *
 * Returns 0 on success and EXIT_FAILURE if the array cannot be allocated,
 * so that an allocation failure is not reported as a successful run.
 */
int
main(int argc, char **argv){
    const size_t number_of_items = 8;
    float *a;
    float result;

    /* Command-line arguments are not used. */
    (void)argc;
    (void)argv;

    a = create_array(number_of_items);
    if (!a) {
        fprintf(stderr, "create_array: allocation failed\n");
        return EXIT_FAILURE;
    }

    result = floatsum_c(a, number_of_items);
    printf("Sum (c version): %f\n", result);

    result = floatsum(a, number_of_items);
    printf("Sum (asm version): %f\n", result);

    free(a);
    return 0;
}
source to share
As Paul said, this is most likely an alignment issue. It is clear from your C code that your floating-point array is not guaranteed to be aligned on a 16-byte boundary. The error is in this line:
movaps (%eax), %xmm1
The reason is that MOVAPS has this requirement:
When the source or destination operand is a memory operand, the operand must be 16-byte (128-bit version) or 32-byte (VEX.256 encoded version) boundary aligned, or a general protection exception (#GP) will be thrown.
Since you are working with 128 bit vector registers, you need 16 byte alignment. You have two options:
- Change MOVAPS to MOVUPS so that unaligned memory accesses can be performed
- Modify your C code to create an array of floats aligned on a 16 byte boundary.
The first solution requires changing:
movaps (%eax), %xmm1
to:
movups (%eax), %xmm1
The second solution is to avoid calloc and use a function that lets you allocate objects with 16-byte alignment. If you are using C11, you can use aligned_alloc (declared in <stdlib.h>) together with memset (declared in <string.h>) to zero out the array. Note that C11 requires the size passed to aligned_alloc to be a multiple of the alignment. Your create_array might look like this:
/*
 * Allocates an array of number_of_items floats at a 16-byte-aligned address
 * (the alignment movaps requires) and fills it with 1.0, 2.0, ...
 *
 * C11 aligned_alloc requires the requested size to be a multiple of the
 * alignment, so the byte count is rounded up to the next multiple of 16.
 *
 * Returns the array, or NULL if the size computation would overflow or the
 * allocation fails. The caller releases the array with free().
 */
float *
create_array(size_t number_of_items)
{
    enum { ALIGNMENT = 16 };
    float *array = NULL;
    size_t bytes;
    size_t i;

    /* Reject counts whose padded byte size does not fit in size_t. */
    if (number_of_items > (SIZE_MAX - (ALIGNMENT - 1)) / sizeof(*array)) {
        return NULL;
    }
    bytes = number_of_items * sizeof(*array);
    bytes = (bytes + (ALIGNMENT - 1)) / ALIGNMENT * ALIGNMENT;

    array = aligned_alloc(ALIGNMENT, bytes);
    if (array) {
        /* Zero the whole allocation (including padding), as calloc would. */
        memset(array, 0x00, bytes);
        for (i = 0; i < number_of_items; i++) {
            array[i] = 1.0 + (float)i;
        }
    }
    return array;
}
If you are not using C11, you can use the POSIX posix_memalign function, together with memset, on Linux. The code might look something like this:
/*
 * Allocates an array of number_of_items floats at a 16-byte-aligned address
 * (the alignment movaps requires) using POSIX posix_memalign, and fills it
 * with 1.0, 2.0, ...
 *
 * Returns the array, or NULL if the byte count would overflow or the
 * allocation fails. The caller releases the array with free().
 *
 * NOTE(review): when compiling in strict ISO mode (e.g. -std=c11) on glibc,
 * _POSIX_C_SOURCE >= 200112L must be defined before including <stdlib.h>
 * for posix_memalign to be declared - confirm against the build flags.
 */
float *
create_array(size_t number_of_items)
{
    enum { ALIGNMENT = 16 };
    void *mem = NULL;
    float *array = NULL;
    size_t bytes;
    size_t i;

    /* Reject counts whose byte size does not fit in size_t. */
    if (number_of_items > SIZE_MAX / sizeof(*array)) {
        return NULL;
    }
    bytes = number_of_items * sizeof(*array);

    /*
     * posix_memalign stores its result through a void **; hand it a real
     * void * rather than casting &array, which would write a void * into
     * an object of type float *.
     */
    if (posix_memalign(&mem, ALIGNMENT, bytes) != 0 || mem == NULL) {
        return NULL;
    }
    array = mem;

    /* Zero the array, as calloc would. */
    memset(array, 0x00, bytes);
    for (i = 0; i < number_of_items; i++) {
        array[i] = 1.0 + (float)i;
    }
    return array;
}
You will also have to uncomment these lines, so that the sum is pushed onto the stack where flds (%esp) reads it:
#movd %xmm0, %eax
#pushl %eax
so they look like this:
movd %xmm0, %eax
pushl %eax
Note: Although I use memset to zero out the floating-point array, as calloc does, it is not really needed in your code, since you initialize all elements to specific values afterwards. In your case, the memset call can be removed.
source to share