Passing a specific vector element to another vector
How can I extract the float at a given index of an __m256 vector and copy it into every element of a result vector?
pseudocode:
__m256 input = { 2, 3, 4, 5, 6, 7, 8, 9 };
__m256 output = __mm256_selectidx(input, 2);
// output [0 .. 7] now consists of input[2], that is, {4, 4, 4, 4, 4, 4, 4, 4}
The relevant intrinsics seem to be the extract / insert or permute families, but the documentation is sparse and I don't really understand it. The broadcast family also looks promising, but does it only work with memory operands?
source to share
For AVX only (i.e. without AVX2) you can do this:
#include <stdio.h>
#include <immintrin.h>
/*
 * _mm256_selectidx(v, i) - return an __m256 whose eight lanes all hold
 * lane i of v, using AVX (no AVX2) instructions only.
 *
 *   v  __m256 source vector; evaluated exactly once.
 *   i  lane index. It must be an integer constant expression because it is
 *      folded into the immediate operands of both permutes, so it is
 *      deliberately expanded textually instead of being copied to a local.
 *      Only the low three bits are used (the index is taken modulo 8), which
 *      keeps stray high bits out of the vperm2f128 control byte.
 *
 * Two steps are needed because AVX shuffles cannot move data across the two
 * 128-bit halves in a single in-lane permute:
 *   1. copy the 128-bit half containing lane i into both halves;
 *   2. replicate element (i & 3) inside each half.
 *
 * Relies on the GCC/Clang statement-expression extension. The temporaries
 * use long, suffixed names so that an argument such as a caller variable
 * called `vt` cannot be captured by the macro's own locals.
 */
#define _mm256_selectidx(v, i) ({ \
    const __m256 selectidx_src_ = (v); \
    /* 0x00 selects the low half for both outputs, 0x11 the high half. */ \
    const __m256 selectidx_half_ = _mm256_permute2f128_ps( \
        selectidx_src_, selectidx_src_, ((((i) & 7) >> 2) * 0x11)); \
    _mm256_permute_ps(selectidx_half_, \
        _MM_SHUFFLE((i) & 3, (i) & 3, (i) & 3, (i) & 3)); \
})
/* Demo driver: splat lane 2 of a known vector and print source and result. */
int main(void)
{
    float src_lanes[8];
    float dst_lanes[8];
    const __m256 src = _mm256_setr_ps(2, 3, 4, 5, 6, 7, 8, 9);
    const __m256 splat = _mm256_selectidx(src, 2);

    /* Unaligned stores: the float arrays are not declared with 32-byte alignment. */
    _mm256_storeu_ps(src_lanes, src);
    _mm256_storeu_ps(dst_lanes, splat);

    printf("v0: %g %g %g %g %g %g %g %g\n",
           src_lanes[0], src_lanes[1], src_lanes[2], src_lanes[3],
           src_lanes[4], src_lanes[5], src_lanes[6], src_lanes[7]);
    printf("v1: %g %g %g %g %g %g %g %g\n",
           dst_lanes[0], dst_lanes[1], dst_lanes[2], dst_lanes[3],
           dst_lanes[4], dst_lanes[5], dst_lanes[6], dst_lanes[7]);
    return 0;
}
Test:
$ gcc -Wall -mavx test_avx_select.c && ./a.out
v0: 2 3 4 5 6 7 8 9
v1: 4 4 4 4 4 4 4 4
Note that this code uses the gcc statement-expression extension, which lets a macro behave like a function that returns a value. If you are using a compiler that does not support this extension, you may have to use an inline function instead and hope that the compiler can still treat the index as the compile-time constant that the AVX intrinsics require.
source to share
If you have AVX2 then you can use _mm256_permutevar8x32_ps:
/* Copy lane i of the __m256 v into all eight lanes: the index is splatted
 * into an integer vector and used as the per-lane selector of the permute. */
#define _mm256_selectidx(v, i) \
    _mm256_permutevar8x32_ps((v), _mm256_set1_epi32((i)))
Note that this will probably generate multiple instructions, depending on how your compiler handles _mm256_set1_epi32 and whether the element index is a compile-time constant or not.
Demo:
#include <stdio.h>
#include <immintrin.h>
/* Copy lane i of the __m256 v into all eight lanes: the index is splatted
 * into an integer vector and used as the per-lane selector of the permute. */
#define _mm256_selectidx(v, i) \
    _mm256_permutevar8x32_ps((v), _mm256_set1_epi32((i)))
/* Demo driver: splat lane 2 of a known vector and print source and result. */
int main(void)
{
    float src_lanes[8];
    float dst_lanes[8];
    const __m256 src = _mm256_setr_ps(2, 3, 4, 5, 6, 7, 8, 9);
    const __m256 splat = _mm256_selectidx(src, 2);

    /* Unaligned stores: the float arrays are not declared with 32-byte alignment. */
    _mm256_storeu_ps(src_lanes, src);
    _mm256_storeu_ps(dst_lanes, splat);

    printf("v0: %g %g %g %g %g %g %g %g\n",
           src_lanes[0], src_lanes[1], src_lanes[2], src_lanes[3],
           src_lanes[4], src_lanes[5], src_lanes[6], src_lanes[7]);
    printf("v1: %g %g %g %g %g %g %g %g\n",
           dst_lanes[0], dst_lanes[1], dst_lanes[2], dst_lanes[3],
           dst_lanes[4], dst_lanes[5], dst_lanes[6], dst_lanes[7]);
    return 0;
}
Test:
$ gcc -Wall -mavx2 test_avx2_select.c && ./a.out
v0: 2 3 4 5 6 7 8 9
v1: 4 4 4 4 4 4 4 4
source to share
For SSE, this is much easier - you can simply use _mm_shuffle_ps:
#include <stdio.h>
#include <xmmintrin.h>
/* Replicate element i of the __m128 v into all four lanes. The index ends up
 * in the shuffle's immediate operand, so it has to be a compile-time constant. */
#define _mm_selectidx(v, i) \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((i), (i), (i), (i)))
/* Demo driver: splat element 2 of a known 4-float vector and print both vectors. */
int main(void)
{
    float src_lanes[4];
    float dst_lanes[4];
    const __m128 src = _mm_setr_ps(2, 3, 4, 5);
    const __m128 splat = _mm_selectidx(src, 2);

    /* Unaligned stores: the float arrays are not declared with 16-byte alignment. */
    _mm_storeu_ps(src_lanes, src);
    _mm_storeu_ps(dst_lanes, splat);

    printf("v0: %g %g %g %g\n",
           src_lanes[0], src_lanes[1], src_lanes[2], src_lanes[3]);
    printf("v1: %g %g %g %g\n",
           dst_lanes[0], dst_lanes[1], dst_lanes[2], dst_lanes[3]);
    return 0;
}
Test:
$ gcc -Wall -msse test_sse_select.c && ./a.out
v0: 2 3 4 5
v1: 4 4 4 4
source to share