# Intel intrinsics: multiply interleaved 8-bit values

I am working on an RGBA32 buffer (8 bits per component) and I will need to multiply each component by a constant and then add each of the multiplication results to the rest as such:

Result = r * x + g * y + b * z + a * w (dot product between two vectors rgba and xyzw)

I'm trying to use Intel SSE capabilities to speed up the process, but I don't know how to do it without shuffling the input.

Is there a way to do this? For example, could we build a register holding {x, y, z, w, x, y, z, w, x, y, z, w, x, y, z, w} and perform an 8-bit multiply with saturation?

The end goal is to multiply the RGBA vector by the appropriate color conversion matrix:

``````[ 66 129  25 0]   [R]
[-38 -74 112 0] * [G]
[112 -94 -18 0]   [B]
[0     0   0 0]   [A]
```

```

Thanks.

Edit 1: Here's a final function using floating point calculations for more color precision that converts the rgba image to YUV444 using SSE. The function takes 1.9 to 3.5 ms to convert a full HD image on an Intel i5 3570K using only a single thread (the function is very easy to multithread, which can lead to significant further performance improvements):

``````void SSE_rgba2YUV444_FP(char* a, char* y, char* u, char* v)
{
    // Convert 16 RGBA pixels (64 bytes at 'a') to planar YUV444, writing 16
    // bytes to each of y, u and v.  Float math is used for extra precision.
    //
    // NOTE(review): the scraped listing had dropped the gather-loop body and
    // the per-channel dot products (ylo1..yhi2 were used but never computed);
    // they are reconstructed below.  The +128 chroma offset on U and V is the
    // usual JPEG convention -- confirm against the original code.

    // Shuffle mask (byte offsets): rgbargbargbargba -> rrrrggggbbbbaaaa.
    __m128i mask = _mm_setr_epi8(0x00,0x04,0x08,0x0c, 0x01,0x05,0x09,0x0d, 0x02,0x06,0x0a,0x0e, 0x03,0x07,0x0b,0x0f);
    // JPEG YCbCr coefficients, one row per output channel (Y, U, V).
    float m[9] = {0.299f, 0.587f, 0.114f, -0.1687f, -0.3313f, 0.5f, 0.5f, -0.4187f, -0.0813f};

    __m128i row[4];
    for(int i=0; i<4; i++) {
        row[i] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)&a[16*i]), mask);
    }
    // row[i] = {rrrrggggbbbbaaaa}, all uint8_t.  Transpose the 4x4 dword
    // matrix so each register holds 16 samples of a single component.
    __m128i t0 = _mm_unpacklo_epi32(row[0], row[1]); // t0 = {rrrrrrrrgggggggg}
    __m128i t1 = _mm_unpacklo_epi32(row[2], row[3]); // t1 = {rrrrrrrrgggggggg}
    __m128i t2 = _mm_unpackhi_epi32(row[0], row[1]); // t2 = {bbbbbbbbaaaaaaaa}
    __m128i t3 = _mm_unpackhi_epi32(row[2], row[3]); // t3 = {bbbbbbbbaaaaaaaa}
    row[0] = _mm_unpacklo_epi64(t0, t1); // row[0] = 16 r bytes
    row[1] = _mm_unpackhi_epi64(t0, t1); // row[1] = 16 g bytes
    row[2] = _mm_unpacklo_epi64(t2, t3); // row[2] = 16 b bytes (alpha unused)

    // Zero-extend u8 -> u16 by interleaving with zero.
    __m128i v_lo[3], v_hi[3];
    for(int i=0; i<3; i++) {
        v_lo[i] = _mm_unpacklo_epi8(row[i], _mm_setzero_si128());
        v_hi[i] = _mm_unpackhi_epi8(row[i], _mm_setzero_si128());
    }

    // Widen u16 -> s32 and convert to float: for each component, four vectors
    // of four pixels (pixels 0-3, 4-7, 8-11, 12-15).
    __m128 v32[4][3];
    for(int i=0; i<3; i++) {
        v32[0][i] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_lo[i], _mm_setzero_si128()));
        v32[1][i] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_lo[i], _mm_setzero_si128()));
        v32[2][i] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_hi[i], _mm_setzero_si128()));
        v32[3][i] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_hi[i], _mm_setzero_si128()));
    }

    char* out[3] = {y, u, v}; // destination plane per channel
    for(int j=0; j<3; j++) {
        __m128 k0 = _mm_set1_ps(m[3*j+0]);
        __m128 k1 = _mm_set1_ps(m[3*j+1]);
        __m128 k2 = _mm_set1_ps(m[3*j+2]);
        // U and V are centred on 128; Y carries no offset (full-range JPEG).
        __m128 off = _mm_set1_ps(j == 0 ? 0.0f : 128.0f);

        __m128i q[4];
        for(int i=0; i<4; i++) {
            __m128 acc = _mm_add_ps(_mm_mul_ps(k0, v32[i][0]),
                         _mm_add_ps(_mm_mul_ps(k1, v32[i][1]),
                                    _mm_mul_ps(k2, v32[i][2])));
            q[i] = _mm_cvtps_epi32(_mm_add_ps(acc, off)); // round to nearest
        }
        // Narrow s32 -> u16 -> u8 with unsigned saturation, store 16 bytes.
        __m128i p_lo = _mm_packus_epi32(q[0], q[1]);
        __m128i p_hi = _mm_packus_epi32(q[2], q[3]);
        _mm_storeu_si128((__m128i*)out[j], _mm_packus_epi16(p_lo, p_hi));
    }
}
```

```
+3

source to share

Here is a solution that finds Y, U and V right away and uses only vertical operators

To do this, I first shuffle each group of four pixels like this

``````rgbargbargbargba -> rrrrggggbbbbaaaa
```

```

using the intrinsic `_mm_shuffle_epi8`

with a mask. I do this for all 16 pixels and then transpose them

from

``````row[0] : rrrrggggbbbbaaaa
row[1] : rrrrggggbbbbaaaa
row[2] : rrrrggggbbbbaaaa
row[3] : rrrrggggbbbbaaaa
```

```

to

``````row[0] : rrrrrrrrrrrrrrrr
row[1] : gggggggggggggggg
row[2] : bbbbbbbbbbbbbbbb
```

```

This is done in the same way as transposing a 4x4 integer matrix, as follows:

``````__m128i t0 = _mm_unpacklo_epi32(row[0], row[1]); // low dwords interleaved: {r0-3, r4-7, g0-3, g4-7}
__m128i t1 = _mm_unpacklo_epi32(row[2], row[3]); // {r8-11, r12-15, g8-11, g12-15}
__m128i t2 = _mm_unpackhi_epi32(row[0], row[1]); // high dwords: {b0-3, b4-7, a0-3, a4-7}
__m128i t3 = _mm_unpackhi_epi32(row[2], row[3]); // {b8-11, b12-15, a8-11, a12-15}
row[0] = _mm_unpacklo_epi64(t0, t1); // 16 r bytes
row[1] = _mm_unpackhi_epi64(t0, t1); // 16 g bytes
row[2] = _mm_unpacklo_epi64(t2, t3); // 16 b bytes (the alpha row is never needed)
```

```

Now I split each line into high and low and expand to 16 bits like this

``````__m128i v_lo[3], v_hi[3];
for(int i=0; i<3; i++) {
v_lo[i] = _mm_unpacklo_epi8(row[i],_mm_setzero_si128()); // zero-extend low 8 bytes: u8 -> u16
v_hi[i] = _mm_unpackhi_epi8(row[i],_mm_setzero_si128()); // zero-extend high 8 bytes: u8 -> u16
}
```

```

Finally, I calculate Y, U and V like this:

`````` short m[9] = {66, 129, 25, -38, -74, 112, 112, -94, -18}; // fixed-point (x256) YUV rows
__m128i yuv[3];
for(int i=0; i<3; i++) {
    __m128i yuv_lo, yuv_hi;
    // Dot product of the i-th coefficient row with the r/g/b registers.
    // (The scraped text had dropped the two "_mm_add_epi16(" wrappers; the
    // dangling "),", ")),", "));" terminators pin this reconstruction.)
    yuv_lo = _mm_add_epi16(_mm_add_epi16(
             _mm_mullo_epi16(v_lo[0], _mm_set1_epi16(m[3*i+0])),
             _mm_mullo_epi16(v_lo[1], _mm_set1_epi16(m[3*i+1]))),
             _mm_mullo_epi16(v_lo[2], _mm_set1_epi16(m[3*i+2])));
    yuv_lo = _mm_srli_epi16(yuv_lo, 8); // divide by 256

    yuv_hi = _mm_add_epi16(_mm_add_epi16(
             _mm_mullo_epi16(v_hi[0], _mm_set1_epi16(m[3*i+0])),
             _mm_mullo_epi16(v_hi[1], _mm_set1_epi16(m[3*i+1]))),
             _mm_mullo_epi16(v_hi[2], _mm_set1_epi16(m[3*i+2])));
    yuv_hi = _mm_srli_epi16(yuv_hi, 8);

    yuv[i] = _mm_packus_epi16(yuv_lo,yuv_hi); // narrow u16 -> u8 with saturation
}
```

```

For a working example of this code, see my first answer and function `rgba2yuv_SSE`

.

+2

source

Here is a solution based on the OP's and Paul R's comments. The intrinsic `_mm_maddubs_epi16` requires its second parameter to be signed, which is a problem for the coefficient `129` applied to `g`. However, we can get around this by doing this

``````y = ((66-64)*r + (129-64)*g + (25-64)*b + -64*a) + (64*r + 64*g + 64*b + 64*a)
= (2*r + 65*g - 39*b - 64*a) + 64*(r + g + b + a)
```

```

Using this, we only need 16 bit integers, and we can compute 16 `y`

bytes at a time like this:

Note that I originally subtracted 128 instead of 64, but that overflows the signed 16-bit intermediate: `255*((25-128) + (0-128)) = 255*(-231) < -32768`

.

``````__m128i yk = _mm_set1_epi32(0xc0d94102); -64,-39,64,2
__m128i y4[4];
for(int i=0; i<4; i++) {
t2 = _mm_slli_epi16(t2, 6);  //multiply by 64
}
short tmp[8];
_mm_storeu_si128((__m128i*)tmp, y4[0]);

y8_lo = _mm_srli_epi16(y8_lo, 8);

y8_hi = _mm_srli_epi16(y8_hi, 8);

__m128i y16 = _mm_packus_epi16(y8_lo,y8_hi);
```

```

Here's some code to show it works. I compared the result with the formula (with changes) from how to do rgb yuv conversion in C / C ++ , namely:

``````#define CLIP(X) ( (X) > 255 ? 255 : (X) < 0 ? 0 : X) /* clamp an int to the 0..255 byte range */
#define RGB2Y(R, G, B) CLIP(( (  66 * (0xff & R) + 129 * (0xff & G) +  25 * (0xff & B) + 128) >> 8) +  16) /* scalar 8.8 fixed-point luma reference */
```

```

Code:

``````#include <stdio.h>
#include <x86intrin.h>
#include <stdlib.h>

#define CLIP(X) ( (X) > 255 ? 255 : (X) < 0 ? 0 : X)
#define RGB2Y(R, G, B) CLIP(( (  66 * (0xff & R) + 129 * (0xff & G) +  25 * (0xff & B) + 128) >> 8) +  16)

void rgba2y_SSE_v1(char *a, char *b) {
__m128i yk = _mm_setr_epi16(66,129,25,0, 66,129,25,0);
__m128i out[4];
for(int i=0; i<4; i++) {
__m128i a4, lo, hi;
lo = _mm_unpacklo_epi8(a4,_mm_setzero_si128());
hi = _mm_unpackhi_epi8(a4,_mm_setzero_si128());

out[i] = _mm_unpackhi_epi64(lo,hi);
}
__m128i out_lo = _mm_packus_epi32(out[0], out[1]);
__m128i out_hi = _mm_packus_epi32(out[2], out[3]);

out_lo = _mm_srli_epi16(out_lo, 8);

out_hi = _mm_srli_epi16(out_hi, 8);

__m128i y16 = _mm_packus_epi16(out_lo,out_hi);
_mm_storeu_si128((__m128i*)b,y16);
}

void rgba2y_SSE_v2(char *a, char *b) {
__m128i yk = _mm_set1_epi32(0xc0d94102);
__m128i y4[4];
for(int i=0; i<4; i++) {
t2 = _mm_slli_epi16(t2, 6);
}
short tmp[8];
_mm_storeu_si128((__m128i*)tmp, y4[0]);

y8_lo = _mm_srli_epi16(y8_lo, 8);

y8_hi = _mm_srli_epi16(y8_hi, 8);

__m128i y16 = _mm_packus_epi16(y8_lo,y8_hi);
_mm_storeu_si128((__m128i*)b,y16);
}

void rgba2yuv_SSE(char *a, char *b) {
__m128i mask = _mm_setr_epi8(0x00,0x04,0x08,0x0c, 0x01,0x05,0x09,0x0d, 0x02,0x06,0x0a,0x0e, 0x03,0x07,0x0b,0x0f);
short m[9] = {66, 129, 25, -38, -74, 112, 112, -94, -18};

__m128i row[4];
for(int i=0; i<4; i++) {
}

__m128i t0 = _mm_unpacklo_epi32(row[0], row[1]);
__m128i t1 = _mm_unpacklo_epi32(row[2], row[3]);
__m128i t2 = _mm_unpackhi_epi32(row[0], row[1]);
__m128i t3 = _mm_unpackhi_epi32(row[2], row[3]);
row[0] = _mm_unpacklo_epi64(t0, t1);
row[1] = _mm_unpackhi_epi64(t0, t1);
row[2] = _mm_unpacklo_epi64(t2, t3);

__m128i v_lo[3], v_hi[3];
for(int i=0; i<3; i++) {
v_lo[i] = _mm_unpacklo_epi8(row[i],_mm_setzero_si128());
v_hi[i] = _mm_unpackhi_epi8(row[i],_mm_setzero_si128());
}

__m128i yuv[3];
for(int i=0; i<3; i++) {
__m128i yuv_lo, yuv_hi;
_mm_mullo_epi16(v_lo[0], _mm_set1_epi16(m[3*i+0])),
_mm_mullo_epi16(v_lo[1], _mm_set1_epi16(m[3*i+1]))),
_mm_mullo_epi16(v_lo[2], _mm_set1_epi16(m[3*i+2])));
yuv_lo = _mm_srli_epi16(yuv_lo, 8);

_mm_mullo_epi16(v_hi[0], _mm_set1_epi16(m[3*i+0])),
_mm_mullo_epi16(v_hi[1], _mm_set1_epi16(m[3*i+1]))),
_mm_mullo_epi16(v_hi[2], _mm_set1_epi16(m[3*i+2])));
yuv_hi = _mm_srli_epi16(yuv_hi, 8);

yuv[i] = _mm_packus_epi16(yuv_lo,yuv_hi);
}
_mm_storeu_si128((__m128i*)b,yuv[0]);
}

int main(void) {
// Smoke test: fill 16 RGBA pixels with random bytes, run the three SSE
// conversions, and print their Y planes next to the scalar RGB2Y reference
// (the sample output below shows all four rows matching).
char rgba[64];
char y1[16], y2[16], yuv[48];
for(int i=0; i<64; i++) rgba[i] = rand()%256; // rand() is unseeded, so runs are reproducible
rgba2y_SSE_v1(rgba,y1);
rgba2y_SSE_v2(rgba,y2);
rgba2yuv_SSE(rgba,yuv);

// Hex dump; SSE_v3 prints only the first 16 bytes (Y plane) of the YUV result.
printf("RGB2Y: "); for(int i=0; i<16; i++) printf("%x ", 0xff & RGB2Y(rgba[4*i+0], rgba[4*i+1], rgba[4*i+2])); printf("\n");
printf("SSE_v1 "); for(int i=0; i<16; i++) printf("%x ", 0xff & y1[i]); printf("\n");
printf("SSE_v2 "); for(int i=0; i<16; i++) printf("%x ", 0xff & y2[i]); printf("\n");
printf("SSE_v3 "); for(int i=0; i<16; i++) printf("%x ", 0xff & yuv[i]); printf("\n");

}
```

```

Output:

``````RGB2Y: 99 ad 94 e3 9a a2 60 81 45 59 49 a5 aa 9b 60 4d
SSE_v1 99 ad 94 e3 9a a2 60 81 45 59 49 a5 aa 9b 60 4d
SSE_v2 99 ad 94 e3 9a a2 60 81 45 59 49 a5 aa 9b 60 4d
SSE_v3 99 ad 94 e3 9a a2 60 81 45 59 49 a5 aa 9b 60 4d
```

```
+2

source

All Articles