Multiplication of quaternions with gcc vector expansion

I've looked at the tricks in How to multiply two quaternions with minimal instructions? and was dismayed that, by comparison, my gcc implementation seemed incomplete:

// Primary template: declared but left undefined; only explicit
// specializations (such as quat<float> below) provide storage.
template <typename T> struct quat;

// float specialization: a single 16-byte GCC extended vector holding 4
// floats.  The multiplication formulas below treat lane 3 as the scalar
// (real) part and lanes 0-2 as the imaginary parts.
template <> struct quat<float> { float __attribute__((vector_size(16))) data_; };

// Quaternion product using GCC vector extensions.  Lane 3 is the scalar
// part; the four output lanes are (one row per lane):
//  l(0)r(3) + l(1)r(2) - l(2)r(1) + l(3)r(0)
// -l(0)r(2) + l(1)r(3) + l(2)r(0) + l(3)r(1)
//  l(0)r(1) - l(1)r(0) + l(2)r(3) + l(3)r(2)
// -l(0)r(0) - l(1)r(1) - l(2)r(2) + l(3)r(3)
inline quat<float> operator*(quat<float> const& l, quat<float> const& r) noexcept
{
  using int_value_type = ::std::int32_t;
  // Same-width integer vector: used both as a __builtin_shuffle selector
  // and as the bit pattern for the sign-flipping XOR below.
  using int_vector_type = ::std::int32_t __attribute__((vector_size(16)));

  // t1 = l * broadcast(r[3])
  auto const t1(
    l.data_ *
    __builtin_shuffle(r.data_, int_vector_type{3, 3, 3, 3})
  );
  // t2 = (l1*r2, l2*r0, l0*r1, l2*r2) -- lane 3 needs its sign flipped.
  auto const t2(
    __builtin_shuffle(l.data_, int_vector_type{1, 2, 0, 2}) *
    __builtin_shuffle(r.data_, int_vector_type{2, 0, 1, 2})
  );
  // t3 = (l3*r0, l3*r1, l3*r2, l1*r1) -- lane 3 needs its sign flipped.
  auto const t3(
    __builtin_shuffle(l.data_, int_vector_type{3, 3, 3, 1}) *
    __builtin_shuffle(r.data_, int_vector_type{0, 1, 2, 1})
  );
  // t4 = (l2*r1, l0*r2, l1*r0, l0*r0) -- subtracted as a whole.
  auto const t4(
    __builtin_shuffle(l.data_, int_vector_type{2, 0, 1, 0}) *
    __builtin_shuffle(r.data_, int_vector_type{1, 2, 0, 0})
  );

  // Sign-bit mask for lane 3 only: XORing a float's bit pattern with
  // 0x80000000 negates it.  Shift an *unsigned* 1 into the top bit:
  // the original `1 << 8 * sizeof(int_value_type) - 1` left-shifted a
  // signed 1 into the sign bit (undefined behaviour before C++14) and
  // relied on `*` binding tighter than `-` to get 31; the unsigned
  // shift plus explicit cast is well-formed and avoids the narrowing
  // error in the braced initializer.
  constexpr int_vector_type const mask{
    0, 0, 0,
    int_value_type(1u << (8 * sizeof(int_value_type) - 1))  // 0x80000000
  };

  // Reinterpret t2/t3 as integer vectors, flip the sign bit of lane 3,
  // reinterpret back, then combine all partial products.
  return {
    t1 +
    decltype(t2)(int_vector_type(t2) ^ mask) +
    decltype(t3)(int_vector_type(t3) ^ mask) -
    t4
  };
}

      

This means 4 MUL, 2 ADD, 1 SUB and 2 XOR. Is there some trick in gcc vector extensions, or in the math, that I missed and that could bring my implementation closer to optimal?

EDIT: asm code with -O3 -ffast-math

Dump of assembler code for function vxl::operator*<float>(vxl::quat<float> const&, vxl::quat<float> const&):
   0x0000000000400980 <+0>:     movaps (%rsi),%xmm2
   0x0000000000400983 <+3>:     movaps (%rdi),%xmm1
   0x0000000000400986 <+6>:     movaps %xmm2,%xmm0
   0x0000000000400989 <+9>:     movaps %xmm2,%xmm4
   0x000000000040098c <+12>:    movaps %xmm1,%xmm3
   0x000000000040098f <+15>:    shufps $0xff,%xmm2,%xmm0
   0x0000000000400993 <+19>:    shufps $0x9,%xmm2,%xmm4
   0x0000000000400997 <+23>:    shufps $0x12,%xmm1,%xmm3
   0x000000000040099b <+27>:    mulps  %xmm1,%xmm0
   0x000000000040099e <+30>:    mulps  %xmm4,%xmm3
   0x00000000004009a1 <+33>:    movaps %xmm0,%xmm4
   0x00000000004009a4 <+36>:    movaps %xmm1,%xmm0
   0x00000000004009a7 <+39>:    subps  %xmm3,%xmm4
   0x00000000004009aa <+42>:    movaps %xmm2,%xmm3
   0x00000000004009ad <+45>:    shufps $0x7f,%xmm1,%xmm0
   0x00000000004009b1 <+49>:    shufps $0x64,%xmm2,%xmm3
   0x00000000004009b5 <+53>:    shufps $0x89,%xmm1,%xmm1
   0x00000000004009b9 <+57>:    shufps $0x92,%xmm2,%xmm2
   0x00000000004009bd <+61>:    mulps  %xmm3,%xmm0
   0x00000000004009c0 <+64>:    movaps 0xa9(%rip),%xmm3        # 0x400a70
   0x00000000004009c7 <+71>:    mulps  %xmm2,%xmm1
   0x00000000004009ca <+74>:    xorps  %xmm3,%xmm0
   0x00000000004009cd <+77>:    xorps  %xmm3,%xmm1
   0x00000000004009d0 <+80>:    addps  %xmm0,%xmm1
   0x00000000004009d3 <+83>:    addps  %xmm1,%xmm4
   0x00000000004009d6 <+86>:    movaps %xmm4,%xmm0
   0x00000000004009d9 <+89>:    retq

      

EDIT2: With Marc and clang's suggestion, the code becomes:

Dump of assembler code for function vxl::operator*<float>(vxl::quat<float> const&, vxl::quat<float> const&):
   0x0000000000400ae0 <+0>:     movaps (%rdi),%xmm1
   0x0000000000400ae3 <+3>:     movdqa (%rsi),%xmm2
   0x0000000000400ae7 <+7>:     pshufd $0xff,%xmm2,%xmm3
   0x0000000000400aec <+12>:    mulps  %xmm1,%xmm3
   0x0000000000400aef <+15>:    pshufd $0x89,%xmm1,%xmm0
   0x0000000000400af4 <+20>:    pshufd $0x92,%xmm2,%xmm4
   0x0000000000400af9 <+25>:    mulps  %xmm0,%xmm4
   0x0000000000400afc <+28>:    pshufd $0x7f,%xmm1,%xmm5
   0x0000000000400b01 <+33>:    pshufd $0x64,%xmm2,%xmm0
   0x0000000000400b06 <+38>:    mulps  %xmm5,%xmm0
   0x0000000000400b09 <+41>:    pshufd $0x12,%xmm1,%xmm1
   0x0000000000400b0e <+46>:    pshufd $0x9,%xmm2,%xmm2
   0x0000000000400b13 <+51>:    mulps  %xmm1,%xmm2
   0x0000000000400b16 <+54>:    addps  %xmm4,%xmm0
   0x0000000000400b19 <+57>:    xorps  0xc0(%rip),%xmm0        # 0x400be0
   0x0000000000400b20 <+64>:    addps  %xmm3,%xmm0
   0x0000000000400b23 <+67>:    subps  %xmm2,%xmm0
   0x0000000000400b26 <+70>:    retq

      

It's not bad.

+3


source to share


1 answer


You can use essentially the same code as in the question you referenced (to be honest, I haven't tested it, but I believe it should work)



#include <pmmintrin.h>

/* SSE3 implementation: _mm_hsub_ps / _mm_hadd_ps / _mm_addsub_ps come    */
/* from <pmmintrin.h>, so this requires an SSE3-capable target.           */
inline quat<float> operator*(quat<float> const& xyzw, quat<float> const& abcd) noexcept
{
    /* The product of two quaternions is:                                 */
    /* (X,Y,Z,W) = (xd+yc-zb+wa, -xc+yd+za+wb, xb-ya+zd+wc, -xa-yb-zc+wd) */

    /* wzyx: left operand with its four lanes reversed.                   */
    __m128 wzyx = _mm_shuffle_ps(xyzw.data_, xyzw.data_, _MM_SHUFFLE(0,1,2,3));
    /* baba / dcdc: low / high pair of the right operand, swapped within  */
    /* the pair and duplicated into both halves, so the horizontal ops    */
    /* below see the lane combinations they need.                         */
    __m128 baba = _mm_shuffle_ps(abcd.data_, abcd.data_, _MM_SHUFFLE(0,1,0,1));
    __m128 dcdc = _mm_shuffle_ps(abcd.data_, abcd.data_, _MM_SHUFFLE(2,3,2,3));

    /* variable names below are for parts of components of result (X,Y,Z,W) */
    /* nX stands for -X and similarly for the other components             */

    /* znxwy  = (xb - ya, zb - wa, wd - zc, yd - xc) */
    /* hsub subtracts adjacent lane pairs of each operand horizontally.   */
    __m128 ZnXWY = _mm_hsub_ps(_mm_mul_ps(xyzw.data_, baba), _mm_mul_ps(wzyx, dcdc));

    /* xzynw  = (xd + yc, zd + wc, wb + za, yb + xa) */
    /* hadd adds adjacent lane pairs of each operand horizontally.        */
    __m128 XZYnW = _mm_hadd_ps(_mm_mul_ps(xyzw.data_, dcdc), _mm_mul_ps(wzyx, baba));

    /* addsub subtracts in the even lanes and adds in the odd lanes,      */
    /* which simultaneously fixes the signs of the nX / nW partials.      */
    __m128 XZWY = _mm_addsub_ps(_mm_shuffle_ps(XZYnW, ZnXWY, _MM_SHUFFLE(3,2,1,0)),
                                _mm_shuffle_ps(ZnXWY, XZYnW, _MM_SHUFFLE(2,3,0,1)));

    /* now we only need to shuffle the components in place and return the result      */
    return quat<float>{_mm_shuffle_ps(XZWY, XZWY, _MM_SHUFFLE(2,1,3,0))};

    /* operations: 6 shuffles, 4 multiplications, 3 compound additions/subtractions   */
}

      

+1


source







All Articles