Do ARM and Intel intrinsics use the same subkey schedule for AES?

I am trying to cut in an ARMv8 AES implementation using intrinsics. I have a C++ implementation and I have an Intel intrinsics implementation.

The implementations should be equivalent, so I'm trying to use the Intel one as my blueprint for ARMv8. There are some differences, but they are accounted for. The problem is that I get different results.

// Encrypts one 16-byte block read from `in` and writes the result to `out`,
// using ARMv8 Crypto Extension intrinsics or Intel AES-NI intrinsics depending
// on which feature macro the compiler defines. If neither macro is defined the
// function body is empty.
//
//   in      16 input bytes
//   out     receives 16 output bytes
//   rdkeys  round keys; the callers in this file supply rounds+1 of them
//   rounds  round count (10 in the test program)
//
// NOTE(review): this is the version from the question and its ARM path has an
// off-by-one. The loop runs `rounds` times (keys 0..rounds-1), then one more
// AESE uses rdkeys[rounds], and the final XOR reads rdkeys[rounds+1], which is
// one element past a rounds+1 key array. The loop bound should be rounds-1.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
#if defined(__ARM_FEATURE_CRYPTO)

    uint8x16_t data = vld1q_u8(in);

    // AES encryption with ARM intrinsics.
    // Intended: rounds-1 (9 for AES128) cycles of
    // (Add, Shift, Sub) plus Mix Columns.
    // As written the loop executes `rounds` cycles, not rounds-1.
    unsigned int i;
    for (i=0; i<rounds; ++i)
    {
        // AES single round encryption
        data = vaeseq_u8(data, rdkeys[i]);
        // AES mix columns
        data = vaesmcq_u8(data);
    }
    // One round of encryption: AES, no Mix Columns
    // (i == rounds here, so this uses the last valid key and leaves i == rounds+1)
    data = vaeseq_u8(data, rdkeys[i++]);
    // Final Add (bitwise Xor); with the loop bound above this indexes rdkeys[rounds+1]
    data = veorq_u8(data, rdkeys[i]);
    vst1q_u8(out, data);

#elif defined(__AES__)

    // Intel path: XOR with rdkeys[0], then rounds-1 calls to _mm_aesenc_si128
    // using rdkeys[1..rounds-1] (rounds-2 in the loop plus one after it), then
    // _mm_aesenclast_si128 with rdkeys[rounds].
    __m128i data = _mm_loadu_si128((const __m128i*)in);
    data = _mm_xor_si128(data, rdkeys[0]);
    for (unsigned int i=1; i<rounds-1; ++i)
    {
        data = _mm_aesenc_si128(data, rdkeys[i]);
    }
    data = _mm_aesenc_si128(data, rdkeys[rounds-1]);
    data = _mm_aesenclast_si128(data, rdkeys[rounds]);
    _mm_storeu_si128((__m128i*)out, data);

#endif
}

      

At this point, I am just trying to side-step the calculation of the subkeys. I use the same set of round keys for both implementations:

// Platform aliases so AES_encrypt can share one signature across both
// intrinsic families. RoundKey is the vector type that is passed directly to
// the AES intrinsics; Byte is the element type of the input/output buffers.
// If neither feature macro is defined, neither alias is declared.
#if defined(__ARM_FEATURE_CRYPTO)
typedef uint8x16_t RoundKey;   // ARM NEON vector type
typedef uint8_t Byte;
#elif defined(__AES__)
typedef __m128i RoundKey;      // Intel SSE integer vector type
typedef uint8_t Byte;
#endif

// Avoid subkey scheduling at this point: instead of expanding a real key,
// every byte of round key i is set to (i<<4)|i. With ROUNDS = 10 that yields
// keys of all 0x00, 0x11, 0x22, ... 0xAA, which is what the "Key:" lines in
// the dumps show.
RoundKey rdkeys[ROUNDS+1];
for (size_t i=0; i<COUNTOF(rdkeys); ++i)
    memset(&rdkeys[i], (i<<4)|i, sizeof(RoundKey));

      

However, I arrive at different results. This is what the dumps produce:

Intel AES-NI :

In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF
...
Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
Data: 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07
...
Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
Data: 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA
Data: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69
...

Out: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69

      

ARMv8 AES :

In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF
...
Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
Data: C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5
...
Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
Data: C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3
Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA
Data: F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9
...
Out: F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9

      

I keep scratching my head over the results. Adding more printf calls does not help identify the problem. I'm starting to think that the Intel and ARM intrinsics use different subkey schedules.

Do ARM and Intel intrinsics use the same subkey schedule for AES?


Below is an image from a paper by Cynthia Crutchfield. It covers the mappings between Intel and ARM intrinsics.

enter image description here


Below is the complete program. The command lines used to build it are also listed.

Intel

g++ -Wall -maes aes-test.cxx -o aes-test.exe

      

ARMv8

 g++ -Wall -march=armv8-a+crc+crypto -mtune=cortex-a53 aes-test.cxx -o aes-test.exe

      

Program

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

#if defined(__ARM_FEATURE_CRYPTO)
# include <arm_neon.h>
# include <arm_acle.h>
#elif defined(__AES__)
# include <wmmintrin.h>
# include <emmintrin.h>
#endif

// Platform aliases so AES_encrypt can share one signature across both
// intrinsic families. RoundKey is the vector type that is passed directly to
// the AES intrinsics; Byte is the element type of the input/output buffers.
// If neither feature macro is defined, neither alias is declared and the rest
// of the program will not compile.
#if defined(__ARM_FEATURE_CRYPTO)
typedef uint8x16_t RoundKey;   // ARM NEON vector type
typedef uint8_t Byte;
#elif defined(__AES__)
typedef __m128i RoundKey;      // Intel SSE integer vector type
typedef uint8_t Byte;
#endif

// Element count of a true array `x`: total size divided by the size of one
// element. Only meaningful for arrays, not for pointers.
#define COUNTOF(x) (sizeof(x) / sizeof((x)[0]))

// Round count handed to AES_encrypt by main; main also sizes the key array as
// ROUNDS+1 entries.
static const unsigned int ROUNDS=10;
// Encrypts one 16-byte block; defined further down in this program.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds);
// Declared only: no definition of AES_decrypt appears in the program as shown.
void AES_decrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds);

void Print(const char* label, const Byte *in, size_t len, bool lf=false)
{
    if (label)
        printf("%s: ", label);

    for (size_t i=0; in && i<len; ++i)
        printf("%02X ", in[i]);    
    printf("\n");

    if (lf)
        printf("\n");
}

int main(int argc, char* argv[])
{
    Byte cipher[16], recover[16];
    const Byte plain[16] = {
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF
    };

    // Avoid subkey scheduling at this point
    RoundKey rdkeys[ROUNDS+1];
    for (size_t i=0; i<COUNTOF(rdkeys); ++i)
        memset(&rdkeys[i], (i<<4)|i, sizeof(rdkeys[i]));

    AES_encrypt(plain, cipher, rdkeys, ROUNDS);

    return 0;
}

// Instrumented single-block encryption used to produce the dumps quoted in the
// question. Reads 16 bytes from `in`, writes 16 bytes to `out`, and prints the
// input, each round key with the state after it was applied, and the output.
//
//   in      16 input bytes
//   out     receives 16 output bytes
//   rdkeys  round keys; main supplies ROUNDS+1 of them
//   rounds  round count (main passes ROUNDS)
//
// NOTE(review): the ARM path has an off-by-one. Its loop runs `rounds` times
// (keys 0..rounds-1), a further AESE then uses rdkeys[rounds], and the final
// XOR reads rdkeys[rounds+1], one element past the array main passes in. The
// loop bound should be rounds-1.
//
// If neither feature macro is defined, nothing writes `out` before the final
// Print reads it.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
    Print("In", in, 16);

#if defined(__ARM_FEATURE_CRYPTO)

    // Load the block
    uint8x16_t data = vld1q_u8(in);

    Print("Data (in)", (Byte*)&data, 16, true);

    // AES encryption with ARM intrinsics.
    // Intended: rounds-1 (9 for AES128) cycles of
    // (Add, Shift, Sub) plus Mix Columns.
    // As written the loop executes `rounds` cycles, not rounds-1.
    unsigned int i;
    for (i=0; i<rounds; ++i)
    {
        // AES single round encryption
        data = vaeseq_u8(data, rdkeys[i]);
        // AES mix columns
        data = vaesmcq_u8(data);

        // Dump the key just used and the resulting state
        Print("Key", (Byte*)&rdkeys[i], 16);
        Print("Data", (Byte*)&data, 16, true);
    }

    // i == rounds here: this is the last key main actually initialised
    Print("Key", (Byte*)&rdkeys[i], 16);

    // One round of encryption: AES, no Mix Columns
    data = vaeseq_u8(data, rdkeys[i++]);

    Print("Data", (Byte*)&data, 16, true);

    // Final Add (bitwise Xor); i == rounds+1 here, which is past the key array
    data = veorq_u8(data, rdkeys[i]);

    Print("Data (xor)", (Byte*)&data, 16);

    // Store the output data
    vst1q_u8(out, data);

#elif defined(__AES__)

    // Intel path: XOR with rdkeys[0], then rounds-1 calls to _mm_aesenc_si128
    // using rdkeys[1..rounds-1], then _mm_aesenclast_si128 with rdkeys[rounds].
    __m128i data = _mm_loadu_si128((const __m128i*)in);

    Print("Data (in)", (Byte*)&data, 16);

    data = _mm_xor_si128(data, rdkeys[0]);

    Print("Key", (Byte*)&rdkeys[0], 16);
    Print("Data (xor)", (Byte*)&data, 16, true);

    // Keys 1..rounds-2
    for (unsigned int i=1; i<rounds-1; ++i)
    {
        data = _mm_aesenc_si128(data, rdkeys[i]);

        Print("Key", (Byte*)&rdkeys[i], 16);
        Print("Data", (Byte*)&data, 16, true);
    }
    // Key rounds-1, applied outside the loop
    data = _mm_aesenc_si128(data, rdkeys[rounds-1]);

    Print("Key", (Byte*)&rdkeys[rounds-1], 16);
    Print("Data", (Byte*)&data, 16, true);

    // Last call, using the final key rdkeys[rounds]
    data = _mm_aesenclast_si128(data, rdkeys[rounds]);

    Print("Key", (Byte*)&rdkeys[rounds], 16);
    Print("Data", (Byte*)&data, 16, true);

    _mm_storeu_si128((__m128i*)out, data);

#endif

    Print("Out", out, 16);
}

      

+3


source to share


1 answer


Do ARM and Intel intrinsics use the same subkey schedule for AES?

The answer seems to be yes. I still need to test against a real key schedule, but I was able to get the same result for Intel and ARMv8 using the same key schedule.

It looks like there was an off-by-one error in Crutchfield's reference implementation: it should have used rounds-1, not rounds, as the loop control. This meant that I was testing ARMv8 with 11 rounds, not 10. I should have suspected this when the ARMv8 code produced F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9 instead of F9 F9 ... F9 F9.



Here's the updated code:

// Encrypts one 16-byte block with ARMv8 Crypto Extension or Intel AES-NI
// intrinsics, whichever the compiler enables. Both paths apply the same
// rounds+1 round keys in the same order and produce the same output.
//
//   in      16 input bytes (need not be aligned)
//   out     receives 16 output bytes (need not be aligned)
//   rdkeys  rounds+1 round keys, rdkeys[0] .. rdkeys[rounds]
//   rounds  number of AES rounds (10 for AES-128); must be at least 1
//
// If rounds is 0 the function returns without touching `out`: there is no
// zero-round AES, and `rounds-1` is unsigned, so it would otherwise wrap and
// run the key index far past the array.
// If neither feature macro is defined the function does nothing.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
    if (rounds == 0)
        return;

#if defined(__ARM_FEATURE_CRYPTO)

    uint8x16_t data = vld1q_u8(in);

    // AESE XORs the round key in *before* SubBytes/ShiftRows, so the ARM
    // sequence starts at key 0 and the full (MixColumns) rounds stop at
    // key rounds-2: rounds-1 iterations, not rounds.
    unsigned int i;
    for (i=0; i<rounds-1; ++i)
    {
        data = vaeseq_u8(data, rdkeys[i]);
        data = vaesmcq_u8(data);
    }

    // Final round has no MixColumns: AESE with key rounds-1, then a plain
    // XOR with the last key, rdkeys[rounds].
    data = vaeseq_u8(data, rdkeys[i++]);
    data = veorq_u8(data, rdkeys[i]);

    vst1q_u8(out, data);

#elif defined(__AES__)

    // AESENC XORs the round key in *after* the round transform, so the Intel
    // sequence needs an explicit initial XOR with key 0 and its full rounds
    // use keys 1 .. rounds-1.
    __m128i data = _mm_loadu_si128((const __m128i*)in);
    data = _mm_xor_si128(data, rdkeys[0]);

    unsigned int i;
    for (i=1; i<rounds; ++i)
    {
        data = _mm_aesenc_si128(data, rdkeys[i]);
    }

    // i == rounds here; the last round uses the final key and never indexes
    // past rdkeys[rounds], even when rounds == 1.
    data = _mm_aesenclast_si128(data, rdkeys[i]);
    _mm_storeu_si128((__m128i*)out, data);

#endif
}

      

+2


source







All Articles