Are ARM and Intel using integrated graphics for AES?
I am trying to slice the ARMv8 AES implementation using built-in functions. I have a C ++ implementation and I have an Intel built-in implementation.
The assumptions should be equivalent, so I'm trying to use Intel as my plan for ARMv8. There are some differences, but they are taken into account. The problem is I have different results.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
#if defined(__ARM_FEATURE_CRYPTO)
uint8x16_t data = vld1q_u8(in);
// AES encryption with ARM intrinsics:
// rnds-1 (9 for AES128) cycles of AES:
// (Add, Shift, Sub) plus Mix Columns
unsigned int i;
for (i=0; i<rounds; ++i)
{
// AES single round encryption
data = vaeseq_u8(data, rdkeys[i]);
// AES mix columns
data = vaesmcq_u8(data);
}
// One round of encryption: AES, no Mix Columns
data = vaeseq_u8(data, rdkeys[i++]);
// Final Add (bitwise Xor)
data = veorq_u8(data, rdkeys[i]);
vst1q_u8(out, data);
#elif defined(__AES__)
__m128i data = _mm_loadu_si128((const __m128i*)in);
data = _mm_xor_si128(data, rdkeys[0]);
for (unsigned int i=1; i<rounds-1; ++i)
{
data = _mm_aesenc_si128(data, rdkeys[i]);
}
data = _mm_aesenc_si128(data, rdkeys[rounds-1]);
data = _mm_aesenclast_si128(data, rdkeys[rounds]);
_mm_storeu_si128((__m128i*)out, data);
#endif
}
At this point, I am trying to do a count of side steps. I use the same set of round keys for both implementations:
#if defined(__ARM_FEATURE_CRYPTO)
typedef uint8x16_t RoundKey;
typedef uint8_t Byte;
#elif defined(__AES__)
typedef __m128i RoundKey;
typedef uint8_t Byte;
#endif
// Avoid subkey scheduling at this point
RoundKey rdkeys[ROUNDS+1];
for (size_t i=0; i<COUNTOF(rdkeys); ++i)
memset(&rdkeys[i], (i<<4)|i, sizeof(RoundKey));
However, I arrive at different results. This is what the landfills created:
Intel AES-NI :
In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF ... Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 Data: 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07 ... Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 Data: 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA Data: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 ... Out: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69
ARMv8 AES :
In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF ... Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 Data: C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 ... Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 Data: C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA Data: F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 ... Out: F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9
I keep scratching my head over the results. Adding more printf does not help identify the problem. I'm starting to think that Intel and ARM intrinsics are using different connection graphs.
Do ARM and Intel use integrated graphics for AES?
Below is an image from an article by Cynthia Kruchfield . It covers the mappings between Intel and ARM native functions.
Below is the complete program. The command lines for creating them are also listed.
Intel
g++ -Wall -maes aes-test.cxx -o aes-test.exe
AEMv8
g++ -Wall -march=armv8-a+crc+crypto -mtune=cortex-a53 aes-test.cxx -o aes-test.exe
Program
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#if defined(__ARM_FEATURE_CRYPTO)
# include <arm_neon.h>
# include <arm_acle.h>
#elif defined(__AES__)
# include <wmmintrin.h>
# include <emmintrin.h>
#endif
#if defined(__ARM_FEATURE_CRYPTO)
typedef uint8x16_t RoundKey;
typedef uint8_t Byte;
#elif defined(__AES__)
typedef __m128i RoundKey;
typedef uint8_t Byte;
#endif
#define COUNTOF(x) (sizeof(x)/(sizeof(x)[0]))
static const unsigned int ROUNDS=10;
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds);
void AES_decrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds);
void Print(const char* label, const Byte *in, size_t len, bool lf=false)
{
if (label)
printf("%s: ", label);
for (size_t i=0; in && i<len; ++i)
printf("%02X ", in[i]);
printf("\n");
if (lf)
printf("\n");
}
int main(int argc, char* argv[])
{
Byte cipher[16], recover[16];
const Byte plain[16] = {
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF
};
// Avoid subkey scheduling at this point
RoundKey rdkeys[ROUNDS+1];
for (size_t i=0; i<COUNTOF(rdkeys); ++i)
memset(&rdkeys[i], (i<<4)|i, sizeof(rdkeys[i]));
AES_encrypt(plain, cipher, rdkeys, ROUNDS);
return 0;
}
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
Print("In", in, 16);
#if defined(__ARM_FEATURE_CRYPTO)
// Load the block
uint8x16_t data = vld1q_u8(in);
Print("Data (in)", (Byte*)&data, 16, true);
// AES encryption with ARM intrinsics:
// rnds-1 (9 for AES128) cycles of AES:
// (Add, Shift, Sub) plus Mix Columns
unsigned int i;
for (i=0; i<rounds; ++i)
{
// AES single round encryption
data = vaeseq_u8(data, rdkeys[i]);
// AES mix columns
data = vaesmcq_u8(data);
Print("Key", (Byte*)&rdkeys[i], 16);
Print("Data", (Byte*)&data, 16, true);
}
Print("Key", (Byte*)&rdkeys[i], 16);
// One round of encryption: AES, no Mix Columns
data = vaeseq_u8(data, rdkeys[i++]);
Print("Data", (Byte*)&data, 16, true);
// Final Add (bitwise Xor)
data = veorq_u8(data, rdkeys[i]);
Print("Data (xor)", (Byte*)&data, 16);
// Store the output data
vst1q_u8(out, data);
#elif defined(__AES__)
__m128i data = _mm_loadu_si128((const __m128i*)in);
Print("Data (in)", (Byte*)&data, 16);
data = _mm_xor_si128(data, rdkeys[0]);
Print("Key", (Byte*)&rdkeys[0], 16);
Print("Data (xor)", (Byte*)&data, 16, true);
for (unsigned int i=1; i<rounds-1; ++i)
{
data = _mm_aesenc_si128(data, rdkeys[i]);
Print("Key", (Byte*)&rdkeys[i], 16);
Print("Data", (Byte*)&data, 16, true);
}
data = _mm_aesenc_si128(data, rdkeys[rounds-1]);
Print("Key", (Byte*)&rdkeys[rounds-1], 16);
Print("Data", (Byte*)&data, 16, true);
data = _mm_aesenclast_si128(data, rdkeys[rounds]);
Print("Key", (Byte*)&rdkeys[rounds], 16);
Print("Data", (Byte*)&data, 16, true);
_mm_storeu_si128((__m128i*)out, data);
#endif
Print("Out", out, 16);
}
source to share
Do ARM and Intel use integrated graphics for AES?
The answer seems to be yes. I still need to test real keyword scheduling, but I was able to get the same result as for Intel and ARMv8 using the same key plot.
It looks like Crutchfield was one by one in the control implementation. He had to use rounds-1
, not rounds
as a contour control. This meant that I was testing ARMv8 with 11 rounds, not 10. I should have suspected this when the ARMv8 code produced F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9
instead F9 F9 ... F9 F9
.
Here's the updated code:
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds) { #if defined(__ARM_FEATURE_CRYPTO) uint8x16_t data = vld1q_u8(in); unsigned int i; for (i=0; i<rounds-1; ++i) { data = vaeseq_u8(data, rdkeys[i]); data = vaesmcq_u8(data); } data = vaeseq_u8(data, rdkeys[i++]); data = veorq_u8(data, rdkeys[i]); vst1q_u8(out, data); #elif defined(__AES__) __m128i data = _mm_loadu_si128((const __m128i*)in); data = _mm_xor_si128(data, rdkeys[0]); unsigned int i; for (i=1; i<rounds-1; ++i) { data = _mm_aesenc_si128(data, rdkeys[i]); } data = _mm_aesenc_si128(data, rdkeys[i++]); data = _mm_aesenclast_si128(data, rdkeys[i]); _mm_storeu_si128((__m128i*)out, data); #endif }
source to share