Differences in CPU time between static and dynamic allocation
My goal is to investigate the CPU time discrepancies I observe between static and dynamic allocation, depending on whether the memory is accessed contiguously or not.
To make this investigation as general as possible, I ran it with both a C++ and a Fortran program. They are as simple as possible: their main part computes the product of two randomly filled matrices. Here is the C++ code:
#include <iostream>
#include <iomanip>
#include <sstream>
#include <random>
#include <string>
#include <chrono>
#include <ctime>
using namespace std;
#ifdef ALLOCATION_DYNAMIC
//
// Use a home made matrix class when dynamically allocating.
//
class matrix
{
private:
int n_;
int m_;
double *data_;
public:
matrix();
~matrix();
double* operator[](int i);
void resize(int n, int m);
double& operator()(int i, int j);
const double& operator()(int i, int j) const;
};
matrix::matrix() : n_(0), m_(0), data_(NULL)
{
return;
}
matrix::~matrix()
{
if (data_) delete[] data_;
return;
}
void matrix::resize(int n, int m)
{
if (data_) delete[] data_;
n_ = n;
m_ = m;
data_ = new double[n_ * m_];
}
inline double& matrix::operator()(int i, int j)
{
return *(data_ + i * m_ + j);
}
inline const double& matrix::operator()(int i, int j) const
{
return *(data_ + i * m_ + j);
}
#endif
// Record the optimization flag we were compiled with.
string optflag = OPTFLAG;
//
// Main program.
//
int main(int argc, char *argv[])
{
cout << "optflag " << optflag;
#ifdef ALLOCATION_DYNAMIC
int n = N;
matrix cc1;
matrix cc2;
matrix cc3;
#else
const int n = N;
// It is necessary to specify the static keyword
// because the default is "automatic", so the
// data would be placed entirely on the stack, which
// quickly overflows for larger N values.
static double cc1[N][N];
static double cc2[N][N];
static double cc3[N][N];
#endif
cout << " allocation ";
#ifdef ALLOCATION_DYNAMIC
cout << "dynamic";
if (argc > 1)
{
istringstream iss(argv[1]);
iss >> n;
}
cc1.resize(n, n);
cc2.resize(n, n);
cc3.resize(n, n);
#else
cout << "static";
#endif
cout << " N " << n << flush;
// Init.
string seed = SEED;
std::seed_seq seed_sequence (seed.begin(), seed.end());
// Standard, 64 bit based, Mersenne Twister random engine.
std::mt19937_64 generator (seed_sequence);
// Random number between [0, 1].
std::uniform_real_distribution<double> random_unity(double(0), double(1));
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
{
#ifdef ALLOCATION_DYNAMIC
cc1(i, j) = random_unity(generator);
cc2(i, j) = random_unity(generator);
cc3(i, j) = double(0);
#else
cc1[i][j] = random_unity(generator);
cc2[i][j] = random_unity(generator);
cc3[i][j] = double(0);
#endif
}
clock_t cpu_begin = clock();
auto wall_begin = std::chrono::high_resolution_clock::now();
cout << " transpose ";
#ifdef TRANSPOSE
cout << "yes";
// Transpose.
for (int i = 0; i < n; ++i)
for (int j = 0; j < i; ++j)
{
#ifdef ALLOCATION_DYNAMIC
double tmp = cc2(i, j);
cc2(i, j) = cc2(j, i);
cc2(j, i) = tmp;
#else
double tmp = cc2[i][j];
cc2[i][j] = cc2[j][i];
cc2[j][i] = tmp;
#endif
}
#else
cout << "no";
#endif
cout << flush;
// Work.
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
for (int k = 0; k < n; ++k)
{
#if defined(ALLOCATION_DYNAMIC) && defined(TRANSPOSE)
cc3(i, j) += cc1(i, k) * cc2(j, k);
#elif defined(ALLOCATION_DYNAMIC) && ! defined(TRANSPOSE)
cc3(i, j) += cc1(i, k) * cc2(k, j);
#elif ! defined(ALLOCATION_DYNAMIC) && defined(TRANSPOSE)
cc3[i][j] += cc1[i][k] * cc2[j][k];
#elif ! defined(ALLOCATION_DYNAMIC) && ! defined(TRANSPOSE)
cc3[i][j] += cc1[i][k] * cc2[k][j];
#else
#error("Wrong preprocess instructions.");
#endif
}
clock_t cpu_end = clock();
auto wall_end = std::chrono::high_resolution_clock::now();
double sum(0);
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
{
#ifdef ALLOCATION_DYNAMIC
sum += cc3(i, j);
#else
sum += cc3[i][j];
#endif
}
sum /= double(n * n);
cout << " cpu " << setprecision(16) << double(cpu_end - cpu_begin) / double(CLOCKS_PER_SEC)
<< " wall " << setprecision(16) << std::chrono::duration<double>(wall_end - wall_begin).count()
<< " sum " << setprecision(16) << sum << endl;
return 0;
}
Here is the Fortran code:
program Test
#ifdef ALLOCATION_DYNAMIC
integer :: n = N
double precision, dimension(:,:), allocatable :: cc1
double precision, dimension(:,:), allocatable :: cc2
double precision, dimension(:,:), allocatable :: cc3
#else
integer, parameter :: n = N
double precision, dimension(n,n) :: cc1
double precision, dimension(n,n) :: cc2
double precision, dimension(n,n) :: cc3
#endif
character(len = 5) :: optflag = OPTFLAG
character(len = 8) :: time = SEED
#ifdef ALLOCATION_DYNAMIC
character(len = 10) :: arg
#endif
#ifdef TRANSPOSE
double precision :: tmp
#endif
double precision :: sum
double precision :: cpu_start, cpu_end, wall_start, wall_end
integer :: clock_reading, clock_rate, clock_max
integer :: i, j, k, s
double precision, dimension(2) :: harvest
integer, dimension(:), allocatable :: seed
write(6, FMT = '(A,A)', ADVANCE = 'NO') "optflag ", optflag
write(6, FMT = '(A)', ADVANCE = 'NO') " allocation "
#ifdef ALLOCATION_DYNAMIC
write(6, FMT = '(A)', ADVANCE = 'NO') "dynamic"
if (iargc().gt.0) then
call getarg(1, arg)
read(arg, '(I8)') n
end if
#else
write(6, FMT = '(A)', ADVANCE = 'NO') "static"
#endif
write(6, FMT = '(A,I8)', ADVANCE = 'NO') " N ", n
#ifdef ALLOCATION_DYNAMIC
allocate(cc1(n, n))
allocate(cc2(n, n))
allocate(cc3(n, n))
#endif
! Init.
call random_seed(size = s)
allocate(seed(s))
seed = 0
read(time(1:2), '(I2)') seed(1)
read(time(4:5), '(I2)') seed(2)
read(time(7:8), '(I2)') seed(3)
call random_seed(put = seed)
deallocate(seed)
do i = 1, n
do j = 1, n
call random_number(harvest)
cc1(i, j) = harvest(1)
cc2(i, j) = harvest(2)
cc3(i, j) = dble(0)
enddo
enddo
write(6, FMT = '(A)', ADVANCE = 'NO') " transpose "
#ifdef TRANSPOSE
write(6, FMT = '(A)', ADVANCE = 'NO') "yes"
! Transpose.
do j = 1, n
do i = 1, j - 1
tmp = cc1(i, j)
cc1(i, j) = cc1(j, i)
cc1(j, i) = tmp
enddo
enddo
#else
write(6, FMT = '(A)', ADVANCE = 'NO') "no"
#endif
call cpu_time(cpu_start)
call system_clock (clock_reading, clock_rate, clock_max)
wall_start = dble(clock_reading) / dble(clock_rate)
! Work.
do j = 1, n
do i = 1, n
do k = 1, n
#ifdef TRANSPOSE
cc3(i, j) = cc3(i, j) + cc1(k, i) * cc2(k, j)
#else
cc3(i, j) = cc3(i, j) + cc1(i, k) * cc2(k, j)
#endif
enddo
enddo
enddo
sum = dble(0)
do j = 1, n
do i = 1, n
sum = sum + cc3(i, j)
enddo
enddo
sum = sum / (n * n)
call cpu_time(cpu_end)
call system_clock (clock_reading, clock_rate, clock_max)
wall_end = dble(clock_reading) / dble(clock_rate)
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " cpu ", cpu_end - cpu_start
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " wall ", wall_end - wall_start
write(6, FMT = '(A,F23.16)') " sum ", sum
#ifdef ALLOCATION_DYNAMIC
deallocate(cc1)
deallocate(cc2)
deallocate(cc3)
#endif
end program Test
I tried to make both programs as similar as possible, given that C/C++ is row-major whereas Fortran is column-major.
Whenever possible, matrices are read contiguously; the exception is the matrix multiplication itself, because when computing C = A x B, A is typically read row by row while B is read column by column.
Both programs can therefore be compiled in two ways: either one of the matrices A or B (which one depends on the language) is accessed non-contiguously, or that matrix is transposed beforehand so that it is read contiguously during the matrix multiplication. The latter is enabled by passing the preprocessor directive TRANSPOSE.
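To make the access pattern concrete, here is a minimal stand-alone C++ sketch of my own (independent of the test programs above, with a tiny size n and arrays a, b, c1, c2 introduced only for illustration). Without the transpose, B is walked down a column, i.e. with a stride of n doubles per step of k in row-major memory; with a pre-transposed copy of B, both operands are read contiguously along k.
#include <cstdio>
#include <vector>
int main()
{
    const int n = 4; // tiny illustrative size
    // Row-major storage: element (i, j) lives at index i * n + j.
    std::vector<double> a(n * n, 1.0), b(n * n, 2.0);
    std::vector<double> c1(n * n, 0.0), c2(n * n, 0.0);
    // Variant 1 (no transpose): b[k * n + j] jumps by n doubles at each
    // step of k, so B is accessed with a stride, not contiguously.
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            for (int k = 0; k < n; ++k)
                c1[i * n + j] += a[i * n + k] * b[k * n + j];
    // Variant 2 (transpose): build B^T once, then both a[i * n + k] and
    // bt[j * n + k] advance by one double per step of k.
    std::vector<double> bt(n * n);
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            bt[j * n + i] = b[i * n + j];
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            for (int k = 0; k < n; ++k)
                c2[i * n + j] += a[i * n + k] * bt[j * n + k];
    // Both variants compute the same product; only the traversal differs.
    std::printf("c1(0,0) = %f  c2(0,0) = %f\n", c1[0], c2[0]);
    return 0;
}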
The following lines show the full compilation commands (GCC 4.8.1):
gfortran -o f90-dynamic -cpp -Wall -pedantic -fimplicit-none -O3 -DOPTFLAG=\"-O3\" -DTRANSPOSE -DN=1000 -DSEED=\"15:11:18\" -DALLOCATION_DYNAMIC src/test.f90
gfortran -o f90-static -cpp -Wall -pedantic -fimplicit-none -O3 -DOPTFLAG=\"-O3\" -DTRANSPOSE -DN=1000 -DSEED=\"15:11:18\" src/test.f90
g++ -o cpp-dynamic -Wall -pedantic -std=gnu++0x -O3 -DALLOCATION_DYNAMIC -DN=1000 -DOPTFLAG=\"-O3\" -DSEED=\"15:11:18\" -DTRANSPOSE src/test.cpp
g++ -o cpp-static -Wall -pedantic -std=gnu++0x -O3 -DN=1000 -DOPTFLAG=\"-O3\" -DSEED=\"15:11:18\" -DTRANSPOSE src/test.cpp
These four lines build the four programs in which matrix A or B is transposed beforehand; the corresponding no-transpose variants are obtained by omitting -DTRANSPOSE. The N preprocessor definition sets the default matrix size, which must be known at compile time for the static arrays. Note that all programs are compiled with the highest level of optimization (-O3) that I know of so far.
I ran all the generated programs for matrix sizes ranging from 1000 to 5000. The results are shown in the figures below, one for each case (with and without transposition):
Host system: Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60 GHz, stack size (ulimit -s) 10240.
For each data point, I ran the same program several times until the standard deviation of the CPU time became negligible compared to its average. Squares and circles stand for Fortran and C++ respectively, red and green for dynamic and static allocation.
In the transpose test the computation times are very close; the main difference comes from the language (Fortran vs. C++), while dynamic and static allocation give practically identical times. Still, static allocation seems to be faster, especially for C++.
In the test without transposition the computation times are significantly longer, which is expected since non-contiguous memory access is slower, but the CPU time behaves differently than before:
- there seems to be some "instability" for matrix sizes between 1600 and 3400,
- the language makes no difference,
- dynamic vs. static allocation makes a major difference, whatever the language.
I would like to understand what is going on in the test without transposition:
- Why does switching from static to dynamic allocation increase the CPU time by 50% on average (averaged over N) for both C++ and Fortran?
- Are there compilation options that could overcome this?
- Why do we see this kind of instability, compared with the smooth behavior of the transpose test? Indeed, there is a slight increase for certain matrix sizes: 1600, 2400, 2800, 3200, 4000, 4800, all of which (except 2800) are divisible by 800 (800 x 2, 3, 4, 5, 6). Do you see any reason for this?
Your help would be much appreciated as the team I work for is facing the same problem: a significant increase in CPU time when switching from static to dynamic allocation in a (much larger) Fortran program.
The most important part should be the knowledge available to the compiler.
The static version has a fixed array size, which the compiler can use for better optimization. For example, the spacing between the columns of your matrix is fixed (cc3(n,1) is next to cc3(1,2) in memory for Fortran), while for a dynamic array it may be padded (an element cc3(n+1,1) could exist). In fact, looking at the output of -fopt-info-optimized, we can see that the loop in l.95 is only optimized in the static case.
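To illustrate in C++ what "knowledge available to the compiler" means here, consider the following sketch of my own (separate from the test programs; the names multiply_static and multiply_dynamic are made up). In the first function the extent N is a compile-time constant, so the trip counts and the distance between consecutive rows are fixed; in the second they depend on a run-time value n.
#include <cstdio>
#include <vector>
// Extent known at compile time (analogue of the static arrays): the trip
// counts and the distance between consecutive rows are fixed constants.
template <int N>
void multiply_static(double (&a)[N][N], double (&b)[N][N], double (&c)[N][N])
{
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            for (int k = 0; k < N; ++k)
                c[i][j] += a[i][k] * b[k][j];
}
// Extent known only at run time (analogue of the home-made matrix class):
// n, and hence the stride used in i * n + j, is a variable to the compiler.
void multiply_dynamic(const double *a, const double *b, double *c, int n)
{
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            for (int k = 0; k < n; ++k)
                c[i * n + j] += a[i * n + k] * b[k * n + j];
}
int main()
{
    const int n = 4;
    // "Static" data: the sizes are part of the type.
    static double a[n][n], b[n][n], c[n][n]; // zero-initialized
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j) { a[i][j] = 1.0; b[i][j] = 2.0; }
    multiply_static(a, b, c);
    // "Dynamic" data: flat heap buffers whose extent is a run-time value.
    std::vector<double> ad(n * n, 1.0), bd(n * n, 2.0), cd(n * n, 0.0);
    multiply_dynamic(&ad[0], &bd[0], &cd[0], n);
    std::printf("%f %f\n", c[0][0], cd[0]);
    return 0;
}
Compiling the two functions with -O3 -fopt-info-optimized lets one check, in the same spirit as above, which of the two loops the compiler reports as optimized.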
To test this, I modified your program to use linear (1-D) arrays to represent the matrices. With them I measured no significant difference in timing between static and dynamic allocation, and the 2-D version with the optimal loop order ran at the same speed.
program Test
#ifdef ALLOCATION_DYNAMIC
integer :: n = N
double precision, dimension(:), allocatable :: cc1
double precision, dimension(:), allocatable :: cc2
double precision, dimension(:), allocatable :: cc3
#else
integer, parameter :: n = N
double precision, dimension(n**2) :: cc1
double precision, dimension(n**2) :: cc2
double precision, dimension(n**2) :: cc3
#endif
character(len = 5) :: optflag = OPTFLAG
character(len = 8) :: time = SEED
#ifdef ALLOCATION_DYNAMIC
character(len = 10) :: arg
#endif
double precision :: sum
double precision :: cpu_start, cpu_end, wall_start, wall_end
integer :: clock_reading, clock_rate, clock_max
integer :: i, j, k, s
double precision, dimension(2) :: harvest
integer, dimension(:), allocatable :: seed
write(6, FMT = '(A,A)', ADVANCE = 'NO') "optflag ", optflag
write(6, FMT = '(A)', ADVANCE = 'NO') " allocation "
#ifdef ALLOCATION_DYNAMIC
write(6, FMT = '(A)', ADVANCE = 'NO') "dynamic"
if (iargc().gt.0) then
call getarg(1, arg)
read(arg, '(I8)') n
end if
#else
write(6, FMT = '(A)', ADVANCE = 'NO') "static"
#endif
write(6, FMT = '(A,I8)', ADVANCE = 'NO') " N ", n
#ifdef ALLOCATION_DYNAMIC
allocate(cc1(n**2))
allocate(cc2(n**2))
allocate(cc3(n**2))
#endif
! Init.
call random_seed(size = s)
allocate(seed(s))
seed = 0
read(time(1:2), '(I2)') seed(1)
read(time(4:5), '(I2)') seed(2)
read(time(7:8), '(I2)') seed(3)
call random_seed(put = seed)
deallocate(seed)
do i = 1, n**2
call random_number(harvest)
cc1(i) = harvest(1)
cc2(i) = harvest(2)
cc3(i) = dble(0)
enddo
write(6, FMT = '(A)', ADVANCE = 'NO') " transpose "
write(6, FMT = '(A)', ADVANCE = 'NO') "no"
call cpu_time(cpu_start)
call system_clock (clock_reading, clock_rate, clock_max)
wall_start = dble(clock_reading) / dble(clock_rate)
! Work: here both cc1 and cc2 are traversed contiguously as k varies.
do j = 1, n
do i = 1, n
do k = 1, n
cc3((j-1)*n+i) = cc3((j-1)*n+i) + cc1((i-1)*n+k) * cc2((j-1)*n+k)
enddo
enddo
enddo
sum = dble(0)
do j = 1, n**2
sum = sum + cc3(j)
enddo
sum = sum / (n * n)
call cpu_time(cpu_end)
call system_clock (clock_reading, clock_rate, clock_max)
wall_end = dble(clock_reading) / dble(clock_rate)
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " cpu ", cpu_end - cpu_start
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " wall ", wall_end - wall_start
write(6, FMT = '(A,F23.16)') " sum ", sum
#ifdef ALLOCATION_DYNAMIC
deallocate(cc1)
deallocate(cc2)
deallocate(cc3)
#endif
end program Test
It is clear that the CPU time discrepancy between static and dynamic allocation in the no-transpose test is due to
do j = 1, n
do i = 1, n
do k = 1, n
cc3(i, j) = cc3(i, j) + cc1(i, k) * cc2(k, j)
enddo
enddo
enddo
for the Fortran 90 program and
// Work.
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
for (int k = 0; k < n; ++k)
{
#if defined(ALLOCATION_DYNAMIC)
cc3(i, j) += cc1(i, k) * cc2(k, j);
#else
cc3[i][j] += cc1[i][k] * cc2[k][j];
#endif
}
for C++.
The compiler is able to perform deeper optimizations (reported by -fopt-info-optimized) when static allocation is used, in which case it outputs (for both F90 and C++), e.g. for src/test.cpp:163:
src/test.cpp:163: note: LOOP VECTORIZED.
src/test.cpp:163: note: OUTER LOOP VECTORIZED.
while it does nothing for dynamic allocation. I am very surprised by this: I do not understand why the compiler can optimize this non-contiguous memory access (cc2[k][j]) with static allocation and not with dynamic allocation (cc2(k, j)).