Differences in processor frequency between static and dynamic allocation

My goal is to investigate the cpu time discrepancies I observe between static and dynamic allocation, depending on whether the memory is accessed contiguously or not.

To make this investigation as possible as possible, I ran it with both C ++ and Fortran programs. They are as simple as possible, the main part is to calculate one matrix multiplication of two randomly filled ones. Here is the C ++ code:

#include <iostream>
#include <iomanip>
#include <sstream>
#include <random>
#include <string>
#include <chrono>
#include <ctime>

using namespace std;

#ifdef ALLOCATION_DYNAMIC
//
// Use a home made matrix class when dynamically allocating.
//

class matrix
{
private:
  int n_;
  int m_;
  double *data_;

public:

  matrix();
  ~matrix();

  double* operator[](int i);
  void resize(int n, int m);
  double& operator()(int i, int j);
  const double& operator()(int i, int j) const;
};

matrix::matrix() : n_(0), m_(0), data_(NULL)
{
  return;
}

matrix::~matrix()
{
  if (data_) delete[] data_;
  return;
}

void matrix::resize(int n, int m)
{
  if (data_) delete[] data_;
  n_ = n;
  m_ = m;
  data_ = new double[n_ * m_];
}

inline double& matrix::operator()(int i, int j)
{
  return *(data_ + i * m_ + j);
}

inline const double& matrix::operator()(int i, int j) const
{
  return *(data_ + i * m_ + j);
}
#endif

// Record the optimization flag we were compiled with.
string optflag = OPTFLAG;


//
// Main program.
//


int main(int argc, char *argv[])
{
  cout << "optflag " << optflag;

#ifdef ALLOCATION_DYNAMIC
  int n = N;
  matrix cc1;
  matrix cc2;
  matrix cc3;
#else
  const int n = N;

  // It is necessary to specify the static keyword
  // because the default is "automatic", so that
  // data is entirely put on the stack which quickly
  // get overflowed with greater N values.
  static double cc1[N][N];
  static double cc2[N][N];
  static double cc3[N][N];
#endif

  cout << " allocation ";
#ifdef ALLOCATION_DYNAMIC
  cout << "dynamic";
  if (argc > 1)
    {
      istringstream iss(argv[1]);
      iss >> n;
    }

  cc1.resize(n, n);
  cc2.resize(n, n);
  cc3.resize(n, n);
#else
  cout << "static";
#endif
  cout << " N " << n << flush;

  // Init.
  string seed = SEED;
  std::seed_seq seed_sequence (seed.begin(), seed.end());

  // Standard, 64 bit based, Mersenne Twister random engine.
  std::mt19937_64 generator (seed_sequence);

  // Random number between [0, 1].
  std::uniform_real_distribution<double> random_unity(double(0), double(1));

  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j)
    {
#ifdef ALLOCATION_DYNAMIC
      cc1(i, j) = random_unity(generator);
      cc2(i, j) = random_unity(generator);
      cc3(i, j) = double(0);
#else
      cc1[i][j] = random_unity(generator);
      cc2[i][j] = random_unity(generator);
      cc3[i][j] = double(0);
#endif
    }

  clock_t cpu_begin = clock();
  auto wall_begin = std::chrono::high_resolution_clock::now();

  cout << " transpose ";
#ifdef TRANSPOSE
  cout << "yes";
  // Transpose.

  for (int i = 0; i < n; ++i)
    for (int j = 0; j < i; ++j)
      {
#ifdef ALLOCATION_DYNAMIC
        double tmp = cc2(i, j);
        cc2(i, j) = cc2(j, i);
        cc2(j, i) = tmp;
#else
        double tmp = cc2[i][j];
        cc2[i][j] = cc2[j][i];
        cc2[j][i] = tmp;
#endif
      }
#else
  cout << "no";
#endif
  cout << flush;

  // Work.
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j)
      for (int k = 0; k < n; ++k)
        {
#if defined(ALLOCATION_DYNAMIC) && defined(TRANSPOSE)
          cc3(i, j) += cc1(i, k) * cc2(j, k);
#elif defined(ALLOCATION_DYNAMIC) && ! defined(TRANSPOSE)
          cc3(i, j) += cc1(i, k) * cc2(k, j);
#elif ! defined(ALLOCATION_DYNAMIC) && defined(TRANSPOSE)
          cc3[i][j] += cc1[i][k] * cc2[j][k];
#elif ! defined(ALLOCATION_DYNAMIC) && ! defined(TRANSPOSE)
          cc3[i][j] += cc1[i][k] * cc2[k][j];
#else
#error("Wrong preprocess instructions.");
#endif
        }

  clock_t cpu_end = clock();
  auto wall_end = std::chrono::high_resolution_clock::now();

  double sum(0);
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j)
      {
#ifdef ALLOCATION_DYNAMIC
        sum += cc3(i, j);
#else
        sum += cc3[i][j];
#endif
      }

  sum /= double(n * n);

  cout << " cpu " << setprecision(16) << double(cpu_end - cpu_begin) / double(CLOCKS_PER_SEC)
       << " wall " << setprecision(16) << std::chrono::duration<double>(wall_end - wall_begin).count()
       << " sum " << setprecision(16) << sum << endl;

  return 0;
}

      

Here is the Fortran code:

program Test

#ifdef ALLOCATION_DYNAMIC
  integer :: n = N
  double precision, dimension(:,:), allocatable :: cc1
  double precision, dimension(:,:), allocatable :: cc2
  double precision, dimension(:,:), allocatable :: cc3
#else
  integer, parameter :: n = N
  double precision, dimension(n,n) :: cc1
  double precision, dimension(n,n) :: cc2
  double precision, dimension(n,n) :: cc3
#endif

  character(len = 5) :: optflag = OPTFLAG
  character(len = 8)  :: time = SEED

#ifdef ALLOCATION_DYNAMIC
  character(len = 10) :: arg
#endif

#ifdef TRANSPOSE
  double precision :: tmp
#endif

  double precision :: sum
  double precision :: cpu_start, cpu_end, wall_start, wall_end
  integer :: clock_reading, clock_rate, clock_max

  integer :: i, j, k, s
  double precision, dimension(2) :: harvest
  integer, dimension(:), allocatable :: seed

  write(6, FMT = '(A,A)', ADVANCE = 'NO') "optflag ", optflag
  write(6, FMT = '(A)', ADVANCE = 'NO') " allocation "
#ifdef ALLOCATION_DYNAMIC
  write(6, FMT = '(A)', ADVANCE = 'NO') "dynamic"
  if (iargc().gt.0) then
     call getarg(1, arg)
     read(arg, '(I8)') n  
  end if
#else
  write(6, FMT = '(A)', ADVANCE = 'NO') "static"
#endif
  write(6, FMT = '(A,I8)', ADVANCE = 'NO') " N ", n

#ifdef ALLOCATION_DYNAMIC
  allocate(cc1(n, n))
  allocate(cc2(n, n))
  allocate(cc3(n, n))
#endif

  ! Init.
  call random_seed(size = s)
  allocate(seed(s))
  seed = 0
  read(time(1:2), '(I2)') seed(1)
  read(time(4:5), '(I2)') seed(2)
  read(time(7:8), '(I2)') seed(3)

  call random_seed(put = seed)
  deallocate(seed)

  do i = 1, n
     do j = 1, n
        call random_number(harvest)
        cc1(i, j) = harvest(1)
        cc2(i, j) = harvest(2)
        cc3(i, j) = dble(0)
     enddo
  enddo

  write(6, FMT = '(A)', ADVANCE = 'NO') " transpose "
#ifdef TRANSPOSE
  write(6, FMT = '(A)', ADVANCE = 'NO') "yes"

  ! Transpose.
  do j = 1, n
     do i = 1, j - 1
        tmp = cc1(i, j)
        cc1(i, j) = cc1(j, i)
        cc1(j, i) = tmp
     enddo
  enddo
#else
  write(6, FMT = '(A)', ADVANCE = 'NO') "no"
#endif

  call cpu_time(cpu_start)
  call system_clock (clock_reading, clock_rate, clock_max)
  wall_start = dble(clock_reading) / dble(clock_rate)

  ! Work.
  do j = 1, n
     do i = 1, n
        do k = 1, n
#ifdef TRANSPOSE
           cc3(i, j) = cc3(i, j) + cc1(k, i) * cc2(k, j)
#else
           cc3(i, j) = cc3(i, j) + cc1(i, k) * cc2(k, j)
#endif
        enddo
     enddo
  enddo

  sum = dble(0)
  do j = 1, n
     do i = 1, n
        sum = sum + cc3(i, j)
     enddo
  enddo
  sum = sum / (n * n)

  call cpu_time(cpu_end)
  call system_clock (clock_reading, clock_rate, clock_max)
  wall_end = dble(clock_reading) / dble(clock_rate)

  write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " cpu ", cpu_end - cpu_start
  write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " wall ", wall_end - wall_start
  write(6, FMT = '(A,F23.16)') " sum ", sum

#ifdef ALLOCATION_DYNAMIC
  deallocate(cc1)
  deallocate(cc2)
  deallocate(cc3)
#endif

end program Test

      

I tried to make both programs as similar as possible given that C / C ++ is string order whereas Fortran is column order.

Whenever possible matrices are read contiguously, an exception is matrix multiplication because when doing C = A x BA is usually read row by row, while B is read column.

Both programs can be compiled either by including one of the matrices A or B, depending on the language, which cannot be accessed sequentially, or by transferring the matrix A or B so that it is then read contiguously during matrix multiplication, which is achieved by passing a pre-programming instruction TRANSPOSE.

The following lines show all the details of the compilation process ((GCC) 4.8.1):

gfortran -o f90-dynamic -cpp -Wall -pedantic -fimplicit-none -O3    -DOPTFLAG=\"-O3\" -DTRANSPOSE -DN=1000 -DSEED=\"15:11:18\" -DALLOCATION_DYNAMIC src/test.f90

gfortran -o f90-static -cpp -Wall -pedantic -fimplicit-none -O3 -DOPTFLAG=\"-O3\" -DTRANSPOSE -DN=1000 -DSEED=\"15:11:18\" src/test.f90

g++ -o cpp-dynamic -Wall -pedantic -std=gnu++0x -O3 -DALLOCATION_DYNAMIC -DN=1000 -DOPTFLAG=\"-O3\" -DSEED=\"15:11:18\" -DTRANSPOSE src/test.cpp

g++ -o cpp-static -Wall -pedantic -std=gnu++0x -O3 -DN=1000 -DOPTFLAG=\"-O3\" -DSEED=\"15:11:18\" -DTRANSPOSE src/test.cpp

      

These four lines create four programs in which matrices A or B are initially wrapped. The N preprocess command initializes the default matrix size to be known at compile time using static fields. That is, all programs are compiled with the highest degree of optimization (O3) that I know so far.

I ran all the generated programs for different matrix sizes from 1000 to 5000. The results are shown in the following figures: one for each case (transposition or not):

Test with transpose

Test without transpose

Host system

Intel (R) Xeon (R) CPU E5-2670 0 @ 2.60 GHz

and the stack size (ulimit -s) 10240.

For each point, I ran the same program several times until the standard deviation of the CPU time was negligible compared to the average. Squares and circles stand respectively for Fortran and C ++, red and green for dynamic or static.

In the transposition test, the computation time is very close, especially the main difference occurs from the language (Fortran vs. C ++), the dynamic and static distribution is practically the same. However, static allocation seems to be faster, especially for C ++.

In the test without transposition, the computation time is significantly longer, which is expected, since it is slower to access memory not sequentially, but the processor time behaves differently than before:

  • there seems to be some "instability" between the 1600 and 3400 matrix sizes,
  • Language makes no distinction
  • dynamic and static allocation makes one major inconsistency in any language.

I would like to understand what is going on in the test without transposition:

  • Why does switching from static to dynamic allocation increase CPU time by an average of 50% (average per N) for C ++ and Fortran?
  • Are there ways to overcome this with some compilation options?
  • Why are we seeing some kind of instability compared to the smooth behavior of the transpose test? Indeed, there is a slight increase for some matrix sizes: 1600, 2400, 2800, 3200, 4000, 4800, all of which (except 2800) are divisible by 8 (8 x 200, 300, 400, 500, 600). Do you see the reasons for this?

Your help would be much appreciated as the team I work for is facing the same problem: a significant increase in CPU time when switching from static to dynamic allocation in a (much larger) Fortran program.

+3


source to share


2 answers


The most important part should be the knowledge available to the compiler.

The static version has a fixed array size that can be used by the compiler for better optimization. For example. the row spacing of your matrix is ​​fixed ( cc3(n,1)

is next to cc3(1,2)

in Fortran memory), while a dynamic array may be indented somewhat (an element may exist cc3(n+1,1)

. In fact, by looking at the output -fopt-info-optimized

, we can see that the loop in l.95 is only optimized in static case.



To test this, I modified your program to use linear arrays to represent matrices. The timing of the program I had no significant difference in time between static and dynamic allocation, and version 2d with the optimal order of the loop ran at the same speed.

program Test

#ifdef ALLOCATION_DYNAMIC
  integer :: n = N
  double precision, dimension(:), allocatable :: cc1
  double precision, dimension(:), allocatable :: cc2
  double precision, dimension(:), allocatable :: cc3
#else
  integer, parameter :: n = N
  double precision, dimension(n**2) :: cc1
  double precision, dimension(n**2) :: cc2
  double precision, dimension(n**2) :: cc3
#endif

  character(len = 5) :: optflag = OPTFLAG
  character(len = 8)  :: time = SEED

#ifdef ALLOCATION_DYNAMIC
  character(len = 10) :: arg
#endif

  double precision :: sum
  double precision :: cpu_start, cpu_end, wall_start, wall_end
  integer :: clock_reading, clock_rate, clock_max

  integer :: i, j, k, s
  double precision, dimension(2) :: harvest
  integer, dimension(:), allocatable :: seed

  write(6, FMT = '(A,A)', ADVANCE = 'NO') "optflag ", optflag
  write(6, FMT = '(A)', ADVANCE = 'NO') " allocation "
#ifdef ALLOCATION_DYNAMIC
  write(6, FMT = '(A)', ADVANCE = 'NO') "dynamic"
  if (iargc().gt.0) then
     call getarg(1, arg)
     read(arg, '(I8)') n  
  end if
#else
  write(6, FMT = '(A)', ADVANCE = 'NO') "static"
#endif
  write(6, FMT = '(A,I8)', ADVANCE = 'NO') " N ", n

#ifdef ALLOCATION_DYNAMIC
  allocate(cc1(n**2))
  allocate(cc2(n**2))
  allocate(cc3(n**2))
#endif

  ! Init.
  call random_seed(size = s)
  allocate(seed(s))
  seed = 0
  read(time(1:2), '(I2)') seed(1)
  read(time(4:5), '(I2)') seed(2)
  read(time(7:8), '(I2)') seed(3)

  call random_seed(put = seed)
  deallocate(seed)

  do i = 1, n**2
    call random_number(harvest)
    cc1(i) = harvest(1)
    cc2(i) = harvest(2)
    cc3(i) = dble(0)
  enddo

  write(6, FMT = '(A)', ADVANCE = 'NO') " transpose "
  write(6, FMT = '(A)', ADVANCE = 'NO') "no"

  call cpu_time(cpu_start)
  call system_clock (clock_reading, clock_rate, clock_max)
  wall_start = dble(clock_reading) / dble(clock_rate)

  ! Work.
  do j = 1, n
     do i = 1, n
        do k = 1, n
           cc3((j-1)*n+i) = cc3((j-1)*n+i) + cc1((i-1)*n+k) * cc2((j-1)*n+k)
        enddo
     enddo
  enddo

  sum = dble(0)
  do j = 1, n**2
        sum = sum + cc3(i)
  enddo
  sum = sum / (n * n)

  call cpu_time(cpu_end)
  call system_clock (clock_reading, clock_rate, clock_max)
  wall_end = dble(clock_reading) / dble(clock_rate)

  write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " cpu ", cpu_end - cpu_start
  write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " wall ", wall_end - wall_start
  write(6, FMT = '(A,F23.16)') " sum ", sum

#ifdef ALLOCATION_DYNAMIC
  deallocate(cc1)
  deallocate(cc2)
  deallocate(cc3)
#endif

end program Test

      

+1


source


It is clear that the temporary CPU discrepancy between static and dynamic allocation in the notranspose test is due to

  do j = 1, n
     do i = 1, n
        do k = 1, n
           cc3(i, j) = cc3(i, j) + cc1(i, k) * cc2(k, j)
        enddo
     enddo
  enddo

      

for Fortran 90 program and

  // Work.
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j)
      for (int k = 0; k < n; ++k)
        {
  #if defined(ALLOCATION_DYNAMIC)
          cc3(i, j) += cc1(i, k) * cc2(k, j);
  #else
          cc3[i][j] += cc1[i][k] * cc2[k][j];
  #endif
        }

      



for C ++.

The compiler is capable of doing deeper optimizations ( -fopt-info-optimize

) when using static allocation, in which case it outputs (like F90 / C ++):

Vectorization in src / test.cpp: 163

src / test.cpp: 163: note: LOOP VECTORIZED.

src / test.cpp: 163: note: OUTER LOP VECTORED.

It does nothing for dynamic allocation, I am very surprised at this, I do not understand why the compiler can optimize this non-contiguous memory access ( cc3[k][j]

) with static allocation and not with dynamic allocation ( cc3(k, j)

).

+1


source







All Articles