Differences in CPU time between static and dynamic allocation
My goal is to investigate the CPU time discrepancies I observe between static and dynamic allocation, depending on whether the memory is accessed contiguously or not.
To make this investigation as general as possible, I ran it with both a C++ and a Fortran program. They are as simple as possible: their main part computes the product of two randomly filled matrices. Here is the C++ code:
#include <iostream>
#include <iomanip>
#include <sstream>
#include <random>
#include <string>
#include <chrono>
#include <ctime>
using namespace std;
#ifdef ALLOCATION_DYNAMIC
//
// Use a home made matrix class when dynamically allocating.
//
class matrix
{
private:
int n_;
int m_;
double *data_;
public:
matrix();
~matrix();
double* operator[](int i);
void resize(int n, int m);
double& operator()(int i, int j);
const double& operator()(int i, int j) const;
};
matrix::matrix() : n_(0), m_(0), data_(NULL)
{
return;
}
matrix::~matrix()
{
if (data_) delete[] data_;
return;
}
void matrix::resize(int n, int m)
{
if (data_) delete[] data_;
n_ = n;
m_ = m;
data_ = new double[n_ * m_];
}
inline double& matrix::operator()(int i, int j)
{
return *(data_ + i * m_ + j);
}
inline const double& matrix::operator()(int i, int j) const
{
return *(data_ + i * m_ + j);
}
#endif
// Record the optimization flag we were compiled with.
string optflag = OPTFLAG;
//
// Main program.
//
int main(int argc, char *argv[])
{
cout << "optflag " << optflag;
#ifdef ALLOCATION_DYNAMIC
int n = N;
matrix cc1;
matrix cc2;
matrix cc3;
#else
const int n = N;
// It is necessary to specify the static keyword
// because the default is "automatic", so the
// data would be placed entirely on the stack, which
// quickly overflows for larger N values.
static double cc1[N][N];
static double cc2[N][N];
static double cc3[N][N];
#endif
cout << " allocation ";
#ifdef ALLOCATION_DYNAMIC
cout << "dynamic";
if (argc > 1)
{
istringstream iss(argv[1]);
iss >> n;
}
cc1.resize(n, n);
cc2.resize(n, n);
cc3.resize(n, n);
#else
cout << "static";
#endif
cout << " N " << n << flush;
// Init.
string seed = SEED;
std::seed_seq seed_sequence (seed.begin(), seed.end());
// Standard, 64 bit based, Mersenne Twister random engine.
std::mt19937_64 generator (seed_sequence);
// Random number between [0, 1].
std::uniform_real_distribution<double> random_unity(double(0), double(1));
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
{
#ifdef ALLOCATION_DYNAMIC
cc1(i, j) = random_unity(generator);
cc2(i, j) = random_unity(generator);
cc3(i, j) = double(0);
#else
cc1[i][j] = random_unity(generator);
cc2[i][j] = random_unity(generator);
cc3[i][j] = double(0);
#endif
}
clock_t cpu_begin = clock();
auto wall_begin = std::chrono::high_resolution_clock::now();
cout << " transpose ";
#ifdef TRANSPOSE
cout << "yes";
// Transpose.
for (int i = 0; i < n; ++i)
for (int j = 0; j < i; ++j)
{
#ifdef ALLOCATION_DYNAMIC
double tmp = cc2(i, j);
cc2(i, j) = cc2(j, i);
cc2(j, i) = tmp;
#else
double tmp = cc2[i][j];
cc2[i][j] = cc2[j][i];
cc2[j][i] = tmp;
#endif
}
#else
cout << "no";
#endif
cout << flush;
// Work.
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
for (int k = 0; k < n; ++k)
{
#if defined(ALLOCATION_DYNAMIC) && defined(TRANSPOSE)
cc3(i, j) += cc1(i, k) * cc2(j, k);
#elif defined(ALLOCATION_DYNAMIC) && ! defined(TRANSPOSE)
cc3(i, j) += cc1(i, k) * cc2(k, j);
#elif ! defined(ALLOCATION_DYNAMIC) && defined(TRANSPOSE)
cc3[i][j] += cc1[i][k] * cc2[j][k];
#elif ! defined(ALLOCATION_DYNAMIC) && ! defined(TRANSPOSE)
cc3[i][j] += cc1[i][k] * cc2[k][j];
#else
#error("Wrong preprocess instructions.");
#endif
}
clock_t cpu_end = clock();
auto wall_end = std::chrono::high_resolution_clock::now();
double sum(0);
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
{
#ifdef ALLOCATION_DYNAMIC
sum += cc3(i, j);
#else
sum += cc3[i][j];
#endif
}
sum /= double(n * n);
cout << " cpu " << setprecision(16) << double(cpu_end - cpu_begin) / double(CLOCKS_PER_SEC)
<< " wall " << setprecision(16) << std::chrono::duration<double>(wall_end - wall_begin).count()
<< " sum " << setprecision(16) << sum << endl;
return 0;
}
Here is the Fortran code:
program Test
#ifdef ALLOCATION_DYNAMIC
integer :: n = N
double precision, dimension(:,:), allocatable :: cc1
double precision, dimension(:,:), allocatable :: cc2
double precision, dimension(:,:), allocatable :: cc3
#else
integer, parameter :: n = N
double precision, dimension(n,n) :: cc1
double precision, dimension(n,n) :: cc2
double precision, dimension(n,n) :: cc3
#endif
character(len = 5) :: optflag = OPTFLAG
character(len = 8) :: time = SEED
#ifdef ALLOCATION_DYNAMIC
character(len = 10) :: arg
#endif
#ifdef TRANSPOSE
double precision :: tmp
#endif
double precision :: sum
double precision :: cpu_start, cpu_end, wall_start, wall_end
integer :: clock_reading, clock_rate, clock_max
integer :: i, j, k, s
double precision, dimension(2) :: harvest
integer, dimension(:), allocatable :: seed
write(6, FMT = '(A,A)', ADVANCE = 'NO') "optflag ", optflag
write(6, FMT = '(A)', ADVANCE = 'NO') " allocation "
#ifdef ALLOCATION_DYNAMIC
write(6, FMT = '(A)', ADVANCE = 'NO') "dynamic"
if (iargc().gt.0) then
call getarg(1, arg)
read(arg, '(I8)') n
end if
#else
write(6, FMT = '(A)', ADVANCE = 'NO') "static"
#endif
write(6, FMT = '(A,I8)', ADVANCE = 'NO') " N ", n
#ifdef ALLOCATION_DYNAMIC
allocate(cc1(n, n))
allocate(cc2(n, n))
allocate(cc3(n, n))
#endif
! Init.
call random_seed(size = s)
allocate(seed(s))
seed = 0
read(time(1:2), '(I2)') seed(1)
read(time(4:5), '(I2)') seed(2)
read(time(7:8), '(I2)') seed(3)
call random_seed(put = seed)
deallocate(seed)
do i = 1, n
do j = 1, n
call random_number(harvest)
cc1(i, j) = harvest(1)
cc2(i, j) = harvest(2)
cc3(i, j) = dble(0)
enddo
enddo
write(6, FMT = '(A)', ADVANCE = 'NO') " transpose "
#ifdef TRANSPOSE
write(6, FMT = '(A)', ADVANCE = 'NO') "yes"
! Transpose.
do j = 1, n
do i = 1, j - 1
tmp = cc1(i, j)
cc1(i, j) = cc1(j, i)
cc1(j, i) = tmp
enddo
enddo
#else
write(6, FMT = '(A)', ADVANCE = 'NO') "no"
#endif
call cpu_time(cpu_start)
call system_clock (clock_reading, clock_rate, clock_max)
wall_start = dble(clock_reading) / dble(clock_rate)
! Work.
do j = 1, n
do i = 1, n
do k = 1, n
#ifdef TRANSPOSE
cc3(i, j) = cc3(i, j) + cc1(k, i) * cc2(k, j)
#else
cc3(i, j) = cc3(i, j) + cc1(i, k) * cc2(k, j)
#endif
enddo
enddo
enddo
sum = dble(0)
do j = 1, n
do i = 1, n
sum = sum + cc3(i, j)
enddo
enddo
sum = sum / (n * n)
call cpu_time(cpu_end)
call system_clock (clock_reading, clock_rate, clock_max)
wall_end = dble(clock_reading) / dble(clock_rate)
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " cpu ", cpu_end - cpu_start
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " wall ", wall_end - wall_start
write(6, FMT = '(A,F23.16)') " sum ", sum
#ifdef ALLOCATION_DYNAMIC
deallocate(cc1)
deallocate(cc2)
deallocate(cc3)
#endif
end program Test
I tried to make both programs as similar as possible, given that C/C++ is row-major whereas Fortran is column-major.
Whenever possible, matrices are read contiguously; the exception is the matrix multiplication itself, because when computing C = A x B, A is typically read row by row while B is read column by column.
Both programs can therefore be compiled in two ways: either one of the matrices A or B (which one depends on the language) is accessed non-contiguously, or that matrix is transposed beforehand so that it is read contiguously during the matrix multiplication. The latter is enabled by passing the preprocessor directive TRANSPOSE.
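To make the access pattern concrete, here is a minimal stand-alone C++ sketch of my own (independent of the test programs above, with a tiny size n and arrays a, b, c1, c2 introduced only for illustration). Without the transpose, B is walked down a column, i.e. with a stride of n doubles per step of k in row-major memory; with a pre-transposed copy of B, both operands are read contiguously along k.
#include <cstdio>
#include <vector>
int main()
{
    const int n = 4; // tiny illustrative size
    // Row-major storage: element (i, j) lives at index i * n + j.
    std::vector<double> a(n * n, 1.0), b(n * n, 2.0);
    std::vector<double> c1(n * n, 0.0), c2(n * n, 0.0);
    // Variant 1 (no transpose): b[k * n + j] jumps by n doubles at each
    // step of k, so B is accessed with a stride, not contiguously.
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            for (int k = 0; k < n; ++k)
                c1[i * n + j] += a[i * n + k] * b[k * n + j];
    // Variant 2 (transpose): build B^T once, then both a[i * n + k] and
    // bt[j * n + k] advance by one double per step of k.
    std::vector<double> bt(n * n);
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            bt[j * n + i] = b[i * n + j];
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            for (int k = 0; k < n; ++k)
                c2[i * n + j] += a[i * n + k] * bt[j * n + k];
    // Both variants compute the same product; only the traversal differs.
    std::printf("c1(0,0) = %f  c2(0,0) = %f\n", c1[0], c2[0]);
    return 0;
}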
The following lines show the full compilation commands (GCC 4.8.1):
gfortran -o f90-dynamic -cpp -Wall -pedantic -fimplicit-none -O3 -DOPTFLAG=\"-O3\" -DTRANSPOSE -DN=1000 -DSEED=\"15:11:18\" -DALLOCATION_DYNAMIC src/test.f90
gfortran -o f90-static -cpp -Wall -pedantic -fimplicit-none -O3 -DOPTFLAG=\"-O3\" -DTRANSPOSE -DN=1000 -DSEED=\"15:11:18\" src/test.f90
g++ -o cpp-dynamic -Wall -pedantic -std=gnu++0x -O3 -DALLOCATION_DYNAMIC -DN=1000 -DOPTFLAG=\"-O3\" -DSEED=\"15:11:18\" -DTRANSPOSE src/test.cpp
g++ -o cpp-static -Wall -pedantic -std=gnu++0x -O3 -DN=1000 -DOPTFLAG=\"-O3\" -DSEED=\"15:11:18\" -DTRANSPOSE src/test.cpp
These four lines build the four programs in which matrix A or B is transposed beforehand; the corresponding no-transpose variants are obtained by omitting -DTRANSPOSE. The N preprocessor definition sets the default matrix size, which must be known at compile time for the static arrays. Note that all programs are compiled with the highest level of optimization (-O3) that I know of so far.
I ran all the generated programs for matrix sizes ranging from 1000 to 5000. The results are shown in the figures below, one for each case (with and without transposition):
Host system: Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60 GHz, stack size (ulimit -s) 10240.
For each data point, I ran the same program several times until the standard deviation of the CPU time became negligible compared to its average. Squares and circles stand for Fortran and C++ respectively, red and green for dynamic and static allocation.
In the transpose test the computation times are very close; the main difference comes from the language (Fortran vs. C++), while dynamic and static allocation give practically identical times. Still, static allocation seems to be faster, especially for C++.
In the test without transposition the computation times are significantly longer, which is expected since non-contiguous memory access is slower, but the CPU time behaves differently than before:
- there seems to be some "instability" for matrix sizes between 1600 and 3400,
- the language makes no difference,
- dynamic vs. static allocation makes a major difference, whatever the language.
I would like to understand what is going on in the test without transposition:
- Why does switching from static to dynamic allocation increase the CPU time by 50% on average (averaged over N) for both C++ and Fortran?
- Are there compilation options that could overcome this?
- Why do we see this kind of instability, compared with the smooth behavior of the transpose test? Indeed, there is a slight increase for certain matrix sizes: 1600, 2400, 2800, 3200, 4000, 4800, all of which (except 2800) are divisible by 800 (800 x 2, 3, 4, 5, 6). Do you see any reason for this?
Your help would be much appreciated as the team I work for is facing the same problem: a significant increase in CPU time when switching from static to dynamic allocation in a (much larger) Fortran program.
The most important part should be the knowledge available to the compiler.
The static version has a fixed array size, which the compiler can use for better optimization. For example, the spacing between the columns of your matrix is fixed (cc3(n,1) is next to cc3(1,2) in memory for Fortran), while for a dynamic array it may be padded (an element cc3(n+1,1) could exist). In fact, looking at the output of -fopt-info-optimized, we can see that the loop in l.95 is only optimized in the static case.
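To illustrate in C++ what "knowledge available to the compiler" means here, consider the following sketch of my own (separate from the test programs; the names multiply_static and multiply_dynamic are made up). In the first function the extent N is a compile-time constant, so the trip counts and the distance between consecutive rows are fixed; in the second they depend on a run-time value n.
#include <cstdio>
#include <vector>
// Extent known at compile time (analogue of the static arrays): the trip
// counts and the distance between consecutive rows are fixed constants.
template <int N>
void multiply_static(double (&a)[N][N], double (&b)[N][N], double (&c)[N][N])
{
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            for (int k = 0; k < N; ++k)
                c[i][j] += a[i][k] * b[k][j];
}
// Extent known only at run time (analogue of the home-made matrix class):
// n, and hence the stride used in i * n + j, is a variable to the compiler.
void multiply_dynamic(const double *a, const double *b, double *c, int n)
{
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            for (int k = 0; k < n; ++k)
                c[i * n + j] += a[i * n + k] * b[k * n + j];
}
int main()
{
    const int n = 4;
    // "Static" data: the sizes are part of the type.
    static double a[n][n], b[n][n], c[n][n]; // zero-initialized
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j) { a[i][j] = 1.0; b[i][j] = 2.0; }
    multiply_static(a, b, c);
    // "Dynamic" data: flat heap buffers whose extent is a run-time value.
    std::vector<double> ad(n * n, 1.0), bd(n * n, 2.0), cd(n * n, 0.0);
    multiply_dynamic(&ad[0], &bd[0], &cd[0], n);
    std::printf("%f %f\n", c[0][0], cd[0]);
    return 0;
}
Compiling the two functions with -O3 -fopt-info-optimized lets one check, in the same spirit as above, which of the two loops the compiler reports as optimized.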
To test this, I modified your program to use linear (1-D) arrays to represent the matrices. With them I measured no significant difference in timing between static and dynamic allocation, and the 2-D version with the optimal loop order ran at the same speed.
program Test
#ifdef ALLOCATION_DYNAMIC
integer :: n = N
double precision, dimension(:), allocatable :: cc1
double precision, dimension(:), allocatable :: cc2
double precision, dimension(:), allocatable :: cc3
#else
integer, parameter :: n = N
double precision, dimension(n**2) :: cc1
double precision, dimension(n**2) :: cc2
double precision, dimension(n**2) :: cc3
#endif
character(len = 5) :: optflag = OPTFLAG
character(len = 8) :: time = SEED
#ifdef ALLOCATION_DYNAMIC
character(len = 10) :: arg
#endif
double precision :: sum
double precision :: cpu_start, cpu_end, wall_start, wall_end
integer :: clock_reading, clock_rate, clock_max
integer :: i, j, k, s
double precision, dimension(2) :: harvest
integer, dimension(:), allocatable :: seed
write(6, FMT = '(A,A)', ADVANCE = 'NO') "optflag ", optflag
write(6, FMT = '(A)', ADVANCE = 'NO') " allocation "
#ifdef ALLOCATION_DYNAMIC
write(6, FMT = '(A)', ADVANCE = 'NO') "dynamic"
if (iargc().gt.0) then
call getarg(1, arg)
read(arg, '(I8)') n
end if
#else
write(6, FMT = '(A)', ADVANCE = 'NO') "static"
#endif
write(6, FMT = '(A,I8)', ADVANCE = 'NO') " N ", n
#ifdef ALLOCATION_DYNAMIC
allocate(cc1(n**2))
allocate(cc2(n**2))
allocate(cc3(n**2))
#endif
! Init.
call random_seed(size = s)
allocate(seed(s))
seed = 0
read(time(1:2), '(I2)') seed(1)
read(time(4:5), '(I2)') seed(2)
read(time(7:8), '(I2)') seed(3)
call random_seed(put = seed)
deallocate(seed)
do i = 1, n**2
call random_number(harvest)
cc1(i) = harvest(1)
cc2(i) = harvest(2)
cc3(i) = dble(0)
enddo
write(6, FMT = '(A)', ADVANCE = 'NO') " transpose "
write(6, FMT = '(A)', ADVANCE = 'NO') "no"
call cpu_time(cpu_start)
call system_clock (clock_reading, clock_rate, clock_max)
wall_start = dble(clock_reading) / dble(clock_rate)
! Work: here both cc1 and cc2 are traversed contiguously as k varies.
do j = 1, n
do i = 1, n
do k = 1, n
cc3((j-1)*n+i) = cc3((j-1)*n+i) + cc1((i-1)*n+k) * cc2((j-1)*n+k)
enddo
enddo
enddo
sum = dble(0)
do j = 1, n**2
sum = sum + cc3(j)
enddo
sum = sum / (n * n)
call cpu_time(cpu_end)
call system_clock (clock_reading, clock_rate, clock_max)
wall_end = dble(clock_reading) / dble(clock_rate)
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " cpu ", cpu_end - cpu_start
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " wall ", wall_end - wall_start
write(6, FMT = '(A,F23.16)') " sum ", sum
#ifdef ALLOCATION_DYNAMIC
deallocate(cc1)
deallocate(cc2)
deallocate(cc3)
#endif
end program Test
It is clear that the CPU time discrepancy between static and dynamic allocation in the no-transpose test is due to
do j = 1, n
do i = 1, n
do k = 1, n
cc3(i, j) = cc3(i, j) + cc1(i, k) * cc2(k, j)
enddo
enddo
enddo
for the Fortran 90 program and
// Work.
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
for (int k = 0; k < n; ++k)
{
#if defined(ALLOCATION_DYNAMIC)
cc3(i, j) += cc1(i, k) * cc2(k, j);
#else
cc3[i][j] += cc1[i][k] * cc2[k][j];
#endif
}
for C++.
The compiler is able to perform deeper optimizations (reported by -fopt-info-optimized) when static allocation is used, in which case it outputs (for both F90 and C++), e.g. for src/test.cpp:163:
src/test.cpp:163: note: LOOP VECTORIZED.
src/test.cpp:163: note: OUTER LOOP VECTORIZED.
while it does nothing for dynamic allocation. I am very surprised by this: I do not understand why the compiler can optimize this non-contiguous memory access (cc2[k][j]) with static allocation and not with dynamic allocation (cc2(k, j)).