Comparison of MPI_Send / Recv and MPI_Scatter / Gather

I am using MPI to split a matrix and send the blocks to N processes, but I found that MPI_Scatter / Gather was not efficient enough, so I wrote two programs to compare MPI_Send / Recv with MPI_Scatter / Gather.

MPI_Send / Recv:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include <mpi.h>
#include <string.h>

#define MASTER 0

double* create_matrix(uint32_t nrow, uint32_t ncol);
double* zero_matrix(uint32_t nrow, uint32_t ncol);
double* create_vector(uint32_t n);
int print_matrix(double *m, uint32_t nrow, uint32_t ncol);

int main( int argc, char** argv )
{
    double *A, *B, *C, *A_buf, *C_buf;
    double t_start, t_end, buf;
    uint32_t M; //number of rows
    uint32_t N; //number of columns
    uint32_t nrows;
    int size, rank; //MPI_Comm_size / MPI_Comm_rank expect int
    MPI_Datatype MPI_MATRIX, MPI_VECTOR, MPI_SUB_VECTOR;
    MPI_Status status;

    M = (atoi(argv[1]) > 0)?atoi(argv[1]):1;
    N = (atoi(argv[2]) > 0)?atoi(argv[2]):1;

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    nrows = M/size;
    //create derived data type
    MPI_Type_contiguous(nrows*N, MPI_DOUBLE, &MPI_MATRIX);
    MPI_Type_commit(&MPI_MATRIX);
    MPI_Type_contiguous(N, MPI_DOUBLE, &MPI_VECTOR);
    MPI_Type_commit(&MPI_VECTOR);
    MPI_Type_contiguous(nrows, MPI_DOUBLE, &MPI_SUB_VECTOR);
    MPI_Type_commit(&MPI_SUB_VECTOR);

    if(rank == MASTER)
    {
        //A: M*N
        A = create_matrix(M, N);
        C = create_matrix(M, 1);

        if(A == NULL || C == NULL)
        {
            printf( "Allocation of matrix failed.\n" );
            exit(EXIT_FAILURE);
        }
    }

    B = create_vector(N);
    A_buf = create_matrix(nrows, N);
    C_buf = zero_matrix(nrows, 1);

    if(B == NULL || A_buf == NULL || C_buf == NULL)
    {
        printf( "Allocation of matrix failed.\n" );
        exit(EXIT_FAILURE);
    }

    if(rank == MASTER)
    {
        //exclude the time of establishing TCP connections
        for(int i = 1;i < size;i++)
            MPI_Send(&buf, 1, MPI_DOUBLE, i, 0, MPI_COMM_WORLD);

        t_start = MPI_Wtime();
        for(int i = 0;i < nrows*N;i++)
            A_buf[i] = A[i];

        //send submatrix to other processes
        for(int i = 1;i < size;i++)
        {
            MPI_Send(&A[i*nrows*N], 1, MPI_MATRIX, i, 0, MPI_COMM_WORLD);
            MPI_Send(B, 1, MPI_VECTOR, i, 0, MPI_COMM_WORLD);
        }
    }
    else
    {
        //receive to establish connection with MASTER
        MPI_Recv(&buf, 1, MPI_DOUBLE, MASTER, 0, MPI_COMM_WORLD, &status);

        //receive matrix
        MPI_Recv(A_buf, 1, MPI_MATRIX, MASTER, 0, MPI_COMM_WORLD, &status);    
        MPI_Recv(B, 1, MPI_VECTOR, MASTER, 0, MPI_COMM_WORLD, &status);
    }
    MPI_Barrier(MPI_COMM_WORLD);

    if(rank == MASTER)
    {
        for(int i = 0;i < nrows;i++)
            C[i] = C_buf[i];

        for(int i = 1;i < size;i++)
            MPI_Recv(&C[i*nrows], 1, MPI_SUB_VECTOR, i, 0, MPI_COMM_WORLD, &status);

        t_end = MPI_Wtime();
        printf("%dx%d/%d: %7.4f\n", M, N, size, t_end - t_start);
    }
    else
    {
        MPI_Send(C_buf, 1, MPI_SUB_VECTOR, MASTER, 0, MPI_COMM_WORLD);
    }
    MPI_Barrier(MPI_COMM_WORLD);

    MPI_Type_free(&MPI_MATRIX);
    MPI_Type_free(&MPI_VECTOR);
    MPI_Type_free(&MPI_SUB_VECTOR);

    if(rank == MASTER)
    {
        free(A);
        free(C);
    }

    free(B);
    free(A_buf);
    free(C_buf);

    MPI_Finalize();

    return EXIT_SUCCESS;
}

double* create_matrix(uint32_t nrow, uint32_t ncol)
{
    double *matrix = (double *)malloc(sizeof(double)*nrow*ncol);
    if(matrix == NULL)
    {
        return NULL;
    }

    srand((unsigned)time(NULL));

    for(uint32_t i = 0;i < nrow*ncol;i++)
    {
        matrix[i] = (double)1;
    }

    return matrix;
}


double* zero_matrix(uint32_t nrow, uint32_t ncol)
{
    double* matrix = (double *)malloc(sizeof(double)*nrow*ncol);
    if(matrix == NULL)
    {
        return NULL;
    }

    for(uint32_t i = 0;i < nrow*ncol;i++)
    {
        matrix[i] = (double)0;
    }

    return matrix;
}

double* create_vector(uint32_t n)
{
    return create_matrix(n, 1);
}

      

MPI_Scatter / Gather:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include <mpi.h>
#include <string.h>

#define MASTER 0
double* create_matrix(uint32_t nrow, uint32_t ncol);
double* zero_matrix(uint32_t nrow, uint32_t ncol);

int main( int argc, char** argv )
{
    double t_start, t_end, buf;
    double *A, *B, *C, *A_buf, *C_buf;
    uint32_t M; //number of rows
    uint32_t N; //number of columns
    uint32_t nrows;
    int size, rank; //MPI_Comm_size / MPI_Comm_rank expect int
    MPI_Datatype MPI_MATRIX, MPI_VECTOR, MPI_RESULT;

    M = (atoi(argv[1]) > 0)?atoi(argv[1]):1;
    N = (atoi(argv[2]) > 0)?atoi(argv[2]):1;

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    nrows = M/size;
    //create derived data type
    MPI_Type_contiguous(nrows*N, MPI_DOUBLE, &MPI_MATRIX);
    MPI_Type_commit(&MPI_MATRIX);
    MPI_Type_contiguous(N, MPI_DOUBLE, &MPI_VECTOR);
    MPI_Type_commit(&MPI_VECTOR);
    MPI_Type_contiguous(nrows, MPI_DOUBLE, &MPI_RESULT);
    MPI_Type_commit(&MPI_RESULT);

    if(rank == MASTER)
    {
        //A: M*N
        A = zero_matrix(M, N);
        C = create_matrix(M, 1);

        if(A == NULL || C == NULL)
        {
            printf( "Allocation of matrix failed.\n" );
            exit(EXIT_FAILURE);
        }
    }

    B = zero_matrix(N, 1);
    A_buf = create_matrix(nrows, N);
    C_buf = create_matrix(nrows, 1);

    if(B == NULL || A_buf == NULL || C_buf == NULL)
    {
        printf( "Allocation of matrix failed.\n" );
        exit(EXIT_FAILURE);
    }

    //exclude the time of establishing TCP connections
    MPI_Bcast(&buf, 1, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);

    if(rank == MASTER)
    {
        t_start = MPI_Wtime();
    }

    // scatter A
    MPI_Scatter(A, 1, MPI_MATRIX, A_buf, 1, MPI_MATRIX, 0, MPI_COMM_WORLD);

    // broadcast B
    MPI_Bcast(B, 1, MPI_VECTOR, 0, MPI_COMM_WORLD);

    // gather C
    MPI_Gather(C_buf, 1, MPI_RESULT, C, 1, MPI_RESULT, 0, MPI_COMM_WORLD);

    if(rank == MASTER)
    {
        t_end = MPI_Wtime();
        printf("%d %7.4f\n", size, t_end - t_start);

        free(A);
        free(C);
    }

    MPI_Type_free(&MPI_MATRIX);
    MPI_Type_free(&MPI_VECTOR);
    MPI_Type_free(&MPI_RESULT);

    free(B);
    free(A_buf);
    free(C_buf);

    MPI_Finalize();

    return EXIT_SUCCESS;
}

double* create_matrix(uint32_t nrow, uint32_t ncol)
{
    double *matrix = (double *)malloc(sizeof(double)*nrow*ncol);
    if(matrix == NULL)
    {
        return NULL;
    }

    srand((unsigned)time(NULL));

    for(uint32_t i = 0;i < nrow*ncol;i++)
    {
        matrix[i] = (double)rand();
    }

    return matrix;
}


double* zero_matrix(uint32_t nrow, uint32_t ncol)
{
    double* matrix = (double *)malloc(sizeof(double)*nrow*ncol);
    if(matrix == NULL)
    {
        return NULL;
    }

    for(uint32_t i = 0;i < nrow*ncol;i++)
    {
        matrix[i] = (double)1;
    }

    return matrix;
}

      

I used the following script to run both of them:

#!/bin/bash
dims="4096"
ntasks="1 2 4 8"
echo -n "" > log
for dim in $dims;
do
    echo "dim=$dim:"
    for n in $ntasks;
    do
        srun --ntasks=$n --ntasks-per-node=1 --cpu-freq=2900000 ./matrix $dim $dim | tee -a log
    done
done

      

Transmission time (dim=4096):

program        | ntasks=1 | ntasks=2 | ntasks=4 | ntasks=8 |
------------------------------------------------------------
send/recv      | 0.0684s  | 0.0638s  | 0.0654s  | 0.0638s  |
scatter/gather | 0.0367s  | 0.0492s  | 0.0765s  | 0.1283s  |

      

The scatter / gather transfer time grows quickly with the number of processes. Is there still a reason to use it instead of a send / recv loop? I know that scatter wraps the sends and gather wraps the receives, but what else do they do?

1 answer


To be clear: MPI_Scatter and MPI_Gather (most likely) use MPI_Send and MPI_Recv under the hood.
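
As a rough idea of what that can look like internally, here is a hypothetical, naive "linear" scatter written with point-to-point calls (the function name and signature are made up for illustration; real implementations typically use smarter algorithms such as trees or message segmentation):

#include <string.h>
#include <mpi.h>

int naive_scatter(const double *sendbuf, double *recvbuf,
                  int count, int root, MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    if(rank == root)
    {
        //keep the root's own chunk ...
        memcpy(recvbuf, &sendbuf[(size_t)root*count], count*sizeof(double));

        //... and send one chunk to every other rank
        for(int i = 0;i < size;i++)
            if(i != root)
                MPI_Send(&sendbuf[(size_t)i*count], count, MPI_DOUBLE, i, 0, comm);
    }
    else
    {
        MPI_Recv(recvbuf, count, MPI_DOUBLE, root, 0, comm, MPI_STATUS_IGNORE);
    }

    return MPI_SUCCESS;
}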

From your code examples, it seems like you don't really understand how MPI works:

  • You do not need to send or receive anything to "establish a connection". MPI establishes connections implicitly as needed.

  • In your scatter / gather example, you first distribute the data with MPI_Scatter, then pass some more data with MPI_Bcast, and then just collect the data again with MPI_Gather, without doing any computation in between (see the sketch after this list).

  • In your examples, you don't need explicit synchronization with MPI_Barrier.
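
As a rough illustration (hypothetical, reusing the variable names from your second program and assuming M is divisible by the number of processes), the usual structure puts the local computation between the distribution and the collection phases:

//distribute A, broadcast B, compute the local part of C = A*B, then gather
double t0 = 0.0;
if(rank == MASTER)
    t0 = MPI_Wtime();

MPI_Scatter(A, nrows*N, MPI_DOUBLE, A_buf, nrows*N, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);
MPI_Bcast(B, N, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);

//local matrix-vector product on each rank's block of rows
for(uint32_t i = 0;i < nrows;i++)
{
    C_buf[i] = 0.0;
    for(uint32_t j = 0;j < N;j++)
        C_buf[i] += A_buf[i*N + j] * B[j];
}

MPI_Gather(C_buf, nrows, MPI_DOUBLE, C, nrows, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);

if(rank == MASTER)
    printf("%ux%u/%d: %7.4f\n", M, N, size, MPI_Wtime() - t0);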



Once you structure your program properly, you will likely see a performance improvement. As for MPI itself: unfortunately, the MPI standard makes no performance guarantees; it leaves it to the actual implementation to do the best it can. Depending on the implementation you are using, MPI_Scatter / Gather may try to optimize for large messages and/or large numbers of processes, which naturally comes with some overhead.

You can try another MPI implementation (for open source, see MVAPICH, for example) to check whether the one you are using right now simply handles this case poorly. However, investigating that will only make sense once your code is correct.

Also, it's best not to use the MPI_ prefix for your own identifiers. It makes your code harder to read, and, if I'm not mistaken, the MPI standard reserves that prefix for the MPI library itself.
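
For example (the replacement names below are just an illustration), the derived types from your code could be declared like this instead:

//any non-reserved identifiers will do
MPI_Datatype row_block_type, vector_type, result_type;

MPI_Type_contiguous(nrows*N, MPI_DOUBLE, &row_block_type);
MPI_Type_commit(&row_block_type);
MPI_Type_contiguous(N, MPI_DOUBLE, &vector_type);
MPI_Type_commit(&vector_type);
MPI_Type_contiguous(nrows, MPI_DOUBLE, &result_type);
MPI_Type_commit(&result_type);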
