OpenMP SIMD reduction with custom operator

I have the following loop that I would like to speed up with #pragma omp simd:

#define N 1024
double* data = new double[N];
// Generate data, not important how.

double mean = 0.0;
for (size_t i = 0; i < N; i++) {
    mean += (data[i] - mean) / (i+1);
}


As I expected, just putting #pragma omp simd before the loop has no effect (I am measuring running time). I can easily handle the multi-threaded case using #pragma omp parallel for reduction(...) with a custom reducer like the one below, but how can I use OpenMP SIMD here?

I am using the following class to implement the + and += operators, both to add a double to the running mean and to combine two running means:

class RunningMean {
    private:
        double mean;
        size_t count;

    public:
        RunningMean(): mean(0), count(0) {}
        RunningMean(double m, size_t c): mean(m), count(c) {}

        RunningMean operator+(const RunningMean& rhs) const {
            size_t c = this->count + rhs.count;
            double m = (this->mean*this->count + rhs.mean*rhs.count) / c;
            return RunningMean(m, c);
        }

        RunningMean operator+(double rhs) const {
            size_t c = this->count + 1;
            double m = this->mean + (rhs - this->mean) / c;
            return RunningMean(m, c);
        }

        RunningMean& operator+=(const RunningMean& rhs) {
            this->mean = this->mean*this->count + rhs.mean*rhs.count;
            this->count += rhs.count;
            this->mean /= this->count;
            return *this;
        }

        RunningMean& operator+=(double rhs) {
            this->count++;
            this->mean += (rhs - this->mean) / this->count;
            return *this;
        }

        double getMean() const { return mean; }
        size_t getCount() const { return count; }
};
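
Used sequentially, the class reproduces the original loop (a quick sanity check):

RunningMean rm;
for (size_t i = 0; i < N; i++)
    rm += data[i];           // operator+=(double): the same incremental update as the plain loop
double mean = rm.getMean();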


The math for this comes from http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf. For multithreaded parallel reduction without SIMD, I do the following:

#pragma omp declare reduction (runningmean : RunningMean : omp_out += omp_in)
RunningMean mean;
#pragma omp parallel for reduction(runningmean:mean)
for (size_t i = 0; i < N; i++)
    mean += data[i];


This gives me a 3.2x speedup on my Core i7 2600k using 8 threads.

If I were to implement SIMD myself without OpenMP, I would maintain 4 means in one vector and 4 counts in another (assuming AVX instructions are used), and keep adding 4-element double-precision vectors using a vectorized version of operator+=(double rhs). Once done, I would combine the 4 pairs of means and counts using the math from operator+=(const RunningMean&). How can I instruct OpenMP to do this?



2 answers


The problem is that

mean += (data[i] - mean) / (i+1);

is not easy to SIMD. However, with a careful look at the math it is possible to vectorize it without much effort.

The key formula is

mean(n+m) = (n*mean(n) + m*mean(m))/(n+m)


which shows how to combine the mean of n numbers with the mean of m numbers. For example, the mean of {1, 2} is 1.5 (n = 2) and the mean of {3, 4, 5} is 4 (m = 3); the formula gives (2*1.5 + 3*4)/5 = 3, the mean of all five numbers. This is exactly what your operator definition RunningMean operator+(const RunningMean& rhs) does, and it explains why your parallel code works. I think it is clearer if we deconstruct your C++ code:

double mean = 0.0;
int count = 0;
#pragma omp parallel
{
    // each thread keeps a private running mean over its share of the data
    double mean_private = 0.0;
    int count_private = 0;
    #pragma omp for nowait
    for(size_t i=0; i<N; i++) {
        count_private++;
        mean_private += (data[i] - mean_private)/count_private;
    }
    // merge: mean(n+m) = (n*mean(n) + m*mean(m))/(n+m)
    #pragma omp critical
    {
        mean = (count_private*mean_private + count*mean);
        count += count_private;
        mean /= count;
    }
}


But we can use the same idea with SIMD (and combine the two afterwards). First let's do the SIMD-only part. Using AVX, we can run four partial means in parallel. Each partial mean processes the following data elements:



mean 1 data elements: 0,  4,  8, 12,...
mean 2 data elements: 1,  5,  9, 13,...
mean 3 data elements: 2,  6, 10, 14,...
mean 4 data elements: 3,  7, 11, 15,...


We loop through all the elements, then add the four partial means together and divide by four (since each partial mean runs over N/4 elements).

Here is the code for this:

double mean = 0.0;
__m256d mean4 = _mm256_set1_pd(0.0);
__m256d count4 = _mm256_set1_pd(0.0);
for(size_t i=0; i<N/4; i++) {
    count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
    __m256d t1 = _mm256_loadu_pd(&data[4*i]);
    __m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
    mean4 = _mm256_add_pd(t2, mean4);   // one running mean per lane
}
// horizontal sum of the four lanes, then divide by four
__m256d t1 = _mm256_hadd_pd(mean4,mean4);
__m128d t2 = _mm256_extractf128_pd(t1,1);
__m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
mean = _mm_cvtsd_f64(t3)/4;
// handle the remaining N%4 elements with a scalar running mean
int count = 0;
double mean2 = 0;
for(size_t i=4*(N/4); i<N; i++) {
    count++;
    mean2 += (data[i] - mean2)/count;
}
// combine the SIMD mean (over 4*(N/4) elements) with the remainder mean
mean = (4*(N/4)*mean + count*mean2)/N;
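
A minimal sketch of the same four-lane idea without intrinsics, assuming the compiler maps the fixed-width inner loops onto AVX registers (mean_simd_sketch is just an illustrative name):

double mean_simd_sketch(const double *data, int N) {
    double mean4[4] = {0.0, 0.0, 0.0, 0.0};
    double count = 0.0;
    for (int i = 0; i < N/4; i++) {
        count += 1.0;
        #pragma omp simd
        for (int k = 0; k < 4; k++)                        // one running mean per lane
            mean4[k] += (data[4*i + k] - mean4[k]) / count;
    }
    double mean = (mean4[0] + mean4[1] + mean4[2] + mean4[3]) / 4;
    int count2 = 0;
    double mean2 = 0.0;
    for (int i = 4*(N/4); i < N; i++) {                    // scalar remainder
        count2++;
        mean2 += (data[i] - mean2) / count2;
    }
    return (4*(N/4)*mean + count2*mean2) / N;
}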


Finally, we can combine this with OpenMP to take full advantage of SIMD and MIMD like this:

double mean = 0.0;
int count = 0;
#pragma omp parallel 
{
    double mean_private = 0.0;
    int count_private = 0;
    __m256d mean4 = _mm256_set1_pd(0.0);
    __m256d count4 = _mm256_set1_pd(0.0);
    #pragma omp for nowait
    for(size_t i=0; i<N/4; i++) {
        count_private++;
        count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
        __m256d t1 = _mm256_loadu_pd(&data[4*i]);
        __m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
        mean4 = _mm256_add_pd(t2, mean4);
    }
    __m256d t1 = _mm256_hadd_pd(mean4,mean4);
    __m128d t2 = _mm256_extractf128_pd(t1,1);
    __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
    mean_private = _mm_cvtsd_f64(t3)/4;

    #pragma omp critical
    {
        mean = (count_private*mean_private + count*mean);
        count += count_private;
        mean /= count;
    }   
}
int count2 = 0;
double mean2 = 0;
for(size_t i=4*(N/4); i<N; i++) {
    count2++;
    mean2 += (data[i] - mean2)/count2;
}
mean = (4*(N/4)*mean + count2*mean2)/N;


And here is a working example (compile with -O3 -mavx -fopenmp):

#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>

double mean_simd(double *data, const int N) {
    double mean = 0.0;
    __m256d mean4 = _mm256_set1_pd(0.0);
    __m256d count4 = _mm256_set1_pd(0.0);
    for(size_t i=0; i<N/4; i++) {
        count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
        __m256d t1 = _mm256_loadu_pd(&data[4*i]);
        __m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
        mean4 = _mm256_add_pd(t2, mean4);           
    }
    __m256d t1 = _mm256_hadd_pd(mean4,mean4);
    __m128d t2 = _mm256_extractf128_pd(t1,1);
    __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
    mean = _mm_cvtsd_f64(t3)/4;
    int count = 0;
    double mean2 = 0;
    for(size_t i=4*(N/4); i<N; i++) {
        count++;
        mean2 += (data[i] - mean2)/count;
    }
    mean = (4*(N/4)*mean + count*mean2)/N;
    return mean;
}

double mean_simd_omp(double *data, const int N) {
    double mean = 0.0;
    int count = 0;
    #pragma omp parallel 
    {
        double mean_private = 0.0;
        int count_private = 0;
        __m256d mean4 = _mm256_set1_pd(0.0);
        __m256d count4 = _mm256_set1_pd(0.0);
        #pragma omp for nowait
        for(size_t i=0; i<N/4; i++) {
            count_private++;
            count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
            __m256d t1 = _mm256_loadu_pd(&data[4*i]);
            __m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
            mean4 = _mm256_add_pd(t2, mean4);
        }
        __m256d t1 = _mm256_hadd_pd(mean4,mean4);
        __m128d t2 = _mm256_extractf128_pd(t1,1);
        __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
        mean_private = _mm_cvtsd_f64(t3)/4;

        #pragma omp critical
        {
            mean = (count_private*mean_private + count*mean);
            count += count_private;
            mean /= count;
        }   
    }
    int count2 = 0;
    double mean2 = 0;
    for(size_t i=4*(N/4); i<N; i++) {
        count2++;
        mean2 += (data[i] - mean2)/count2;
    }
    mean = (4*(N/4)*mean + count2*mean2)/N;
    return mean;
}


int main() {
    const int N = 1001;
    double data[N];

    for(int i=0; i<N; i++) data[i] = 1.0*rand()/RAND_MAX;
    double sum = 0; for(int i=0; i<N; i++) sum += data[i]; sum /= N;
    printf("mean %f\n", sum);
    printf("mean_simd %f\n", mean_simd(data, N);
    printf("mean_simd_omp %f\n", mean_simd_omp(data, N));
}




KISS answer: just compute the mean outside the loop. Parallelize the following code:

double sum = 0.0;
for(size_t i = 0; i < N; i++) sum += data[i];
double mean = sum/N;
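
If you do want threads and SIMD on top of that, the built-in + reduction is directly supported; a minimal sketch, assuming a compiler with OpenMP 4.0 for the combined parallel for simd construct:

double sum = 0.0;
#pragma omp parallel for simd reduction(+:sum)
for (size_t i = 0; i < N; i++)
    sum += data[i];          // built-in + reduction both vectorizes and parallelizes
double mean = sum/N;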




The sum is easy to parallelize, but you will not see any effect from SIMD optimization: it is purely memory-bound, so the processor will mostly be waiting for data from memory. And if N is only 1024, there is little point even in parallelization: the synchronization overhead will eat up all the benefits.
