CUDA shared memory - inconsistent results

I am trying to do a parallel reduction to sum an array in CUDA. I am currently passing in an array that stores the sum of the elements in each block. This is my code:

#include <cstdlib>
#include <cstdio>   // fprintf/stderr used in gpuAssert
#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <helper_cuda.h>
#include <host_config.h>
#define THREADS_PER_BLOCK 256
#define CUDA_ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

using namespace std;

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

struct double3c {
    double x; 
    double y;
    double z;
    __host__ __device__ double3c() : x(0), y(0), z(0) {}
    __host__ __device__ double3c(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
    __host__ __device__ double3c& operator+=(const double3c& rhs) { x += rhs.x; y += rhs.y; z += rhs.z; return *this; }
    __host__ __device__ double3c& operator/=(const double& rhs) { x /= rhs; y /= rhs; z /= rhs; return *this; }

};

class VectorField {
public:
    double3c *data;
    int size_x, size_y, size_z;
    bool is_copy;  

    __host__ VectorField () {}

    __host__ VectorField (int x, int y, int z) {
        size_x = x; size_y = y; size_z = z;
        is_copy = false;
        CUDA_ERROR_CHECK (cudaMalloc(&data, x * y * z * sizeof(double3c))); 
    }

    __host__ VectorField (const VectorField& other) {
        size_x = other.size_x; size_y = other.size_y; size_z = other.size_z;
        this->data = other.data;
        is_copy = true;
    }

    __host__ ~VectorField() {     
        if (!is_copy) CUDA_ERROR_CHECK (cudaFree(data));
    }
};

__global__ void KernelCalculateMeanFieldBlock (VectorField m, double3c* result) {
    __shared__ double3c blockmean[THREADS_PER_BLOCK];    
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < m.size_x * m.size_y * m.size_z) blockmean[threadIdx.x] = m.data[index] = double3c(0, 1, 0);
    else blockmean[threadIdx.x] = double3c(0,0,0);
    __syncthreads();
    for(int s = THREADS_PER_BLOCK / 2; s > 0; s /= 2) {
        if (threadIdx.x < s) blockmean[threadIdx.x] += blockmean[threadIdx.x + s];
        __syncthreads();
    }


    if(threadIdx.x == 0) result[blockIdx.x] = blockmean[0];   
}

double3c CalculateMeanField (VectorField& m) { 
    int blocknum = (m.size_x * m.size_y * m.size_z - 1) / THREADS_PER_BLOCK + 1;
    double3c *mean = new double3c[blocknum]();
    double3c *cu_mean;
    CUDA_ERROR_CHECK (cudaMalloc(&cu_mean, sizeof(double3c) * blocknum));
    CUDA_ERROR_CHECK (cudaMemset (cu_mean, 0, sizeof(double3c) * blocknum));

        KernelCalculateMeanFieldBlock <<<blocknum, THREADS_PER_BLOCK>>> (m, cu_mean);
        CUDA_ERROR_CHECK (cudaPeekAtLastError());
        CUDA_ERROR_CHECK (cudaDeviceSynchronize());
        CUDA_ERROR_CHECK (cudaMemcpy(mean, cu_mean, sizeof(double3c) * blocknum, cudaMemcpyDeviceToHost));

    CUDA_ERROR_CHECK (cudaFree(cu_mean));
    for (int i = 1; i < blocknum; i++) {mean[0] += mean[i];}
    mean[0] /= m.size_x * m.size_y * m.size_z;
    double3c aux = mean[0];
    delete[] mean;
    return aux;
}



int main() {
    VectorField m(100,100,100);
    double3c sum = CalculateMeanField (m);
    cout <<  sum.x << '\t' << sum.y << '\t' <<sum.z;  


    return 0;
}

      

EDIT

Working code is now posted. Filling a VectorField with 10x10x10 elements works fine and gives an average of 1, but with 100x100x100 elements the average comes out around 0.97 (it varies from run to run). Is this the correct way to do a parallel reduction, or should I stick with one kernel launch per block?



1 answer


When I compile the code you have posted on Linux, I get the following warning:

t614.cu(55): warning: __shared__ memory variable with non-empty constructor or destructor (potential race between threads)

      

This type of warning should not be ignored. It refers to this line of code:

__shared__ double3c blockmean[THREADS_PER_BLOCK]; 

      

Since the constructor-driven initialization of these objects stored in shared memory will happen in some arbitrary order, and you have no barrier between that initialization and the subsequent code that also sets these values, unpredictable things (*) can happen.
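For illustration, here is a minimal sketch (not taken from the question) of the pattern nvcc is warning about: any __shared__ array whose element type has a non-trivial default constructor or destructor triggers it, because every thread runs those constructors on the whole array.

struct Init0 {
    double v;
    __host__ __device__ Init0() : v(0) {}   // non-empty default constructor
};

__global__ void demo() {
    // nvcc warns here: "__shared__ memory variable with non-empty constructor
    // or destructor (potential race between threads)"
    __shared__ Init0 buf[256];
    // this write can race with the constructor calls made by other threads
    buf[threadIdx.x].v = threadIdx.x;
}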

If I insert a __syncthreads() into the code to separate the constructor activity from the code that follows, I get the expected results:

__shared__ double3c blockmean[THREADS_PER_BLOCK];    
int index = threadIdx.x + blockIdx.x * blockDim.x;
__syncthreads();  // add this line
if (index < m.size_x * m.size_y * m.size_z) blockmean[threadIdx.x] = m.data[index] = double3c(0, 1, 0);
else blockmean[threadIdx.x] = double3c(0,0,0);
__syncthreads();

      

However, this still leaves us with the warning. A modification that fixes both the problem and the warning is to dynamically allocate the needed amount of __shared__ memory. Change your shared memory declaration to the following:



extern __shared__ double3c blockmean[];

      

and change your kernel call:

KernelCalculateMeanFieldBlock <<<blocknum, THREADS_PER_BLOCK, THREADS_PER_BLOCK*sizeof(double3c)>>> (m, cu_mean);

      

This will remove the warning, produce the correct result, and avoid the unnecessary constructor traffic on the shared memory variable. (The additional __syncthreads() described above is then no longer needed.)
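Putting both changes together, a sketch of how the modified kernel and its launch could look, assuming the rest of the code from the question (double3c, VectorField, THREADS_PER_BLOCK) stays unchanged:

__global__ void KernelCalculateMeanFieldBlock (VectorField m, double3c* result) {
    // dynamically allocated shared memory; the size is supplied at launch time
    extern __shared__ double3c blockmean[];
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < m.size_x * m.size_y * m.size_z) blockmean[threadIdx.x] = m.data[index] = double3c(0, 1, 0);
    else blockmean[threadIdx.x] = double3c(0,0,0);
    __syncthreads();
    // tree reduction in shared memory
    for(int s = THREADS_PER_BLOCK / 2; s > 0; s /= 2) {
        if (threadIdx.x < s) blockmean[threadIdx.x] += blockmean[threadIdx.x + s];
        __syncthreads();
    }
    if(threadIdx.x == 0) result[blockIdx.x] = blockmean[0];
}

// the third launch configuration parameter is the dynamic shared memory size in bytes
KernelCalculateMeanFieldBlock <<<blocknum, THREADS_PER_BLOCK, THREADS_PER_BLOCK*sizeof(double3c)>>> (m, cu_mean);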

* Regarding "unpredictable things": if you look under the hood by inspecting either the generated SASS (cuobjdump -sass ...) or the PTX (**) (nvcc -ptx ...), you can see that each thread initializes the entire __shared__ array of objects to zero (the default constructor behavior). As a result, some of the threads (i.e. warps) can run ahead and start populating the shared memory area according to this line:

if (index < m.size_x * m.size_y * m.size_z) blockmean[threadIdx.x] = m.data[index] = double3c(0, 1, 0);

      

Then, when the other warps get started, those threads will zero out the entire shared memory array again. This race leads to the unpredictable results.

** I generally don't recommend judging code behavior by inspecting the PTX, but in this case it is instructive: the later stages of compilation do not optimize the constructor activity away.
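For reference, the inspection commands mentioned above could be run like this (the file names are placeholders; t614.cu is the name that appears in the warning, and the -arch flag is only an example):

nvcc -arch=sm_35 -o t614 t614.cu    # build the executable
cuobjdump -sass t614                # dump the generated SASS from the executable
nvcc -ptx t614.cu                   # write the PTX to t614.ptx for inspection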
