How to solve CUDA Thrust library - for_each sync error?

I am trying to modify a simple dynamic vector in CUDA using the Thrust library. But I get a "launch_closure_by_value" error on the screen, which indicates that the error is related to some synchronization process.

Simple modification of a 1D dynamic array is not possible due to this error.

My code segment causing the error looks like this.

From a .cpp file I call setIndexedGridInfo, which is defined in System.cu:

float* a= (float*)(malloc(8*sizeof(float))); 
a[0]= 0; a[1]= 1; a[2]= 2; a[3]= 3; a[4]= 4; a[5]= 5; a[6]= 6; a[7]= 7;
float* b = (float*)(malloc(8*sizeof(float)));
setIndexedGridInfo(a,b);

      

Code segment in System.cu:

void
setIndexedGridInfo(float* a, float*b)
{

    thrust::device_ptr<float> d_oldData(a);
    thrust::device_ptr<float> d_newData(b);

    float c = 0.0;

    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData,d_newData)),
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData+8,d_newData+8)),
        grid_functor(c));
}

      

grid_functor is defined in _kernel.cu:

struct grid_functor
{
    float a;

    __host__ __device__
    grid_functor(float grid_Info) : a(grid_Info) {}

    template <typename Tuple>
    __device__
    void operator()(Tuple t)
    {
        volatile float data = thrust::get<0>(t);
        float pos = data + 0.1;
        thrust::get<1>(t) = pos;
    }

};

      

I also get these in the output window (I am using Visual Studio):

First-chance exception at 0x000007fefdc7cacd in Particles.exe: Microsoft C++ exception: cudaError_enum at memory location 0x0029eb60..
First-chance exception at 0x000007fefdc7cacd in smokeParticles.exe: Microsoft C++ exception: thrust::system::system_error at memory location 0x0029ecf0..
Unhandled exception at 0x000007fefdc7cacd in Particles.exe: Microsoft C++ exception: thrust::system::system_error at memory location 0x0029ecf0..

What is causing the problem?

1 answer


You are trying to use host memory pointers in functions that expect pointers in device memory. This code is the problem:

float* a= (float*)(malloc(8*sizeof(float))); 
a[0]= 0; a[1]= 1; a[2]= 2; a[3]= 3; a[4]= 4; a[5]= 5; a[6]= 6; a[7]= 7;
float* b = (float*)(malloc(8*sizeof(float)));
setIndexedGridInfo(a,b);

.....

thrust::device_ptr<float> d_oldData(a);
thrust::device_ptr<float> d_newData(b);

      

thrust::device_ptr is intended to "wrap" a device memory pointer allocated with the CUDA API so that Thrust can use it. You are trying to treat a host pointer directly as a device pointer, which is illegal. You could change your setIndexedGridInfo function like this:

void setIndexedGridInfo(float* a, float*b, const int n)
{

    thrust::device_vector<float> d_oldData(a,a+n);
    thrust::device_vector<float> d_newData(b,b+n);

    float c = 0.0;

    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData.begin(),d_newData.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData.end(),d_newData.end())),
        grid_functor(c));
}

      

The device_vector constructors will allocate device memory and then copy the contents of your host memory to the device. This should fix the error you are seeing, although I am not sure what you are trying to do with the for_each iterator, or whether your functor is correct.
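
If you would rather keep the original thrust::device_ptr approach, the alternative is to allocate the buffers in device memory yourself with cudaMalloc, copy the host data across with cudaMemcpy, and only then wrap the raw device pointers. The following is a minimal sketch of that variant (the function name setIndexedGridInfoDevicePtr is just illustrative, and it reuses the grid_functor shown above):

// Hypothetical variant of setIndexedGridInfo that keeps thrust::device_ptr.
// Requires <thrust/device_ptr.h> and <cuda_runtime.h> in addition to the
// includes used elsewhere in this answer.
void setIndexedGridInfoDevicePtr(const float* h_a, float* h_b, const int n)
{
    // Allocate device memory and copy the host input across
    float *d_a = 0, *d_b = 0;
    cudaMalloc((void**)&d_a, n * sizeof(float));
    cudaMalloc((void**)&d_b, n * sizeof(float));
    cudaMemcpy(d_a, h_a, n * sizeof(float), cudaMemcpyHostToDevice);

    // These raw pointers really do point to device memory, so wrapping
    // them in thrust::device_ptr is now legal
    thrust::device_ptr<float> d_oldData(d_a);
    thrust::device_ptr<float> d_newData(d_b);

    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData, d_newData)),
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData + n, d_newData + n)),
        grid_functor(0.0f));

    // Copy the result back to the host and release the device allocations
    cudaMemcpy(h_b, d_b, n * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_a);
    cudaFree(d_b);
}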




Edit:

Here is the complete, compiled, executable version of your code:

#include <cstdlib>
#include <cstdio>
#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/copy.h>

struct grid_functor
{
    float a;

    __host__ __device__
    grid_functor(float grid_Info) : a(grid_Info) {}

    template <typename Tuple>
    __device__
    void operator()(Tuple t)
    {
        volatile float data = thrust::get<0>(t);
        float pos = data + 0.1f;
        thrust::get<1>(t) = pos;
    }

};

void setIndexedGridInfo(float* a, float*b, const int n)
{
    // Copy the host input arrays into device memory
    thrust::device_vector<float> d_oldData(a,a+n);
    thrust::device_vector<float> d_newData(b,b+n);

    float c = 0.0;

    // Run the functor over each (old, new) element pair on the device
    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData.begin(),d_newData.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData.end(),d_newData.end())),
        grid_functor(c));

    // Copy the result back to the caller's host array
    thrust::copy(d_newData.begin(), d_newData.end(), b);
}

int main(void)
{
    const int n = 8;
    float* a= (float*)(malloc(n*sizeof(float))); 
    a[0]= 0; a[1]= 1; a[2]= 2; a[3]= 3; a[4]= 4; a[5]= 5; a[6]= 6; a[7]= 7;
    float* b = (float*)(malloc(n*sizeof(float)));
    setIndexedGridInfo(a,b,n);

    for(int i=0; i<n; i++) {
        fprintf(stdout, "%d (%f,%f)\n", i, a[i], b[i]);
    }

    return 0;
}

      

I can compile and run this code on an OS X 10.6.8 host with CUDA 4.1 like this:

$ nvcc -Xptxas="-v" -arch=sm_12 -g -G thrustforeach.cu 
./thrustforeach.cu(18): Warning: Cannot tell what pointer points to, assuming global memory space
./thrustforeach.cu(20): Warning: Cannot tell what pointer points to, assuming global memory space
./thrustforeach.cu(18): Warning: Cannot tell what pointer points to, assuming global memory space
./thrustforeach.cu(20): Warning: Cannot tell what pointer points to, assuming global memory space
ptxas info    : Compiling entry function '_ZN6thrust6detail7backend4cuda6detail23launch_closure_by_valueINS2_18for_each_n_closureINS_12zip_iteratorINS_5tupleINS0_15normal_iteratorINS_10device_ptrIfEEEESB_NS_9null_typeESC_SC_SC_SC_SC_SC_SC_EEEEi12grid_functorEEEEvT_' for 'sm_12'
ptxas info    : Used 14 registers, 160+0 bytes lmem, 16+16 bytes smem, 4 bytes cmem[1]
ptxas info    : Compiling entry function '_ZN6thrust6detail7backend4cuda6detail23launch_closure_by_valueINS2_18for_each_n_closureINS_12zip_iteratorINS_5tupleINS0_15normal_iteratorINS_10device_ptrIfEEEESB_NS_9null_typeESC_SC_SC_SC_SC_SC_SC_EEEEj12grid_functorEEEEvT_' for 'sm_12'
ptxas info    : Used 14 registers, 160+0 bytes lmem, 16+16 bytes smem, 4 bytes cmem[1]

$ ./a.out
0 (0.000000,0.100000)
1 (1.000000,1.100000)
2 (2.000000,2.100000)
3 (3.000000,3.100000)
4 (4.000000,4.100000)
5 (5.000000,5.100000)
6 (6.000000,6.100000)
7 (7.000000,7.100000)
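
As an aside, since the functor only reads one element and writes one element, the same effect can be expressed more simply with thrust::transform rather than for_each over a zip iterator. Here is a rough sketch of that alternative, assuming the intent really is just to add 0.1 to every input element (the names add_point_one and setIndexedGridInfoTransform are purely illustrative):

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/copy.h>

// Illustrative unary functor: produce input + 0.1 for each element
struct add_point_one
{
    __host__ __device__
    float operator()(float x) const { return x + 0.1f; }
};

void setIndexedGridInfoTransform(const float* a, float* b, const int n)
{
    // Copy the input to the device; the output vector only needs space
    thrust::device_vector<float> d_oldData(a, a + n);
    thrust::device_vector<float> d_newData(n);

    // One input range, one output range: no zip iterator required
    thrust::transform(d_oldData.begin(), d_oldData.end(),
                      d_newData.begin(), add_point_one());

    // Copy the result back to the host output array
    thrust::copy(d_newData.begin(), d_newData.end(), b);
}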

      
