CUDA device function pointers in a structure, without static pointers or symbol copies

My planned program flow would look like this if it were possible:

typedef struct structure_t
{
  [...]
  /* device function pointer. */
  __device__ float (*function_pointer)(float, float, float[]);
  [...]
} structure;

[...]

/* function to be assigned. */
__device__ float
my_function (float a, float b, float c[])
{
  /* do some stuff on the device. */
  [...]
}

void
some_structure_initialization_function (structure *st)
{
  /* assign. */
  st->function_pointer = my_function;
  [...]
}

      

This is not possible and ends up with a familiar compile-time error regarding the placement of __device__ in a struct.

 error: attribute "device" does not apply here

      

Below are examples of similar types of problems on stackoverflow, but they all involve using static pointers outside of the framework. Examples are device function pointers as structure elements and device function pointers . Previously, I used a similar approach in other codes where it was easy for me to use static device pointers and define them outside of any structures. This is currently a problem. It is written as an API of sorts, and the user can define one or two or dozens of structures that should include a pointer to a device function. Thus, defining static device pointers outside of the framework is a major problem.

I'm sure the answer lies in the posts linked above, using symbol copies (`cudaMemcpyFromSymbol`), but I have not been able to apply that approach successfully.

+1


source to share


1 answer


What you are trying to do is possible, but you made several mistakes in the way you declare and define structures that will hold and use a function pointer.

This is not possible and ends up with a familiar compile-time error regarding the placement of __device__ in a struct.

 error: attribute "device" does not apply here

      

This only happens because you are trying to apply a memory-space attribute (`__device__`) to a data member of a structure, which is illegal in CUDA. Memory-space attributes belong on variables and functions, not on the types of struct or class members. A plain function-pointer member, declared through a typedef, is perfectly legal — so something only slightly different (and more concrete) than your code works:

/* Common signature for every function stored or passed as data:
   two scalar coefficients plus a float4 payload. */
typedef float (* fp)(float, float, float4);

/* Lightweight closure: binds two coefficients to a function pointer so the
   three-argument target can later be invoked with just the float4 argument.
   Note the pointer member carries no memory-space attribute; it implicitly
   refers to a device function when the functor is constructed in device
   code, and to a host function when constructed in host code. */
struct functor
{
    float c0, c1;  /* bound coefficients             */
    fp f;          /* function to apply in operator() */

    __device__ __host__
    functor(float a, float b, fp fn)
        : c0(a), c1(b), f(fn) {};

    __device__ __host__
    float operator()(float4 x)
    {
        return f(c0, c1, x);
    };
};

/* Grid-stride kernel: builds a functor binding (c0, c1) to the device
   function pointer f, then applies it to all N elements of x, writing the
   results to y. Correct for any grid/block configuration. */
__global__
void kernel(float c0, float c1, fp f, const float4 * x, float * y, int N)
{
    const int first  = threadIdx.x + blockIdx.x * blockDim.x;
    const int stride = blockDim.x * gridDim.x;

    struct functor op(c0, c1, f);
    for(int i = first; i < N; i += stride)
        y[i] = op(x[i]);
}

      

This compiles and works fine. The function pointer `fp` held by `functor` is implicitly a pointer to a `__device__` function when the `functor` is instantiated in device code; if it were instantiated in host code, the pointer would implicitly refer to a host function. In the kernel, a device function pointer passed in as a kernel argument is used to instantiate a `functor`. All of this is perfectly legal.



I believe I am correct in saying that there is no direct way to take the address of a `__device__` function from host code, so you still need some static declarations and a symbol copy (`cudaMemcpyFromSymbol`). It may be different in CUDA 5, but I haven't tested it to see. If we flesh out the device code above with a couple of `__device__` functions and some supporting host code:

/* f1: a + b*c.x + b*c.y + b*c.z + b*c.w, accumulated term by term in the
   same left-to-right order as the original expression (identical float
   rounding). Callable from both host and device. */
__device__ __host__ 
float f1 (float a, float b, float4 c)
{
    float acc = a;
    acc += b * c.x;
    acc += b * c.y;
    acc += b * c.z;
    acc += b * c.w;
    return acc;
}

/* f2: plain sum a + b + c.x + c.y + c.z + c.w, accumulated in the same
   left-to-right order as the original expression (identical float
   rounding). Callable from both host and device. */
__device__ __host__
float f2 (float a, float b, float4 c)
{
    float acc = a + b;
    acc += c.x;
    acc += c.y;
    acc += c.z;
    acc += c.w;
    return acc;
}

/* Static table of __device__ function pointers kept in constant memory.
   The host cannot take the address of a __device__ function directly, so it
   copies this table out with cudaMemcpyFromSymbol to obtain usable device
   addresses of f1 and f2 at runtime. */
__constant__ fp function_table[] = {f1, f2};

/*
 * Host driver: builds 20 float4 inputs, retrieves the __device__ function
 * pointers from the __constant__ function_table, launches the kernel once
 * per function, and prints the device results alongside the same functor
 * evaluated on the host for comparison.
 */
int main(void)
{
    const float c1 = 1.0f, c2 = 2.0f;
    const int n = 20;
    float4 vin[n];
    float vout1[n], vout2[n];

    /* Fill inputs with the running sequence 0,1,2,... across components. */
    for(int i=0, j=0; i<n; i++) {
        vin[i].x = j++; vin[i].y = j++;
        vin[i].z = j++; vin[i].w = j++;
    }

    float4 * _vin;
    float * _vout1, * _vout2;
    size_t sz4 = sizeof(float4) * size_t(n);
    size_t sz1 = sizeof(float) * size_t(n);
    if (cudaMalloc((void **)&_vin, sz4)   != cudaSuccess ||
        cudaMalloc((void **)&_vout1, sz1) != cudaSuccess ||
        cudaMalloc((void **)&_vout2, sz1) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed\n");
        return 1;
    }
    cudaMemcpy(_vin, &vin[0], sz4, cudaMemcpyHostToDevice);

    /* Copy the device function pointers to the host. Pass the symbol
       itself, not a string: the char* name overload of
       cudaMemcpyFromSymbol was deprecated in CUDA 4.1 and removed in
       CUDA 5.0, so the original string form no longer compiles. */
    fp funcs[2];
    if (cudaMemcpyFromSymbol(funcs, function_table,
                             2 * sizeof(fp)) != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyFromSymbol failed\n");
        return 1;
    }

    /* One launch per device function; check for launch-config errors. */
    kernel<<<1,32>>>(c1, c2, funcs[0], _vin, _vout1, n);
    if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "kernel launch failed\n");
        return 1;
    }
    cudaMemcpy(&vout1[0], _vout1, sz1, cudaMemcpyDeviceToHost); 

    kernel<<<1,32>>>(c1, c2, funcs[1], _vin, _vout2, n);
    if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "kernel launch failed\n");
        return 1;
    }
    cudaMemcpy(&vout2[0], _vout2, sz1, cudaMemcpyDeviceToHost); 

    /* Host-side functors built from the same functions, run on the CPU. */
    struct functor func1(c1, c2, f1), func2(c1, c2, f2); 
    for(int i=0; i<n; i++) {
        printf("%2d %6.f %6.f (%6.f,%6.f,%6.f,%6.f ) %6.f %6.f %6.f %6.f\n", 
                i, c1, c2, vin[i].x, vin[i].y, vin[i].z, vin[i].w,
                vout1[i], func1(vin[i]), vout2[i], func2(vin[i]));
    }

    /* Release device allocations (leaked in the original). */
    cudaFree(_vin);
    cudaFree(_vout1);
    cudaFree(_vout2);

    return 0;
}

      

you get a fully compilable and runnable example. Here, two `__device__` functions and a static `__constant__` function table give the host code a mechanism to retrieve pointers to the `__device__` functions at runtime. The kernel is called once with each `__device__` function, and the results are printed alongside the output of the same functor and functions instantiated and called from host code (and therefore run on the host) for comparison:

$ nvcc -arch=sm_30 -Xptxas="-v" -o function_pointer function_pointer.cu 

ptxas info    : Compiling entry function '_Z6kernelffPFfff6float4EPKS_Pfi' for 'sm_30'
ptxas info    : Function properties for _Z6kernelffPFfff6float4EPKS_Pfi
    16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Function properties for _Z2f1ff6float4
    24 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Function properties for _Z2f2ff6float4
    24 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 16 registers, 356 bytes cmem[0], 16 bytes cmem[3]

$ ./function_pointer 
 0      1      2 (     0,     1,     2,     3 )     13     13      9      9
 1      1      2 (     4,     5,     6,     7 )     45     45     25     25
 2      1      2 (     8,     9,    10,    11 )     77     77     41     41
 3      1      2 (    12,    13,    14,    15 )    109    109     57     57
 4      1      2 (    16,    17,    18,    19 )    141    141     73     73
 5      1      2 (    20,    21,    22,    23 )    173    173     89     89
 6      1      2 (    24,    25,    26,    27 )    205    205    105    105
 7      1      2 (    28,    29,    30,    31 )    237    237    121    121
 8      1      2 (    32,    33,    34,    35 )    269    269    137    137
 9      1      2 (    36,    37,    38,    39 )    301    301    153    153
10      1      2 (    40,    41,    42,    43 )    333    333    169    169
11      1      2 (    44,    45,    46,    47 )    365    365    185    185
12      1      2 (    48,    49,    50,    51 )    397    397    201    201
13      1      2 (    52,    53,    54,    55 )    429    429    217    217
14      1      2 (    56,    57,    58,    59 )    461    461    233    233
15      1      2 (    60,    61,    62,    63 )    493    493    249    249
16      1      2 (    64,    65,    66,    67 )    525    525    265    265
17      1      2 (    68,    69,    70,    71 )    557    557    281    281
18      1      2 (    72,    73,    74,    75 )    589    589    297    297
19      1      2 (    76,    77,    78,    79 )    621    621    313    313

      

If I understand your question correctly, the above example should give you pretty much all the design patterns you need to implement your ideas in device code.

+1


source







All Articles