Passing an inline function as a template parameter

Question

Passing an inline function as a template parameter

I am trying to pass an atomicAdd function to another function as a template parameter.

Here is my Kernel1:

// Kernel that applies a caller-supplied callable to *address.
//   T       - element type pointed to by address.
//   TAtomic - type of the callable; it is invoked as atomicFunc(address, 1).
// Every thread that runs the kernel performs the call once; there is no
// thread-index guard.
template<typename T, typename TAtomic>
__global__ void myfunc1(T *address, TAtomic atomicFunc) {
    // Note: the increment is the int literal 1, not a value of type T.
    atomicFunc(address, 1);
}

Try 1:

myfunc1<<<1,1>>>(val.dev_ptr, atomicAdd);

This does not work because the compiler cannot match the expected function signature.

Try 2: First, I wrap atomicAdd in a custom function, MyAtomicAdd.

// Device-side wrapper that forwards to atomicAdd(address, val).
// The value returned by atomicAdd is discarded (the wrapper returns void).
template<typename T>
__device__ void MyAtomicAdd(T *address, T val) {
    atomicAdd(address, val);
}

Then I define a function pointer type called TAtomic and declare a non-type template parameter of that type.

// Function-pointer type fixed to the signature void(float *, float).
typedef void (*TAtomic)(float *,float);

// Kernel that receives the function to call as a non-type template argument
// (atomicFunc) instead of as a run-time kernel argument.
// Because TAtomic takes a float *, address must be usable as a float *.
template<typename T, TAtomic atomicFunc>
__global__ void myfunc2(T *address) {
    // The int literal 1 is converted to the float parameter of atomicFunc.
    atomicFunc(address, 1);
}

myfunc2<float, MyAtomicAdd><<<1,1>>>(dev_ptr);
CUDA_CHECK(cudaDeviceSynchronize());

Actually, Try 2 works. But I don't want to use a typedef. I need something more general.

Try 3: Just pass MyAtomicAdd to myfunc1.

myfunc1<<<1,1>>>(dev_ptr, MyAtomicAdd<float>);
CUDA_CHECK(cudaDeviceSynchronize());

The code compiles, but when I run the program the following error is reported:

"ERROR in /home/liang/groute-dev/samples/framework/pagerank.cu:70: invalid program counter (76)"

I'm just wondering why Try 3 isn't working. Is there a simple or elegant way to implement this requirement? Thanks.

0

c++ templates cuda

geng liang Feb 19 18 at 19:24

source to share

1 answer

Robert Crovella · Accepted Answer · 2018-02-19T19:58:17+0000

Try 3 doesn't work because you are trying to take the address of a `__device__` function in host code, which is illegal in CUDA:

myfunc1<<<1,1>>>(dev_ptr, MyAtomicAdd<float>);
                          ^
                          effectively a function pointer - address of a __device__ function

Such usage attempts in CUDA do resolve to some sort of "address", but it is garbage, so when you try to use it as an actual function entry point in device code, you get the error you encountered: `invalid program counter` (or in some cases just `illegal address`).

You can make your Try 3 method work (without the `typedef`) by wrapping the intrinsic in a functor instead of a bare `__device__` function:

$ cat t48.cu
#include <stdio.h>

// Plain __device__ wrapper around atomicAdd; the result of atomicAdd is
// discarded. In this listing it is referenced only by the commented-out
// launch in main().
template<typename T>
__device__ void MyAtomicAdd(T *address, T val) {
    atomicAdd(address, val);
}


// Stateless function object wrapping atomicAdd for element type T.
// An instance is passed to a kernel by value, so no device function address
// is ever taken in host code.
template <typename T>
struct myatomicadd
{
  // Atomically adds val to *addr and returns atomicAdd's result (per the CUDA
  // documentation, the value stored at *addr before the addition).
  // Declared const because the functor has no state, so it can also be
  // invoked through const objects and const references.
  __device__ T operator()(T *addr, T val) const {
    return atomicAdd(addr, val);
  }
};

// Kernel: invokes atomicFunc(address, 1) with the increment converted to T.
//   T       - element type pointed to by address.
//   TAtomic - callable type, passed by value as a kernel argument.
// Every thread that runs the kernel performs the call once; there is no
// thread-index guard.
template<typename T, typename TAtomic>
__global__ void myfunc1(T *address, TAtomic atomicFunc) {
    // Convert explicitly so the second argument has exactly type T.
    const T one = static_cast<T>(1);
    atomicFunc(address, one);
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what){
  if (err == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  return false;
}

// Allocates one zero-initialised int on the device, launches myfunc1 with the
// myatomicadd<int> functor, copies the result back and prints it.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main(){

  int *dev_ptr = NULL;
  int h = 0;

  bool ok = cudaOk(cudaMalloc(&dev_ptr, sizeof(int)), "cudaMalloc");
  ok = ok && cudaOk(cudaMemset(dev_ptr, 0, sizeof(int)), "cudaMemset");
  if (ok) {
    // Passing the bare __device__ function (next line) is the failing variant:
//  myfunc1<<<1,1>>>(dev_ptr, MyAtomicAdd<int>);
    myfunc1<<<1,1>>>(dev_ptr, myatomicadd<int>());
    // A kernel launch returns nothing; launch errors must be queried explicitly.
    ok = cudaOk(cudaGetLastError(), "myfunc1 launch");
  }
  // The blocking copy also reports any fault raised while the kernel ran.
  ok = ok && cudaOk(cudaMemcpy(&h, dev_ptr, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy");
  if (ok) printf("h = %d\n", h);

  // Release the device buffer on every path; cudaFree(NULL) is a no-op.
  cudaFree(dev_ptr);
  return ok ? 0 : 1;
}
$ nvcc -arch=sm_35 -o t48 t48.cu
$ cuda-memcheck ./t48
========= CUDA-MEMCHECK
h = 1
========= ERROR SUMMARY: 0 errors
$

We can make this slightly simpler by letting the functor's template type be deduced from the kernel's template type:

$ cat t48.cu
#include <stdio.h>

// Stateless function object wrapping atomicAdd.
// The call operator is itself a template, so the element type T is deduced
// from the arguments at the call site and one functor type serves every T.
struct myatomicadd
{
  // Atomically adds val to *addr and returns atomicAdd's result (per the CUDA
  // documentation, the value stored at *addr before the addition).
  // Declared const because the functor has no state, so it can also be
  // invoked through const objects and const references.
  template <typename T>
  __device__ T operator()(T *addr, T val) const {
    return atomicAdd(addr, val);
  }
};

// Kernel: invokes atomicFunc(address, 1) with the increment converted to T.
//   T       - element type pointed to by address.
//   TAtomic - callable type, passed by value as a kernel argument.
// Every thread that runs the kernel performs the call once; there is no
// thread-index guard.
template<typename T, typename TAtomic>
__global__ void myfunc1(T *address, TAtomic atomicFunc) {
    // Both arguments must agree on T for the callable's template deduction,
    // hence the explicit conversion of the literal.
    const T one = static_cast<T>(1);
    atomicFunc(address, one);
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what){
  if (err == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  return false;
}

// Runs myfunc1 with the same myatomicadd functor, first on an int and then on
// a float, and prints each result after copying it back to the host.
// Returns 0 on success and 1 if any CUDA call or kernel launch fails.
int main(){

  int *dev_ptr = NULL;
  float *dev_ptrf = NULL;
  int h = 0;
  float hf = 0;

  // --- int case ---
  bool ok = cudaOk(cudaMalloc(&dev_ptr, sizeof(int)), "cudaMalloc(int)");
  ok = ok && cudaOk(cudaMemset(dev_ptr, 0, sizeof(int)), "cudaMemset(int)");
  if (ok) {
    myfunc1<<<1,1>>>(dev_ptr, myatomicadd());
    // A kernel launch returns nothing; launch errors must be queried explicitly.
    ok = cudaOk(cudaGetLastError(), "myfunc1<int> launch");
  }
  // The blocking copy also reports any fault raised while the kernel ran.
  ok = ok && cudaOk(cudaMemcpy(&h, dev_ptr, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(int)");
  if (ok) printf("h = %d\n", h);

  // --- float case ---
  ok = ok && cudaOk(cudaMalloc(&dev_ptrf, sizeof(float)), "cudaMalloc(float)");
  // An all-zero byte pattern is 0.0f, so a byte-wise memset is valid here.
  ok = ok && cudaOk(cudaMemset(dev_ptrf, 0, sizeof(float)), "cudaMemset(float)");
  if (ok) {
    myfunc1<<<1,1>>>(dev_ptrf, myatomicadd());
    ok = cudaOk(cudaGetLastError(), "myfunc1<float> launch");
  }
  ok = ok && cudaOk(cudaMemcpy(&hf, dev_ptrf, sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(float)");
  if (ok) printf("hf = %f\n", hf);

  // Release both device buffers on every path; cudaFree(NULL) is a no-op.
  cudaFree(dev_ptrf);
  cudaFree(dev_ptr);
  return ok ? 0 : 1;
}
$ nvcc -arch=sm_35 -o t48 t48.cu
$ cuda-memcheck ./t48
========= CUDA-MEMCHECK
h = 1
hf = 1.000000
========= ERROR SUMMARY: 0 errors
$

Additional discussion of using device function pointers in CUDA is linked from this answer.

Passing an inline function as a template parameter

More articles: