Passing an inline function as a template parameter
I am trying to pass an atomicAdd function to another function as a template parameter.
Here is my Kernel1:
template<typename T, typename TAtomic>
__global__ void myfunc1(T *address, TAtomic atomicFunc) {
atomicFunc(address, 1);
}
Try 1:
myfunc1<<<1,1>>>(val.dev_ptr, atomicAdd);
This does not work because the compiler cannot match the expected function signature.
Try it 2: First, I am porting atomAdd into a custom function MyAtomicAdd.
template<typename T>
__device__ void MyAtomicAdd(T *address, T val) {
atomicAdd(address, val);
}
Then I defined a function pointer called TAtomic and I declare the TAtomic parameter as a template.
typedef void (*TAtomic)(float *,float);
template<typename T, TAtomic atomicFunc>
__global__ void myfunc2(T *address) {
atomicFunc(address, 1);
}
myfunc2<float, MyAtomicAdd><<<1,1>>>(dev_ptr);
CUDA_CHECK(cudaDeviceSynchronize());
Actually, Try 2 works. But I dont want to use typedef. I need something more general.
Try 3: Just pass MyAtomicAdd to myfunc1.
myfunc1<<<1,1>>>(dev_ptr, MyAtomicAdd<float>);
CUDA_CHECK(cudaDeviceSynchronize());
The compiler can compile the code. But when I run the program the error was reported:
"ERROR in /home/liang/groute-dev/samples/framework/pagerank.cu:70: invalid program counter (76)"
I'm just wondering why try 3 isn't working? And is there any simple or gentle way to implement this requirement? Thank.
source to share
Try 3 doesn't work because you are trying to take the address of the function __device__
into the host code, which is illegal in CUDA:
myfunc1<<<1,1>>>(dev_ptr, MyAtomicAdd<float>);
^
effectively a function pointer - address of a __device__ function
Such attempts at use in CUDA allow some sort of "address", but this is garbage, so when you try to use it as the actual entry point of a function into the device code, you get the error: invalid program counter
(or in some cases just illegal address
).
You can make your Try 3 method (without typedef
) by wrapping the inner expression in a functor instead of a naked function __device__
:
$ cat t48.cu
#include <stdio.h>
template<typename T>
__device__ void MyAtomicAdd(T *address, T val) {
atomicAdd(address, val);
}
template <typename T>
struct myatomicadd
{
__device__ T operator()(T *addr, T val){
return atomicAdd(addr, val);
}
};
template<typename T, typename TAtomic>
__global__ void myfunc1(T *address, TAtomic atomicFunc) {
atomicFunc(address, (T)1);
}
int main(){
int *dev_ptr;
cudaMalloc(&dev_ptr, sizeof(int));
cudaMemset(dev_ptr, 0, sizeof(int));
// myfunc1<<<1,1>>>(dev_ptr, MyAtomicAdd<int>);
myfunc1<<<1,1>>>(dev_ptr, myatomicadd<int>());
int h = 0;
cudaMemcpy(&h, dev_ptr, sizeof(int), cudaMemcpyDeviceToHost);
printf("h = %d\n", h);
return 0;
}
$ nvcc -arch=sm_35 -o t48 t48.cu
$ cuda-memcheck ./t48
========= CUDA-MEMCHECK
h = 1
========= ERROR SUMMARY: 0 errors
$
We can implement a slightly simpler version of this question by letting you infer the type of the functor template from the core template type:
$ cat t48.cu
#include <stdio.h>
struct myatomicadd
{
template <typename T>
__device__ T operator()(T *addr, T val){
return atomicAdd(addr, val);
}
};
template<typename T, typename TAtomic>
__global__ void myfunc1(T *address, TAtomic atomicFunc) {
atomicFunc(address, (T)1);
}
int main(){
int *dev_ptr;
cudaMalloc(&dev_ptr, sizeof(int));
cudaMemset(dev_ptr, 0, sizeof(int));
myfunc1<<<1,1>>>(dev_ptr, myatomicadd());
int h = 0;
cudaMemcpy(&h, dev_ptr, sizeof(int), cudaMemcpyDeviceToHost);
printf("h = %d\n", h);
float *dev_ptrf;
cudaMalloc(&dev_ptrf, sizeof(float));
cudaMemset(dev_ptrf, 0, sizeof(float));
myfunc1<<<1,1>>>(dev_ptrf, myatomicadd());
float hf = 0;
cudaMemcpy(&hf, dev_ptrf, sizeof(float), cudaMemcpyDeviceToHost);
printf("hf = %f\n", hf);
return 0;
}
$ nvcc -arch=sm_35 -o t48 t48.cu
$ cuda-memcheck ./t48
========= CUDA-MEMCHECK
h = 1
hf = 1.000000
========= ERROR SUMMARY: 0 errors
$
Additional steps on using device function pointers in CUDA are linked to this answer .
source to share