Function pointer (to another kernel) as a kernel argument in CUDA

With dynamic parallelism in CUDA, you can launch kernels from GPU code starting at a certain compute capability. I have a wrapper function that takes a pointer to the kernel I want to use and either runs it on the CPU for older devices or on the GPU for newer devices. The fallback path works fine, but the GPU path does not; it fails saying the memory alignment is wrong.

Is there a way to do this in CUDA (7)? Are there some lower-level calls that will give me an address for the kernel that is correct on the GPU?

In the code below, the "TFunc" template is an attempt to get the compiler to do something different, but I also tried hard-coding it.

template <typename TFunc, typename... TArgs>
__global__ void Test(TFunc func, int count, TArgs... args)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 320)
    (*func)<<<1, 1>>>(args...);
#else
    printf("What are you doing here!?\n");
#endif
}

template <typename... TArgs>
__host__ void Iterate(void(*kernel)(TArgs...), const systemInfo *sysInfo, int count, TArgs... args)
{
    if(sysInfo->getCurrentDevice()->compareVersion("3.2") > 0)
    {
        printf("Iterate on GPU\n");
        Test<<<1, 1>>>(kernel, count, args...);
    }
    else
    {
        printf("Iterate on CPU\n");
        Test<<<1, 1>>>(kernel, count, args...);
    }
}

      

1 answer


EDIT: At the time I originally wrote this answer, I believe the statements were correct: it was not possible to take a kernel's address in host code. However, I believe something has changed in CUDA since then, so now (in CUDA 8, and maybe earlier) it is possible to take a kernel's address in host code (it is still not possible to take the address of a __device__ function in host code, though).
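
For example, something like the following minimal sketch should now work (a sketch of the idea, not a tested example; assuming CUDA 8 or newer, compiled like the examples below with -rdc=true for a cc 3.5+ device):

#include <stdio.h>

__global__ void child(){

  printf("hello from child\n");
}

__global__ void parent(void (*kernel)()){

  kernel<<<1,1>>>();      // device-side launch through the pointer
}

int main(){

  parent<<<1,1>>>(child); // kernel address taken directly in host code (CUDA 8+)
  cudaDeviceSynchronize();
  return 0;
}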

ORIGINAL ANSWER:

This question seems to come up from time to time, although the previous examples I can think of have to do with calling __device__ functions instead of __global__ functions.

In general, it is illegal to take the address of a device entity (variable, function) in host code.
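
For example (a quick sketch of the rule, not from the original code): for a __device__ variable, host code must go through the symbol APIs such as cudaMemcpyFromSymbol rather than using the address-of operator:

__device__ int d_val;

void host_side(){

  int h_val;
  // int *p = &d_val;  // illegal: taking a device address in host code
  cudaMemcpyFromSymbol(&h_val, d_val, sizeof(int));  // legal: copy via the symbol API
}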

One possible way to work around this (although it's not clear to me why; it seems there should be simpler dispatch mechanisms) is to extract the required device address in device code and return that value to the host for use in dispatch. In this case, I create a simple example that extracts the needed device addresses into __device__ variables, but you could also write a kernel to do this setup (i.e. to "give me the address of a pointer that is correct on the GPU", in your words).



Here's a rough example based on the code you showed:

$ cat t746.cu
#include <stdio.h>

__global__ void ckernel1(){

  printf("hello1\n");
}
__global__ void ckernel2(){

  printf("hello2\n");
}
__global__ void ckernel3(){

  printf("hello3\n");
}

// device-side function pointers: initialized in device code, where taking
// the address of a __global__ function is legal
__device__ void (*pck1)() = ckernel1;
__device__ void (*pck2)() = ckernel2;
__device__ void (*pck3)() = ckernel3;

template <typename TFunc, typename... TArgs>
__global__ void Test(TFunc func, int count, TArgs... args)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
    (*func)<<<1, 1>>>(args...);
#else
    printf("What are you doing here!?\n");
#endif
}

template <typename... TArgs>
__host__ void Iterate(void(*kernel)(TArgs...), const int sysInfo, int count, TArgs... args)
{
    if(sysInfo >= 350)
    {
        printf("Iterate on GPU\n");
        Test<<<1, 1>>>(kernel, count, args...);
    }
    else
    {
        printf("Iterate on CPU\n");
        Test<<<1, 1>>>(kernel, count, args...);
    }
}


int main(){

  void (*h_ckernel1)();
  void (*h_ckernel2)();
  void (*h_ckernel3)();
  // copy the device-side kernel addresses back to the host for dispatch
  cudaMemcpyFromSymbol(&h_ckernel1, pck1, sizeof(void *));
  cudaMemcpyFromSymbol(&h_ckernel2, pck2, sizeof(void *));
  cudaMemcpyFromSymbol(&h_ckernel3, pck3, sizeof(void *));
  Iterate(h_ckernel1, 350, 1);
  Iterate(h_ckernel2, 350, 1);
  Iterate(h_ckernel3, 350, 1);
  cudaDeviceSynchronize();
  return 0;
}

$ nvcc -std=c++11 -arch=sm_35 -o t746 t746.cu -rdc=true -lcudadevrt
$ cuda-memcheck ./t746
========= CUDA-MEMCHECK
Iterate on GPU
Iterate on GPU
Iterate on GPU
hello1
hello2
hello3
========= ERROR SUMMARY: 0 errors
$

      

The above (__device__ variable) method probably can't be made to work with templated child kernels, but it should be possible to create a templated "extractor" kernel that returns the address of the (instantiated) templated child kernel. A rough idea of the "extractor" setup_kernel method is given in the previous answer I linked. Here's a rough example of the templated child kernel / extractor kernel method:

$ cat t746.cu
#include <stdio.h>

template <typename T>
__global__ void ckernel1(T *data){

  int my_val = (int)(*data+1);
  printf("hello: %d \n", my_val);
}
template <typename TFunc, typename... TArgs>
__global__ void Test(TFunc func, int count, TArgs... args)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
    (*func)<<<1, 1>>>(args...);
#else
    printf("What are you doing here!?\n");
#endif
}

template <typename... TArgs>
__host__ void Iterate(void(*kernel)(TArgs...), const int sysInfo, int count, TArgs... args)
{
    if(sysInfo >= 350)
    {
        printf("Iterate on GPU\n");
        Test<<<1, 1>>>(kernel, count, args...);
    }
    else
    {
        printf("Iterate on CPU\n");
        Test<<<1, 1>>>(kernel, count, args...);
    }
}

// extractor kernel: runs on the device, where taking the address of the
// instantiated templated child kernel is legal
template <typename T>
__global__ void extractor(void (**kernel)(T *)){

  *kernel = ckernel1<T>;
}

template <typename T>
void run_test(T init){

  void (*h_ckernel1)(T *);
  void (**d_ckernel1)(T *);
  T *d_data;
  cudaMalloc(&d_ckernel1, sizeof(void *));
  cudaMalloc(&d_data, sizeof(T));
  cudaMemcpy(d_data, &init, sizeof(T), cudaMemcpyHostToDevice);
  // run the extractor on the device to obtain the kernel's device address
  extractor<<<1,1>>>(d_ckernel1);
  cudaMemcpy((void *)&h_ckernel1, (void *)d_ckernel1, sizeof(void *), cudaMemcpyDeviceToHost);
  Iterate(h_ckernel1, 350, 1, d_data);
  cudaDeviceSynchronize();
  cudaFree(d_ckernel1);
  cudaFree(d_data);
  return;
}

int main(){

  run_test(1);
  run_test(2.0f);

  return 0;
}

$ nvcc -std=c++11 -arch=sm_35 -o t746 t746.cu -rdc=true -lcudadevrt
$ cuda-memcheck ./t746
========= CUDA-MEMCHECK
Iterate on GPU
hello: 2
Iterate on GPU
hello: 3
========= ERROR SUMMARY: 0 errors
$

      
