Passing CUDA Function Pointers with Libraries

I am using CUDA and am trying to use a function pointer to pass a CUDA function to a library that later uses that function in its device kernel, similar to the CUDA function pointer example.

The important sections of the code are:

/** Type definition for the execution function in #qsched_run. */
typedef void (*qsched_funtype)( int , void * );

/* Device function that is meant to be invoked through a qsched_funtype pointer
   (its body is elided in the question). */
__device__ void gpuTest(int type , void *data)
{
  ....
}
/* Device-side variable initialised with the address of gpuTest; main() below reads
   its value back to the host with cudaMemcpyFromSymbol. */
__device__ qsched_funtype function = gpuTest;

/* Host entry point (abridged in the question): reads the device function pointer
   and passes it to the library.
   NOTE(review): the declarations of `func` and `s` are not shown here - presumably
   they are part of the omitted initialization; confirm. */
void main(...)
{
//Various initialization setup.

/* Copy the value stored in the __device__ variable `function` into the host variable `func`. */
if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
    error("Failed to copy function pointer from device");

/* Hand the retrieved pointer value to the library entry point. */
qsched_run_CUDA( &s , func );
}

      

The qsched_run_CUDA function is a library function that does some initialization, copies the function pointer to the device (into a variable that the kernel can see), and then starts the kernel, which at some point calls the gpuTest function through that function pointer.

The code compiles correctly if I use -G with the following nvcc call:

nvcc -g -G -m64 -I../src ../src/.libs/libquicksched_cuda.a -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda -DWITH_CUDA -gencode arch=compute_30,code=sm_30 -lgomp test_gpu_simple.cu -o out.out

      

Where

../src/.libs/libquicksched_cuda.a

      

is a library containing the qsched_run_CUDA function.

As soon as I remove the -G flag from my nvcc call, everything breaks down: the kernel launched in qsched_run_CUDA crashes with an "invalid program counter" error, and the function pointer (including the one in my own .cu file) is set to 0x4.

Presumably I need to use separate compilation in CUDA ( http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#using-separate-compilation-in-cuda ), as described in "Pointer Context Cuda functions". However, I'm not sure how to do this when using library functions; neither the nvcc manual nor the Stack Overflow link makes it obvious how to do this.

Does anyone have any experience with this? I briefly tried to work out how to use nvlink for this, but I didn't get far (I don't think I managed to pass the library to it).

+1


source to share


1 answer


Yes, you will need to use separate compilation. I've put together a simple test case based on what you've shown so far and on the separate-compilation library example from the nvcc documentation. Here is the code:

kernel_lib.cu:

#include <stdio.h>

/*
 * cudaCheckErrors(msg): abort the process if the CUDA runtime has a pending error.
 *
 * Fetches the pending error with cudaGetLastError(). If it is not cudaSuccess,
 * prints `msg`, the runtime's error string and the source location to stderr,
 * then terminates with exit status 1.
 *
 * The temporary uses a name without a leading double underscore (such names are
 * reserved for the implementation), and `msg` is parenthesised so that any
 * expression can be passed safely.
 */
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                (msg), cudaGetErrorString(cuda_check_err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

/** Type definition for the execution function in #qsched_run. */
typedef void (*qsched_funtype)( int , void * );

/*
 * Kernel that forwards (type, data) to the function whose address is given in `func`.
 *
 * type - integer passed through unchanged as the callee's first argument.
 * data - opaque pointer passed through unchanged as the callee's second argument.
 * func - address reinterpreted as a qsched_funtype and called.
 *        NOTE(review): assumed to be an address that is valid in device code
 *        (e.g. read back from a __device__ variable) - confirm at the call site.
 *
 * Every thread of the launch performs the call; no thread-index filtering is done here.
 */
__global__ void mykernel(int type, void *data, void *func){
  qsched_funtype callee = (qsched_funtype)func;
  callee(type, data);
}

/*
 * Library entry point: runs mykernel once and waits for it to finish.
 *
 * val    - integer forwarded to the kernel (and from there to the callee).
 * d_data - pointer forwarded to the kernel.
 *          NOTE(review): assumed to be device memory - confirm with the caller.
 * func   - function address forwarded to the kernel, which calls it.
 *
 * Returns 0. Any CUDA error is reported by cudaCheckErrors, which prints a
 * diagnostic and terminates the process instead of returning.
 */
int qsched_run_CUDA(int val, void *d_data, void *func)
{
  /* Single block, single thread: the callee is invoked exactly once. */
  mykernel<<<1,1>>>(val, d_data, func);
  /* A kernel launch does not return a status, so check for launch errors right
     away; this keeps them distinguishable from faults raised while the kernel runs. */
  cudaCheckErrors("kernel launch fail");
  /* Block until the kernel has finished so that execution errors become visible. */
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  return 0;
}

      

main.cu:



#include <stdio.h>
#define DATA_VAL 5

int qsched_run_CUDA(int, void*, void*);

/*
 * cudaCheckErrors(msg): abort the process if the CUDA runtime has a pending error.
 *
 * Fetches the pending error with cudaGetLastError(). If it is not cudaSuccess,
 * prints `msg`, the runtime's error string and the source location to stderr,
 * then terminates with exit status 1.
 *
 * The temporary uses a name without a leading double underscore (such names are
 * reserved for the implementation), and `msg` is parenthesised so that any
 * expression can be passed safely.
 */
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                (msg), cudaGetErrorString(cuda_check_err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

/** Type definition for the execution function in #qsched_run. */
typedef void (*qsched_funtype)( int , void * );

/*
 * Device callback used by the test: stores `type` into the first int that
 * `data` points to.
 *
 * type - value to store.
 * data - pointer treated as an int array; element 0 is overwritten.
 */
__device__ void gpuTest(int type , void *data)
{
  int *out = (int *)data;
  *out = type;
}
/* Device-resident variable initialised with the address of gpuTest. main() copies
   its value to the host with cudaMemcpyFromSymbol and passes it to qsched_run_CUDA. */
__device__ qsched_funtype function = gpuTest;


/*
 * Test driver.
 *
 * 1. Reads the value of the __device__ variable `function` (the address of
 *    gpuTest) back to the host.
 * 2. Allocates one int on the device and zeroes it.
 * 3. Calls qsched_run_CUDA, which runs a kernel that calls the function pointer
 *    with DATA_VAL and the device buffer.
 * 4. Copies the int back and verifies that it now equals DATA_VAL.
 *
 * Returns 0 on success and 1 if the value read back is wrong. CUDA API failures
 * are handled by cudaCheckErrors, which prints a diagnostic and exits.
 */
int main()
{
  /* Start from a known value so a failed copy cannot leave garbage in the pointer. */
  void *func = NULL;
  cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype));
  cudaCheckErrors("Failed to copy function pointer from device");

  int h_data = 0;
  int *d_data = NULL;
  cudaMalloc((void **)&d_data, sizeof(int));
  cudaCheckErrors("cudaMalloc fail");
  cudaMemset(d_data, 0, sizeof(int));
  cudaCheckErrors("cudaMemset fail");

  int return_val = qsched_run_CUDA(DATA_VAL, (void *)d_data, func);
  if (return_val != 0) printf("return code error\n");

  cudaMemcpy(&h_data, d_data, sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy fail");

  /* The result is on the host now; release the device buffer before either
     return path so it is not leaked. */
  cudaFree(d_data);
  cudaCheckErrors("cudaFree fail");

  if (h_data != DATA_VAL) {printf("Fail! %d\n", h_data); return 1;}
  printf("Success!\n");
  return 0;
}

      

compilation commands and result:

$ nvcc -arch=sm_20 -dc kernel_lib.cu
$ nvcc -lib kernel_lib.o -o test.a
$ nvcc -arch=sm_20 -dc main.cu
$ nvcc -arch=sm_20 main.o test.a -o test
$ ./test
Success!
$

      

I used CUDA 5.0 for this test.

+3


source







All Articles