CUDA: copy a dynamically created array of function pointers from the CPU to GPU memory

I would like to dynamically create a list of function pointers on the CPU (using some push_back() method called from main()) and copy it into a __constant__ or __device__ GPU array, without having to resort to static __device__ function pointers. I believe this question is related to my problem; however, my goal is to build the array of __host__ function pointers iteratively and then copy it into the __constant__ array of function pointers, instead of initializing the latter at its declaration.

An example of working code with static function pointers (see here or here ):

common.h:

#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <iostream>

#define num_functions 3

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Report a failed CUDA API call with its source location; terminate unless
// the caller opts out via abort=false.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code == cudaSuccess) return;  // nothing to report on success
   fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort) exit(code);
}

// fptr_t: Pointer to void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);

// some examples of void(int&, int&) functions...
// (device-side printf is for demonstration/debugging only)
__device__ void Add(int &a, int &b) {printf("Add... %i + %i = %i\n", a, b, a+b);}
__device__ void Subtract(int &a, int &b) {printf("Subtract... %i - %i = %i\n", a, b, a-b);}
__device__ void Multiply(int &a, int &b) {printf("Multiply... %i * %i = %i\n", a, b, a*b);}

// List of function pointers in device memory
// (__constant__ storage: filled from the host with cudaMemcpyToSymbol)
__constant__ fptr_t constant_fList[num_functions];

// Kernel called from main(): choose the function to apply whose index is equal to thread ID
// Kernel called from main(): each thread applies the function whose index
// equals its thread ID; threads beyond the table size do nothing.
__global__ void kernel(int a, int b) {
  if (threadIdx.x >= num_functions) return;  // bounds guard for oversized launches
  fptr_t chosen = constant_fList[threadIdx.x];
  chosen(a, b);
}

#endif

      

main.cu:

#include "common.h"

// Static device function pointers
__device__ fptr_t p_Add = Add;
__device__ fptr_t p_Sub = Subtract;
__device__ fptr_t p_Mul = Multiply;

// Load function list to constant memory.
// Each static __device__ pointer variable (p_Add, p_Sub, p_Mul) holds the
// device-side address of its function; cudaMemcpyFromSymbol reads that value
// back to the host so the three addresses can be uploaded in one batch.
void loadList_staticpointers() {
  fptr_t h_fList[num_functions];  // host staging array of device addresses
  gpuErrchk( cudaMemcpyFromSymbol(&h_fList[0], p_Add, sizeof(fptr_t)) );
  gpuErrchk( cudaMemcpyFromSymbol(&h_fList[1], p_Sub, sizeof(fptr_t)) );
  gpuErrchk( cudaMemcpyFromSymbol(&h_fList[2], p_Mul, sizeof(fptr_t)) );
  gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_fList, num_functions * sizeof(fptr_t)) );
}

int main() {

  // Gather the three device function addresses into constant_fList.
  loadList_staticpointers();
  int a = 12, b = 15;
  // One thread per function: thread i runs constant_fList[i](a, b).
  kernel<<<1,3>>>(a, b);
  gpuErrchk(cudaGetLastError());       // catches launch-configuration errors
  gpuErrchk(cudaDeviceSynchronize());  // catches asynchronous execution errors

  return 0;
}

      

Specs: GeForce GTX 670 compiled for -arch=sm_30

, CUDA 6.5, Ubuntu 14.04

I want to avoid using static device function pointers, since adding each function would require user-side code maintenance — declaring a new static pointer like p_Add or p_Mul, modifying void loadList_functionpointers(), and so on. To clarify the situation, I am trying to do something like the following (non-working) code:

main_wrong.cu:

main_wrong.cu:

#include "common.h"
#include <vector>

// Global variable: list of function pointers in host memory
std::vector<fptr_t> vec_fList;

// Add function to functions list
void addFunc(fptr_t f) {vec_fList.push_back(f);}

// Upload the functions in the std::vector<fptr_t> to GPU memory
// Copies CPU-side pointers to constant_fList, therefore crashes on kernel call
// NOTE(review): this is the intentionally broken attempt — the pointer VALUES
// copied here are host code addresses, which are meaningless to the device.
void UploadVector() {
  fptr_t* h_vpointer = vec_fList.data();
  gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_vpointer, vec_fList.size() * sizeof(fptr_t)) );
}

int main() {

  // Build the host-side list. Taking a __device__ function's address in host
  // code does not yield a usable device address, which is why the launch
  // below fails.
  addFunc(Add);
  addFunc(Subtract);
  addFunc(Multiply);
  int a = 12, b = 15;

  UploadVector();

  kernel<<<1,3>>>(a, b); // Wrong to call a host-side function pointer from a kernel
  gpuErrchk(cudaGetLastError());
  gpuErrchk(cudaDeviceSynchronize());

  return 0;
}

      

I understand that function pointers holding host addresses are copied to the GPU as-is and are not valid for the kernel, which needs pointers to GPU addresses when the function is called as f(a,b). Populating a host-side array with device-side pointers works for me with raw data (see this question), but not with function pointers. Trivial attempts with Unified Memory have failed as well... so far I have only found static device-side function pointers to work. Is there really no other way to copy a dynamically created array of function pointers to the GPU?

0


source to share


3 answers


If you can use C++11 (supported since CUDA 7), you can use the following to automatically generate the function table:

// Build the function table at compile time from a template parameter pack:
// each kernel instantiation embeds its own constexpr table of device function
// pointers, so no static __device__ pointer variables are needed.
// Requires C++11 (CUDA 7+).
template <fptr_t... Functions>
__global__ void kernel(int a, int b)
{
  constexpr auto num_f = sizeof...(Functions);  // table size, known at compile time

  constexpr fptr_t table[] = { Functions... };

  // One function per thread; surplus threads do nothing.
  if (threadIdx.x < num_f)
  {
    fptr_t f = table[threadIdx.x];
    f(a,b);
  }
}

      



Then you call this kernel with

kernel<Add, Subtract, Multiply><<<1,3>>>(a, b);

      

+4


source


Inspired by m.s.'s answer, I decided to pass the function pointer as a template parameter — this was actually the key to solving my problem — and found that filling a __device__ array of function pointers dev_fList from main() iteratively, without using static function pointers, is indeed possible. C++11 compatibility is not even needed!

Here is a working example using a __device__ array in global memory. I haven't tried copying it to constant memory yet, but once the global-memory array has been created satisfactorily, I guess cudaMemcpyToSymbol(..., cudaMemcpyDeviceToDevice) should do the trick.

The kernel kernel()

creates the GPU address for the function pointer dev_f

and copies the function f

that was passed as the template argument. Since this is an iterative process from the CPU, there is only one thread (thread 0

) involved in this kernel , which starts with the configuration <<<1,1>>>

. The static variable count_f

is indexed into dev_fList

.

common.h:

#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <iostream>

#define num_functions 3

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Print a readable diagnostic for a failed CUDA API call and, unless the
// caller passes abort=false, terminate with the error code.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// fptr_t: Pointer to void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);

// some examples of void(int&, int&) functions...
// (device-side printf is for demonstration/debugging only)
__device__ void Add(int &a, int &b) {printf("Add... %i + %i = %i\n", a, b, a+b);}
__device__ void Subtract(int &a, int &b) {printf("Subtract... %i - %i = %i\n", a, b, a-b);}
__device__ void Multiply(int &a, int &b) {printf("Multiply... %i * %i = %i\n", a, b, a*b);}

// List of function pointers in device memory
// Note that, in my example, it resides in global memory space, not constant memory
// (it is written from inside a kernel; __constant__ memory cannot be written by device code)
__device__ fptr_t dev_fList[num_functions];

#endif

      

main.cu:

#include "common.h"

// Index in dev_fList[] == number of times addFunc<>() was launched
static int count_f = 0;

// Kernel that copies function f to the GPU.
// Launched with <<<1,1>>>: a single thread stores the device-side address of
// the template argument f into dev_fList[idx], then invokes it once to verify.
template<fptr_t f>
__global__ void kernel(int a, int b, int idx) {
  fptr_t dev_f = f; // Create device function pointer
  dev_fList[idx] = dev_f; // Populate the GPU array of function pointers
  dev_fList[idx](a,b); // Make sure that the array was populated correctly
}

// Add function to functions list.
// Registers f in the next free slot of dev_fList via a one-thread kernel
// launch; aborts when the statically sized device array is already full.
template<fptr_t f>
void addFunc(const int &a, const int &b) {
  if (count_f < num_functions) {
    kernel<f><<<1,1>>>(a, b, count_f);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
    ++count_f;
  } else {
    std::cout << "Error: not enough memory statically allocated on device!\n";
    exit(EXIT_FAILURE);
  }
}

int main() {
  int a = 12, b = 15;
  // Each call instantiates kernel<f> and appends f's device address to dev_fList.
  addFunc<Add>(a,b);
  addFunc<Subtract>(a,b);
  addFunc<Multiply>(a,b);

  return 0;
}

      



Edit: added a copy of the array of function pointers to constant memory

For what it's worth, here is how to copy our dev_fList array into constant memory:

In common.h:

// Copy of the function table, held in constant memory (filled from main.cu).
__constant__ fptr_t cst_fList[num_functions];

// Sanity-check kernel: threads 0..idx-1 each call their entry of cst_fList.
__global__ void cst_test(int a, int b, int idx) {
   if (threadIdx.x < idx) cst_fList[threadIdx.x](a,b);
}

      

In the main.cu function main()

after adding all the desired functions:

  // Stage the device-resident function pointers through a host array, then
  // upload them into constant memory.
  // Fixes two bugs in the original fragment: (1) it copied
  // count_f * sizeof(fptr_t) bytes into the address of a single fptr_t*
  // variable — a stack buffer overflow; (2) it passed dev_fList[0] and
  // cst_fList[0] (host-side dereferences of device symbols) where the
  // symbols themselves are required by cudaMemcpyFromSymbol/ToSymbol.
  fptr_t temp[num_functions];
  gpuErrchk( cudaMemcpyFromSymbol(temp, dev_fList, count_f * sizeof(fptr_t)) );
  gpuErrchk( cudaMemcpyToSymbol(cst_fList, temp, count_f * sizeof(fptr_t)) );

  cst_test<<<1,count_f>>>(a,b, count_f);
  gpuErrchk(cudaGetLastError());
  gpuErrchk(cudaDeviceSynchronize());

      

This may seem ugly as I understand that memory is passed to the host through temp

and then back to the device; more elegant suggestions are welcome.

0


source


You cannot use dynamically created CUDA device function pointers (at least not without crashes or undefined behavior). Template-based solutions work at compile time, not dynamically. CUDA device function pointers rely on device symbols that are visible at global scope, which means a device function pointer must already be declared for every function. It also means you cannot use ordinary C function pointers as references that are set at run time. All in all, the use of CUDA device function pointers is questionable; template-based approaches look convenient, but they are not dynamic by definition.

An example using a struct with function pointers:

This example shows a struct that holds some function pointers. In normal C++ code you can set and change the function pointers while the program is running (dynamically). With CUDA, the example below is not possible, because the function pointers in the struct are not valid device symbols. This means they cannot be used with cudaMemcpyFromSymbol. To work around this, either a reference to the original function (the target of the function pointer) or global CUDA device function pointers must be created. Neither is dynamic.

This is a dynamic assignment:

// Plain C++ function-pointer typedefs used by the struct below.
typedef float (*pDistanceFu) (float, float);
typedef float (*pDecayFu)    (float, float, float);

// In C++ you can set and reset the function pointer during run time whenever you want ..
struct DistFunction {
  /*__host__ __device__*/ pDistanceFu distance; // uncomment for NVCC ..
  /*__host__ __device__*/ pDecayFu rad_decay;
  /*__host__ __device__*/ pDecayFu lrate_decay;
};

// you can do what you want ..
DistFunction foo, bar;
foo.distance = bar.distance;
// ..

      

This is how it should be with CUDA, but it won't work because there is no valid device symbol :(

// NOTE(review): intentionally NON-working CUDA code — the struct members are
// run-time pointer values, not named device symbols, so cudaMemcpyFromSymbol
// cannot resolve them.
pDistanceFu hDistance; 
pDecayFu hRadDay; 
pDecayFu hLRateDecay; 

void DeviceAssign(DistFunction &dist) {      
  cudaMemcpyFromSymbol(&hDistance, dist.distance, sizeof(pDistanceFu) );
  cudaMemcpyFromSymbol(&hRadDay, dist.rad_decay, sizeof(pDecayFu) );
  cudaMemcpyFromSymbol(&hLRateDecay, dist.lrate_decay, sizeof(pDecayFu) );

  dist.distance = hDistance;
  dist.rad_decay = hRadDay;
  dist.lrate_decay = hLRateDecay;
} 

      

Here is the classic way, but note that it is not dynamic, because the device symbol must refer to a concrete function, not to a pointer that could be exchanged at run time.

// .. and this would work
#ifdef __CUDACC__
  __host__ __device__
#endif
inline float fcn_rad_decay (float sigma0, float T, float lambda) {
  // expf instead of exp: keeps the computation in single precision
  return std::floor(sigma0*expf(-T/lambda) + 0.5f);
}

// The pointer's type must match the pointee's signature:
// fcn_rad_decay(float, float, float) is a pDecayFu, NOT a pDistanceFu as the
// original declared (a compile error). The device symbol must target a
// concrete function; no host pointer is possible.
__device__ pDecayFu pFoo = fcn_rad_decay;

void DeviceAssign2(DistFunction &dist) {
  // Read the device-side pointer value through the named symbol pFoo.
  // (Passing the host address &fcn_rad_decay here, as the original did,
  // is not a device symbol and would fail.)
  cudaMemcpyFromSymbol(&hLRateDecay, pFoo, sizeof(pDecayFu) );
  // ..

  dist.lrate_decay = hLRateDecay;
  // ..
}

      

-2


source







All Articles