Why don't cudaMemcpyAsync (host-to-device) transfers and CUDA kernel execution overlap?
I have uploaded an image (8 bit, unsigned char) sized 1080 x 1920
. For testing purposes, I am processing the same image 4 times using for loop
and then creating its timeline profiling.
Strategy: I split the image into 3 parts and created three CUDA streams, one per part, so the whole image is handled concurrently.
Below is a minimal working example. I'm sorry it needs to load an image using OpenCV, but I don't know how to reproduce the same situation without loading an image through OpenCV.
Problem: The timeline profile shows that the first stream has finished transferring its data, yet the kernel assigned to that stream still does not start. The kernel of the first stream and the host-to-device transfer of the third stream do run in parallel. So my question is: why didn't the first stream's kernel start in parallel with the second stream's transfer?
GPU: NVIDIA Quadro K2000, 3.0 compliant
Timeline profile: Each stream is assigned a different color.
My code:
// Copies one third of an interleaved image (numChannels bytes per pixel).
// Each stream launches this kernel over its own third of the buffer, so the
// valid row range here is [0, ih / 3).
//
// Grid/block layout: 2D grid, x indexes columns [0, iw), y indexes rows of
// the one-third slice. Extra threads past the edges exit via the guard.
// numChannels : bytes (channels) per pixel, e.g. 3 for BGR
// iw, ih      : full-image width and height in pixels
// ptr_source  : device pointer to the start of this stream's source third
// ptr_dst     : device pointer to the start of this stream's destination third
__global__ void multiStream_ColorTransformation_kernel(int numChannels, int iw, int ih, unsigned char *ptr_source, unsigned char *ptr_dst)
{
    // Calculate our pixel location.
    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
    // Operate only inside this stream's one-third slice of the image.
    // (x and y are derived from unsigned built-ins, so only the upper
    // bounds need checking.)
    if (x >= iw || y >= ih / 3)
        return;
    int pixelBase = numChannels * (iw * y + x);
    // Copy every channel of the pixel (was hard-coded to 3 channels;
    // now honors the numChannels parameter — identical for numChannels == 3).
    for (int c = 0; c < numChannels; ++c)
        ptr_dst[pixelBase + c] = ptr_source[pixelBase + c];
}
// Launches the one-third copy kernel on the given stream.
// dev_src/dev_dst point at the start of this stream's third of the image;
// iw/ih are the FULL image dimensions (the kernel guard limits y to ih/3).
void callMultiStreamingCudaKernel(unsigned char *dev_src, unsigned char *dev_dst, int numChannels, int iw, int ih, cudaStream_t *ptr_stream)
{
    const int tile = 20;                 // threads per block in each dimension
    const int sliceRows = ih / 3;        // each stream processes one third of the rows
    dim3 numOfThreadsPerBlocks(tile, tile);
    // Ceil-divide so edge pixels are still covered when iw/ih are not
    // multiples of the tile size (plain integer division dropped them),
    // and only launch enough block rows for one third of the image —
    // the kernel guard (y < ih/3) made the remaining 2/3 of the blocks
    // in the original launch pure no-ops.
    dim3 numOfBlocks((iw + tile - 1) / tile, (sliceRows + tile - 1) / tile);
    multiStream_ColorTransformation_kernel << <numOfBlocks, numOfThreadsPerBlocks, 0, *ptr_stream >> >(numChannels, iw, ih, dev_src, dev_dst);
    return;
}
int main()
{
    cudaStream_t stream_one;
    cudaStream_t stream_two;
    cudaStream_t stream_three;
    cudaStreamCreate(&stream_one);
    cudaStreamCreate(&stream_two);
    cudaStreamCreate(&stream_three);
    Mat image = imread("DijSDK_test_image.jpg", 1);
    //Mat image(1080, 1920, CV_8UC3, Scalar(0,0,255));
    int numChannels = 3;
    // Width comes from cols, height from rows (the original had these
    // swapped; harmless for this flat copy because only iw*ih matters,
    // but misleading to readers).
    int iw = image.cols;
    int ih = image.rows;
    size_t numBytes = (size_t)image.rows * image.cols * numChannels;
    size_t totalMemSize = numBytes * sizeof(unsigned char);
    size_t oneThirdMemSize = totalMemSize / 3;
    unsigned char *dev_src_1, *dev_src_2, *dev_src_3, *dev_dst_1, *dev_dst_2, *dev_dst_3, *h_src, *h_dst;
    //Allocate device memory for SOURCE and DESTINATION thirds.
    cudaMalloc((void**)&dev_src_1, oneThirdMemSize);
    cudaMalloc((void**)&dev_src_2, oneThirdMemSize);
    cudaMalloc((void**)&dev_src_3, oneThirdMemSize);
    cudaMalloc((void**)&dev_dst_1, oneThirdMemSize);
    cudaMalloc((void**)&dev_dst_2, oneThirdMemSize);
    cudaMalloc((void**)&dev_dst_3, oneThirdMemSize);
    //Host-side destination image.
    Mat org_dijSDK_img(image.rows, image.cols, CV_8UC3, Scalar(0, 0, 255));
    h_dst = org_dijSDK_img.data;
    h_src = image.data;
    // THE FIX for the missing copy/compute overlap: cudaMemcpyAsync only
    // runs asynchronously (and thus overlaps with kernels in other streams)
    // when the host buffer is page-locked. OpenCV's Mat::data is ordinary
    // pageable memory, so pin the existing buffers in place instead of
    // reallocating with cudaHostAlloc.
    cudaHostRegister(h_src, totalMemSize, cudaHostRegisterPortable);
    cudaHostRegister(h_dst, totalMemSize, cudaHostRegisterPortable);
    //while (1)
    for (int i = 0; i < 3; i++)
    {
        std::cout << "\nLoop: " << i;
        //H2D copy + kernel for each third, each on its own stream, so the
        //three pipelines can overlap on the copy and kernel engines.
        cudaMemcpyAsync(dev_src_1, h_src, oneThirdMemSize, cudaMemcpyHostToDevice, stream_one);
        callMultiStreamingCudaKernel(dev_src_1, dev_dst_1, numChannels, iw, ih, &stream_one);
        cudaMemcpyAsync(dev_src_2, h_src + oneThirdMemSize, oneThirdMemSize, cudaMemcpyHostToDevice, stream_two);
        callMultiStreamingCudaKernel(dev_src_2, dev_dst_2, numChannels, iw, ih, &stream_two);
        cudaMemcpyAsync(dev_src_3, h_src + (2 * oneThirdMemSize), oneThirdMemSize, cudaMemcpyHostToDevice, stream_three);
        callMultiStreamingCudaKernel(dev_src_3, dev_dst_3, numChannels, iw, ih, &stream_three);
        //RESULT copy: GPU to CPU, straight into the OpenCV buffer.
        cudaMemcpyAsync(h_dst, dev_dst_1, oneThirdMemSize, cudaMemcpyDeviceToHost, stream_one);
        cudaMemcpyAsync(h_dst + oneThirdMemSize, dev_dst_2, oneThirdMemSize, cudaMemcpyDeviceToHost, stream_two);
        cudaMemcpyAsync(h_dst + (2 * oneThirdMemSize), dev_dst_3, oneThirdMemSize, cudaMemcpyDeviceToHost, stream_three);
        //Wait for all three pipelines before touching the result on the host.
        cudaStreamSynchronize(stream_one);
        cudaStreamSynchronize(stream_two);
        cudaStreamSynchronize(stream_three);
        //h_dst IS org_dijSDK_img.data, so the Mat already holds the result;
        //the old `org_dijSDK_img.data = h_dst;` self-assignment is dropped.
        imshow("Processed dijSDK image", org_dijSDK_img);
        waitKey(33);
    }
    //Release pinned registrations and device resources (were leaked before).
    cudaHostUnregister(h_src);
    cudaHostUnregister(h_dst);
    cudaFree(dev_src_1);
    cudaFree(dev_src_2);
    cudaFree(dev_src_3);
    cudaFree(dev_dst_1);
    cudaFree(dev_dst_2);
    cudaFree(dev_dst_3);
    cudaStreamDestroy(stream_one);
    cudaStreamDestroy(stream_two);
    cudaStreamDestroy(stream_three);
    cudaDeviceReset();
    return 0;
}
UPDATE-1: If I remove the kernel call of the first stream, then the second kernel and the third stream's host-to-device copy overlap somehow (not completely), as shown below.
UPDATE-2: I even tried using 10 streams and everything remained the same. The first stream's kernel started only after the H2D copy of the tenth stream's data had completed.
source to share
As the commenters have already pointed out, the host memory must be page-locked.
There is no need to allocate additional host memory via cudaHostAlloc
; you can use cudaHostRegister
on the existing OpenCV image buffer:
cudaHostRegister(image.data, totalMemSize, cudaHostRegisterPortable)
source to share