Why aren't cudaMemcpyAsync (host to device) and CUDA core parallel?

Question

Why aren't cudaMemcpyAsync (host to device) and CUDA core parallel?

I have uploaded an image (8 bit, unsigned char) sized 1080 x 1920

. For testing purposes, I am processing the same image 4 times using for loop

and then creating its timeline profiling.

Strategy: . I split the image into 3 parts. I made three threads to handle this whole image.

Below is a minimal working example. I'm sorry he needs an image using OpenCV, but I don't know how I can simulate the same situation without loading an image using OpenCV.

Problem: Timeline profiling shows that the first thread finished transferring data, but still the kernel assigned to it did not start. The core assigned to the first thread and the transfer of data by the third thread are parallel. So my question is, why didn't the processing of the first kernel thread start in parallel with the transfer of the second thread?

GPU: NVIDIA Quadro K2000, 3.0 compliant

Timeline profile: Each stream is assigned a different color.

My code:

__global__ void multiStream_ColorTransformation_kernel(int numChannels, int iw, int ih, unsigned char *ptr_source, unsigned char *ptr_dst)
{
    // Calculate our pixel location
    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
    int y = (blockIdx.y * blockDim.y) + threadIdx.y;

    // Operate only if we are in the correct boundaries
    if (x >= 0 && x < iw && y >= 0 && y < ih / 3)
    {
        ptr_dst[numChannels*  (iw*y + x) + 0] = ptr_source[numChannels*  (iw*y + x) + 0];
        ptr_dst[numChannels*  (iw*y + x) + 1] = ptr_source[numChannels*  (iw*y + x) + 1];
        ptr_dst[numChannels*  (iw*y + x) + 2] = ptr_source[numChannels*  (iw*y + x) + 2];

    }
}

void callMultiStreamingCudaKernel(unsigned char *dev_src, unsigned char *dev_dst, int numChannels, int iw, int ih, cudaStream_t *ptr_stream)
{

    dim3 numOfBlocks((iw / 20), (ih / 20)); //DON'T multiply by 3 because we have 1/3 data of image
    dim3 numOfThreadsPerBlocks(20, 20);
    multiStream_ColorTransformation_kernel << <numOfBlocks, numOfThreadsPerBlocks, 0, *ptr_stream >> >(numChannels, iw, ih, dev_src, dev_dst);

    return;
}

int main()
{

    cudaStream_t stream_one;
    cudaStream_t stream_two;
    cudaStream_t stream_three;

    cudaStreamCreate(&stream_one);
    cudaStreamCreate(&stream_two);
    cudaStreamCreate(&stream_three);

    Mat image = imread("DijSDK_test_image.jpg", 1);
    //Mat image(1080, 1920, CV_8UC3, Scalar(0,0,255));
    size_t numBytes = image.rows * image.cols * 3;
    int numChannels = 3;

    int iw = image.rows;
    int ih = image.cols;
    size_t totalMemSize = numBytes * sizeof(unsigned char);
    size_t oneThirdMemSize = totalMemSize / 3;

    unsigned char *dev_src_1, *dev_src_2, *dev_src_3, *dev_dst_1, *dev_dst_2, *dev_dst_3, *h_src, *h_dst;


    //Allocate memomry at device for SOURCE and DESTINATION and get their pointers
    cudaMalloc((void**)&dev_src_1, (totalMemSize) / 3);
    cudaMalloc((void**)&dev_src_2, (totalMemSize) / 3);
    cudaMalloc((void**)&dev_src_3, (totalMemSize) / 3);
    cudaMalloc((void**)&dev_dst_1, (totalMemSize) / 3);
    cudaMalloc((void**)&dev_dst_2, (totalMemSize) / 3);
    cudaMalloc((void**)&dev_dst_3, (totalMemSize) / 3);

    //Get the processed image 
    Mat org_dijSDK_img(image.rows, image.cols, CV_8UC3, Scalar(0, 0, 255));
    h_dst = org_dijSDK_img.data;

    //while (1)
    for (int i = 0; i < 3; i++)
    {
        std::cout << "\nLoop: " << i;

        //copy new data of image to the host pointer
        h_src = image.data;

        //Copy the source image to the device i.e. GPU
        cudaMemcpyAsync(dev_src_1, h_src, (totalMemSize) / 3, cudaMemcpyHostToDevice, stream_one);
        //KERNEL--stream-1
        callMultiStreamingCudaKernel(dev_src_1, dev_dst_1, numChannels, iw, ih, &stream_one);


        //Copy the source image to the device i.e. GPU
        cudaMemcpyAsync(dev_src_2, h_src + oneThirdMemSize, (totalMemSize) / 3, cudaMemcpyHostToDevice, stream_two);
        //KERNEL--stream-2
        callMultiStreamingCudaKernel(dev_src_2, dev_dst_2, numChannels, iw, ih, &stream_two);

        //Copy the source image to the device i.e. GPU
        cudaMemcpyAsync(dev_src_3, h_src + (2 * oneThirdMemSize), (totalMemSize) / 3, cudaMemcpyHostToDevice, stream_three);
        //KERNEL--stream-3
        callMultiStreamingCudaKernel(dev_src_3, dev_dst_3, numChannels, iw, ih, &stream_three);


        //RESULT copy: GPU to CPU
        cudaMemcpyAsync(h_dst, dev_dst_1, (totalMemSize) / 3, cudaMemcpyDeviceToHost, stream_one);
        cudaMemcpyAsync(h_dst + oneThirdMemSize, dev_dst_2, (totalMemSize) / 3, cudaMemcpyDeviceToHost, stream_two);
        cudaMemcpyAsync(h_dst + (2 * oneThirdMemSize), dev_dst_3, (totalMemSize) / 3, cudaMemcpyDeviceToHost, stream_three);

        // wait for results 
        cudaStreamSynchronize(stream_one);
        cudaStreamSynchronize(stream_two);
        cudaStreamSynchronize(stream_three);

        //Assign the processed data to the display image.
        org_dijSDK_img.data = h_dst;
        //DISPLAY PROCESSED IMAGE           
        imshow("Processed dijSDK image", org_dijSDK_img);
        waitKey(33);
    }

    cudaDeviceReset();
    return 0;
}

UPDATE-1: If I remove the kernel call of the first thread, then the second kernel and the second version of the third thread overlap somehow (not completely) as shown below.

UPDATE-2 I even tried using 10 threads and everything remained the same. The first processing of the thread core started only after the H2D copy of the tenth thread's data.

+3

c ++ opencv cuda

skm 16 jul. '15 at 9:11

source to share

1 answer

ms · Accepted Answer · 2015-07-16T13:59:39+0000

As the commenters have already pointed out, the host memory must be page locked .

No need to allocate additional host memory via cudaHostAlloc

, you can use cudaHostRegister

on an existing OpenCV image:

cudaHostRegister(image.data, totalMemSize, cudaHostRegisterPortable)

Why aren't cudaMemcpyAsync (host to device) and CUDA core parallel?

More articles: