OpenCL Cholesky decomposition

I have implemented the following Cholesky decomposition algorithm using OpenCL. The code behaves non-deterministically: it prints different output on different runs. Can someone please help me figure out what is wrong with my implementation?

Here's the algorithm:

procedure CHOLESKY(A)
    int i, j, k;
    for k := 0 to n - 1 do /* 1st loop */
        /* Obtain the square root of the diagonal element. */
        A[k, k] := sqrt(A[k, k]);

        for j := k + 1 to n - 1 do /* 2nd loop */
            /* The division step. */
            A[k, j] := A[k, j] / A[k, k];
        end for

        for i := k + 1 to n - 1 do /* 3rd loop */
            for j := i to n - 1 do /* 4th loop */
                /* The elimination step. */
                A[i, j] := A[i, j] - A[k, i] × A[k, j];
            end for
        end for
    end for
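
For reference, the pseudocode translates to the following straightforward serial C routine (a sketch of my own, assuming A is stored row-major with dimension n; the result is left in the upper triangle). I use it to check the OpenCL output on the host:

#include <math.h>

/* Serial reference for the pseudocode above: row-major storage,
 * result left in the upper triangle of A. */
void cholesky_serial(float *A, int n)
{
    int i, j, k;
    for (k = 0; k < n; k++) {
        /* Square root of the diagonal element. */
        A[k * n + k] = sqrtf(A[k * n + k]);

        /* Division step. */
        for (j = k + 1; j < n; j++)
            A[k * n + j] /= A[k * n + k];

        /* Elimination step. */
        for (i = k + 1; i < n; i++)
            for (j = i; j < n; j++)
                A[i * n + j] -= A[k * n + i] * A[k * n + j];
    }
}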


Methodology for parallelizing the above algorithm:

From the algorithm, the elimination step is the most expensive. So I keep the outermost loop (over k) in the host code and call the kernel inside that loop. One kernel launch then covers the 3rd loop, with each workgroup handling one of its iterations (one value of i), so I launch (n-1) - (k+1) + 1 workgroups. The number of work items per workgroup is n / 2, and each work item strides over the j values of the 4th loop. The 2nd loop (the division step) is also computed in the kernel, but I only allow the first workgroup to execute it.
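
For example, with MATRIX_SIZE = 10 and k = 0, this gives isize = (10 - 1) - (0 + 1) + 1 = 9 workgroups with 5 work items each, i.e. a global work size of 45 (assuming WA is the matrix width, here 10).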

CORRESPONDING HOST CODE

// for a 10 X 10 matrix, MATRIX_SIZE = 10
localWorkSize[0] = MATRIX_SIZE/2;
stride = MATRIX_SIZE/2;
cl_event             event;

for(k = 0; k < MATRIX_SIZE; k++)
{

    int isize = (MATRIX_SIZE-1) - (k+1) + 1;
    int num_blocks = isize;
    if(num_blocks <= 0)
            num_blocks = 1;
    globalWorkSize[0] = num_blocks * WA/2; // WA: width of matrix A (equal to MATRIX_SIZE here)

    errcode = clSetKernelArg(clKernel, 0, sizeof(int), (void *)&k);
    errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
    errcode |= clSetKernelArg(clKernel, 2, sizeof(int), (void *)&stride);

    errcode = clEnqueueNDRangeKernel(clCommandQueue,
          clKernel, 1, NULL, globalWorkSize,
          localWorkSize, 0, NULL, &event);

    OpenCL_CheckError(errcode, "clEnqueueNDRangeKernel");
    clFinish(clCommandQueue);
}


KERNEL CODE

__kernel void
batchedCholesky(__global float *U, int k, int stride)
{

    int tx = get_global_id(0);
    unsigned int j;
    unsigned int num_rows = MATRIX_SIZE;

    if(tx==0)
    {
            // Take the square root of the diagonal element
            U[k * num_rows + k] = sqrt(U[k * num_rows + k]);
    }
    barrier(CLK_GLOBAL_MEM_FENCE);

    int offset = (k+1); //From original loop
    int jstart = get_local_id(0) + offset;
    int jstep = stride;
    int jtop = num_rows - 1;

    int jbottom = (k + 1);
    //Do work for this i iteration
    //Division step
    if(get_group_id(0) == 0)
    {
            for(j = jstart; (j >= jbottom) && (j <= jtop); j+=jstep)
            {
                    U[k * num_rows + j] /= U[k * num_rows + k]; // Division step
            }
    }
    barrier(CLK_GLOBAL_MEM_FENCE);

    // Elimination step: this workgroup handles row i = get_group_id(0) + (k+1)
    j = 0;
    int i = get_group_id(0) + (k+1);
    offset = i;
    jstart = get_local_id(0) + offset;
    jbottom = i;

    for(j = jstart; j >= jbottom && j <= jtop; j += jstep)
            U[i * num_rows + j] -= U[k * num_rows + i] * U[k * num_rows + j];

    barrier(CLK_GLOBAL_MEM_FENCE);
}




1 answer


Not all of your work items execute at the same time; they can be executed in batches. Moreover, barrier(CLK_GLOBAL_MEM_FENCE) only synchronizes the work items within a single workgroup, not across workgroups. So the code that runs before the barrier will not have been executed by every work item, and not every value will have been updated. This is most likely the source of your errors.



If you require global synchronization, use multiple kernel launches.
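
As an illustration only (not part of the original answer), the host loop could be split like this, assuming two hypothetical kernels divKernel and elimKernel obtained by cutting the original kernel at the barrier:

// Sketch only: divKernel / elimKernel are hypothetical kernels made by
// splitting the original kernel at the barrier; error checking omitted.
for(k = 0; k < MATRIX_SIZE; k++)
{
    // 1st launch: square root of the diagonal element + division step (2nd loop)
    clSetKernelArg(divKernel, 0, sizeof(int), (void *)&k);
    clSetKernelArg(divKernel, 1, sizeof(cl_mem), (void *)&d_A);
    clSetKernelArg(divKernel, 2, sizeof(int), (void *)&stride);
    clEnqueueNDRangeKernel(clCommandQueue, divKernel, 1, NULL,
          globalWorkSize, localWorkSize, 0, NULL, NULL);

    // 2nd launch: elimination step (3rd/4th loops). On a default in-order
    // queue it only starts after the 1st launch has finished, so every work
    // item sees the updated row k.
    clSetKernelArg(elimKernel, 0, sizeof(int), (void *)&k);
    clSetKernelArg(elimKernel, 1, sizeof(cl_mem), (void *)&d_A);
    clSetKernelArg(elimKernel, 2, sizeof(int), (void *)&stride);
    clEnqueueNDRangeKernel(clCommandQueue, elimKernel, 1, NULL,
          globalWorkSize, localWorkSize, 0, NULL, NULL);
}
clFinish(clCommandQueue);

Because a default command queue executes kernels in order, the second launch sees all writes made by the first, which provides the global synchronization that a single in-kernel barrier cannot.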
