OpenCL Cholesky decomposition
I have implemented the following Cholesky decomposition algorithm using OpenCL. The code exhibits nondeterministic behavior — it produces different output on different runs. Can someone please help me figure out what is wrong with my implementation?
Here's the algorithm:
procedure CHOLESKY(A)
int i, j, k;
for k := 0 to n − 1 do /* 1st loop */
/* Obtain the square root of the diagonal element. */
A[k, k] := sqrt(A[k, k]);
for j := k + 1 to n − 1 do /* 2nd loop */
/* The division step. */
A[k, j] := A[k, j] / A[k, k];
end for
for i := k + 1 to n − 1 do /* 3rd loop */
for j := i to n − 1 do /* 4th loop */
/* The elimination step. */
A[i, j] := A[i, j] − A[k, i] × A[k, j];
end for
end for
end for
Methodology for parallelizing the above algorithm:
From the algorithm, the elimination step is the most expensive. So I keep the outermost (1st) loop in the host code and launch the kernel once per iteration of it. Within one launch, each work-group corresponds to one iteration of the 3rd loop, so I run (n-1) - (k+1) + 1 work-groups. The number of work items in a work-group is n / 2. The 2nd loop is also computed in the kernel, but only the first work-group executes it.
CORRESPONDING HOST CODE
// Host-side outer loop (1st loop of the algorithm): one kernel launch per k.
// For a 10 x 10 matrix, MATRIX_SIZE = 10.
localWorkSize[0] = MATRIX_SIZE/2;
stride = MATRIX_SIZE/2;
for(k = 0; k < MATRIX_SIZE; k++)
{
    // One work-group per elimination row i in [k+1, MATRIX_SIZE-1] (the 3rd loop).
    int isize = (MATRIX_SIZE-1) - (k+1) + 1;
    int num_blocks = isize;
    if(num_blocks <= 0)
        num_blocks = 1; // always launch at least one group so the sqrt/division step still runs

    // The global size must be an exact multiple of the local size; use
    // MATRIX_SIZE consistently (the original mixed WA and MATRIX_SIZE here,
    // which fails with CL_INVALID_WORK_GROUP_SIZE if they ever differ).
    globalWorkSize[0] = num_blocks * (MATRIX_SIZE/2);

    // BUG FIX: the argument indices must match the kernel signature
    //   batchedCholesky(__global float *U, int k, int stride)
    // so arg 0 is the buffer, arg 1 is k, arg 2 is the stride.  The original
    // passed the int k as arg 0 where the kernel expects the cl_mem buffer.
    errcode  = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_A);
    errcode |= clSetKernelArg(clKernel, 1, sizeof(int),    (void *)&k);
    errcode |= clSetKernelArg(clKernel, 2, sizeof(int),    (void *)&stride);
    // BUG FIX: check the accumulated clSetKernelArg status here — the original
    // overwrote errcode with the enqueue result, silently dropping arg errors.
    OpenCL_CheckError(errcode, "clSetKernelArg");

    // Pass NULL for the event: the original requested an event that was never
    // waited on or released, leaking one cl_event per iteration.
    errcode = clEnqueueNDRangeKernel(clCommandQueue,
                clKernel, 1, NULL, globalWorkSize,
                localWorkSize, 0, NULL, NULL);
    OpenCL_CheckError(errcode, "clEnqueueNDRangeKernel");
    clFinish(clCommandQueue);
}
KERNEL CODE
// One launch per outer iteration k.  Work-group g is meant to handle
// elimination row i = g + k + 1; work-group 0 additionally performs the
// sqrt and division steps for pivot row k.
//
// NOTE(review): barrier(CLK_GLOBAL_MEM_FENCE) only synchronizes the work
// items WITHIN a single work-group — OpenCL has no in-kernel barrier across
// work-groups.  Work-groups other than 0 can therefore run the elimination
// step before (or while) group 0 writes the sqrt'ed diagonal and the divided
// pivot row.  That data race is what makes the output nondeterministic.
// Establishing a global order requires separate kernel launches: e.g. one
// small kernel for the sqrt/division of row k, then this elimination kernel.
__kernel void
batchedCholesky(__global float *U, int k, int stride)
{
int tx = get_global_id(0);
unsigned int j;
// MATRIX_SIZE is not defined in this file — presumably injected as a
// build-time macro via -D in the program build options; verify.
unsigned int num_rows = MATRIX_SIZE;
if(tx==0)
{
// Take the square root of the diagonal element
// (only global work item 0 — which lives in group 0 — writes U[k][k]).
U[k * num_rows + k] = sqrt(U[k * num_rows + k]);
}
// Intra-group barrier only: groups other than 0 are NOT guaranteed to
// observe the sqrt'ed diagonal after this point (see NOTE above).
barrier(CLK_GLOBAL_MEM_FENCE);
int offset = (k+1); //From original loop
// Work items of a group stride over columns in steps of `stride`
// (host passes stride = MATRIX_SIZE/2 = work-group size).
int jstart = get_local_id(0) + offset;
int jstep = stride;
int jtop = num_rows - 1;
int jbottom = (k + 1);
//Do work for this i iteration
//Division step
if(get_group_id(0) == 0)
{
// Only group 0 divides pivot row k by the diagonal.  The (j >= jbottom)
// half of the condition is always true on entry (jstart >= offset = jbottom
// and j only grows), so the loop terminates via j > jtop.
for(j = jstart; (j >= jbottom) && (j <= jtop); j+=jstep)
{
U[k * num_rows + j] /= U[k * num_rows + k]; // Division step
}
}
// Intra-group barrier only: groups other than 0 may read row k BELOW
// before group 0 has divided it above — this is the race.
barrier(CLK_GLOBAL_MEM_FENCE);
j = 0;
// Elimination step: this group's row is i; columns split across work items.
int i = get_group_id(0) + (k+1);
offset = i;
jstart = get_local_id(0) + offset;
jbottom = i;
for( j = jstart; j >= jbottom && j <= jtop; j += jstep)
U[i * num_rows + j] -= U[k * num_rows + i] * U[k * num_rows + j];
// Final barrier has no effect on other groups either; kernel end already
// orders these writes with respect to subsequent launches.
barrier(CLK_GLOBAL_MEM_FENCE);
}
Not all of your work items execute at the same time — they can be scheduled in batches. This means code running before your CLK_GLOBAL_MEM_FENCE barrier will not necessarily have produced every value yet, because that barrier only synchronizes within a single work-group, not across work-groups. This could be the source of your errors.
If you require global synchronization, use multiple kernel launches instead.