OpenCL Cholesky decomposition
I have implemented the following Cholesky decomposition algorithm using OpenCL. The code demonstrates random behavior. It prints out processor output several times. Can someone please help me figure out what is wrong with my implementation.
Here's the algorithm:
procedure CHOLESKY(A)
int i, j, k;
for k := 0 to n β 1 do /* 1st loop */
/* Obtain the square root of the diagonal element. */
A[k, k] := A[k, k];
for j := k + 1 to n β 1 do /* 2nd loop */
/* The division step. */
A[k, j] := A[k, j]/A[k, k];
end for
for i := k + 1 to n β 1 do /* 3rd loop */
for j := i to n β 1 do /* 4th loop */
/* The elimination step. */
A[i, j] := A[i, j] - A[k, i] Γ A[k, j];
end for
end for
end for
Methodology for parallelizing the above algorithm:
From the algorithm, the elimination step is the most expensive. So I have the outermost loop in the main code and I call the kernel in the loop. One kernel start basically corresponds to one iteration of the 3rd loop. So I run (n-1) - (k + 1) + 1 workgroups. The number of work items in a workgroup is n / 2. The 2nd cycle is also computed in the kernel, but I only allow the first workgroup.
CORRESPONDING RUNNING CODE
// for a 10 X 10 matrix, MATRIX_SIZE = 10
localWorkSize[0] = MATRIX_SIZE/2;
stride = MATRIX_SIZE/2;
cl_event event;
for(k = 0; k < MATRIX_SIZE; k++)
{
int isize = (MATRIX_SIZE-1) - (k+1) + 1;
int num_blocks = isize;
if(num_blocks <= 0)
num_blocks = 1;
globalWorkSize[0] = num_blocks * WA/2;
errcode = clSetKernelArg(clKernel, 0, sizeof(int), (void *)&k);
errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
errcode |= clSetKernelArg(clKernel, 2, sizeof(int), (void *)&stride);
errcode = clEnqueueNDRangeKernel(clCommandQueue,
clKernel, 1, NULL, globalWorkSize,
localWorkSize, 0, NULL, &event);
OpenCL_CheckError(errcode, "clEnqueueNDRangeKernel");
clFinish(clCommandQueue);
}
TELEPHONE CODE
__kernel void
batchedCholesky(__global float *U, int k, int stride)
{
int tx = get_global_id(0);
unsigned int j;
unsigned int num_rows = MATRIX_SIZE;
if(tx==0)
{
// Take the square root of the diagonal element
U[k * num_rows + k] = sqrt(U[k * num_rows + k]);
}
barrier(CLK_GLOBAL_MEM_FENCE);
int offset = (k+1); //From original loop
int jstart = get_local_id(0) + offset;
int jstep = stride;
int jtop = num_rows - 1;
int jbottom = (k + 1);
//Do work for this i iteration
//Division step
if(get_group_id(0) == 0)
{
for(j = jstart; (j >= jbottom) && (j <= jtop); j+=jstep)
{
U[k * num_rows + j] /= U[k * num_rows + k]; // Division step
}
}
barrier(CLK_GLOBAL_MEM_FENCE);
j = 0;
int i = get_group_id(0) + (k+1);
offset = i;
jstart = get_local_id(0) + offset;
jbottom = i;
for( j = jstart; j >= jbottom && j <= jtop; j += jstep)
U[i * num_rows + j] -= U[k * num_rows + i] * U[k * num_rows + j];
barrier(CLK_GLOBAL_MEM_FENCE);
}
+3
source to share