How to combine a row-filtering kernel and a column-filtering kernel into one OpenCL kernel
I have an image processing filter implemented in OpenCL as separate row and column passes, where most of the processing time is lost in launching multiple kernels.
So, can I combine these two kernels into one kernel that performs the same function and runs faster on an Intel HD 4600 graphics card? The details of the code are given below:
Assumptions:
1. Both horizontal and vertical padding are done on the host (in C).
2. N (the filter length) is 8, width and height are 1024 x 1024, and the filter coefficients are generated using common filters.
3. First the row kernel and then the column kernel is launched using the API call below:
ret |= clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_ws /* 1024 x 1024 */, NULL, 0, NULL, NULL);
//Code:
/*
 * Row pass of the separable filter.
 *
 * Each work-item (x = global id 0, y = global id 1) writes one output sample:
 * the sum of 2*N+1 horizontally adjacent input samples weighted by W[0..2*N].
 *
 *   ip_img         input image, read at offsets -N..+N around the current sample
 *   op_img         output image, written at the same index as the current sample
 *   width, height  not used by this kernel
 *   pitch          distance, in floats, between the starts of consecutive rows
 *   N              filter half-width; indexing is shifted by N rows and N samples,
 *                  presumably to skip the padding the host adds (TODO confirm)
 *   W              filter coefficients, 2*N+1 entries are read
 */
__kernel void filter_rows(__global float *ip_img, __global float *op_img,
                          int width, int height, int pitch, int N,
                          __constant float *W)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    /* Index of the current sample: N rows and N samples past (x, y). */
    int centre = y * pitch + x + (N * pitch + N);

    float sum = 0.0f;
    for (int tap = 0; tap <= 2 * N; ++tap) {
        /* tap - N walks from -N to +N along the row, matching W[tap]. */
        sum += ip_img[centre + (tap - N)] * W[tap];
    }

    op_img[centre] = sum;
}
/*
 * Column pass of the separable filter.
 *
 * Each work-item (x = global id 0, y = global id 1) writes one output sample:
 * the sum of 2*N+1 vertically adjacent input samples (one per row, `pitch`
 * floats apart) weighted by W[0..2*N].
 *
 *   ip_img         input image, read at row offsets -N..+N around the current sample
 *   op_img         output image, written at the same index as the current sample
 *   width, height  not used by this kernel
 *   pitch          distance, in floats, between the starts of consecutive rows
 *   N              filter half-width; indexing is shifted by N rows and N samples,
 *                  presumably to skip the padding the host adds (TODO confirm)
 *   W              filter coefficients, 2*N+1 entries are read
 */
__kernel void filter_col(__global float *ip_img, __global float *op_img, int width,
                         int height, int pitch, int N, __constant float *W)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    /* Index of the current sample: N rows and N samples past (x, y). */
    int centre = y * pitch + x + (N * pitch + N);

    float sum = 0.0f;
    for (int tap = 0; tap <= 2 * N; ++tap) {
        /* tap - N walks from -N to +N rows relative to the current one. */
        sum += ip_img[(tap - N) * pitch + centre] * W[tap];
    }

    op_img[centre] = sum;
}
/*
 * Horizontal padding step (placeholder: the implementation is omitted here).
 *
 *   ip_img    image buffer to pad
 *   pad_leng  padding length; generic_filter passes the filter length
 */
void padd_hor(float *ip_img, int pad_leng)
{
//...using simple C programming
}
/*
 * Vertical padding step (placeholder: the implementation is omitted here).
 *
 *   ip_img    image buffer to pad
 *   pad_leng  padding length; generic_filter passes the filter length
 */
void padd_ver(float *ip_img, int pad_leng)
{
//...using simple c programming
}
/*
 * Sketch of the complete two-pass filter: pad horizontally, filter the rows
 * of in_image into temp_image, pad temp_image vertically, then filter its
 * columns into out_image.
 *
 *   in_image    source image
 *   out_image   final filtered image
 *   temp_image  intermediate buffer holding the row-filtered image
 *   width, height, pitch  forwarded unchanged to both filter passes
 *   N           filter length, used for both padding and filtering
 *   Wr, Wc      coefficients for the row pass and the column pass
 *
 * NOTE(review): padd_hor/padd_ver take a plain `float *` while the images
 * here are `__global float *`; this reads as pseudo-code for the host-driven
 * sequence rather than compilable device code -- confirm before reuse.
 */
void generic_filter(__global float *in_image, __global float *out_image,
                    __global float *temp_image, int width, int height, int pitch, int N,
                    __constant float *Wr, __constant float *Wc)
{
    padd_hor(in_image, N);
    filter_rows(in_image, temp_image, width, height, pitch, N, Wr);
    padd_ver(temp_image, N);
    filter_col(temp_image, out_image, width, height, pitch, N, Wc);
}
/*
 * Placeholder for the requested single kernel that should produce the same
 * result as the row pass followed by the column pass.
 *
 *   in_image    source image
 *   out_image   final filtered image
 *   temp_image  intermediate buffer
 *   width, height, pitch, N  same meaning as in the separate filter kernels
 *   Wr, Wc      coefficients for the row pass and the column pass
 *
 * NOTE(review): this kernel shares its name with the plain generic_filter
 * function; one of the two must be renamed before both can be compiled
 * in the same program.
 */
__kernel void generic_filter(__global float *in_image, __global float *out_image,
                             __global float *temp_image,
                             int width, int height, int pitch, int N,
                             __constant float *Wr, __constant float *Wc)
{
    // ... here I need your suggestion to implement the kernel which does the same as generic_filter
}
Your help in optimizing this filter for the best possible performance would be appreciated.
Also, please let me know what maximum speed-up can be expected compared with C code running on an Intel processor.
Thanks and Regards
Vijayky88
source to share
No one has answered this question yet
Check out similar questions: