Parallelizing code with openmp

The functions below contain nested for loops. There are 3. I have given the whole function below for easy understanding. I want to parallelize the code in the inner loop as it takes the maximum CPU time. Then I can think of outer 2 for the loops. I see dependencies and internal inline functions in the innermost loop. Is it possible to rewrite the internal notation for the loop to allow parallelization using openmp pragmas. Tell me, please. I only write the loop that interests me first and then the complete function where that loop exists for reference.

Interested in parallelizing the loop mentioned below.

//* LOOP WHICH I WANT TO PARALLELIZE *//

 for (y = 0; y < 4; y++)    
 {
  refptr = PelYline_11 (ref_pic, abs_y++, abs_x, img_height, img_width);

  LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];

  LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];

  LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];

  LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
 }

      

The complete function this loop exists in is below for reference. / *!

***********************************************************************

* \brief

* Setup the fast search for an macroblock

***********************************************************************

*/

void SetupFastFullPelSearch (short ref, int list) // <-- reference frame parameter, list0 or 1

{

short pmv[2];
    pel_t orig_blocks[256], *orgptr=orig_blocks, *refptr, *tem; // created pointer tem
    int offset_x, offset_y, x, y, range_partly_outside, ref_x, ref_y, pos, abs_x, abs_y, bindex, blky;
    int LineSadBlk0, LineSadBlk1, LineSadBlk2, LineSadBlk3;
    int max_width, max_height;
    int img_width, img_height;
    StorablePicture *ref_picture;
    pel_t *ref_pic;


int** block_sad = BlockSAD[list][ref][7];
int search_range = max_search_range[list][ref];
int max_pos = (2*search_range+1) * (2*search_range+1);

int list_offset = ((img->MbaffFrameFlag)&&(img->mb_data[img->current_mb_nr].mb_field))? img->current_mb_nr%2 ? 4 : 2 : 0;

int apply_weights = ( (active_pps->weighted_pred_flag && (img->type == P_SLICE || img->type == SP_SLICE)) ||
(active_pps->weighted_bipred_idc && (img->type == B_SLICE)));

ref_picture = listX[list+list_offset][ref];

//===== Use weighted Reference for ME ====

if (apply_weights && input->UseWeightedReferenceME)

ref_pic = ref_picture->imgY_11_w;

else

ref_pic = ref_picture->imgY_11;


max_width = ref_picture->size_x - 17;
max_height = ref_picture->size_y - 17;
img_width = ref_picture->size_x;
img_height = ref_picture->size_y;

//===== get search center: predictor of 16x16 block =====

SetMotionVectorPredictor (pmv, enc_picture->ref_idx, enc_picture->mv, ref, list, 0, 0, 16, 16);

search_center_x[list][ref] = pmv[0] / 4;
search_center_y[list][ref] = pmv[1] / 4;

if (!input->rdopt)
{

//--- correct center so that (0,0) vector is inside ---

search_center_x[list][ref] = max(-search_range, min(search_range, search_center_x[list][ref]));

search_center_y[list][ref] = max(-search_range, min(search_range, search_center_y[list][ref]));

}

search_center_x[list][ref] += img->opix_x;
search_center_y[list][ref] += img->opix_y;
offset_x = search_center_x[list][ref];
offset_y = search_center_y[list][ref];

//===== copy original block for fast access =====

for (y = img->opix_y; y < img->opix_y+16; y++)
for (x = img->opix_x; x < img->opix_x+16; x++)

*orgptr++ = imgY_org [y][x];

//===== check if whole search range is inside image =====

if (offset_x >= search_range && offset_x <= max_width - search_range &&

offset_y >= search_range && offset_y <= max_height - search_range )

{
 range_partly_outside = 0; PelYline_11 = FastLine16Y_11;
}

else
{
 range_partly_outside = 1;
}

//===== determine position of (0,0)-vector =====

if (!input->rdopt)

{

ref_x = img->opix_x - offset_x;

ref_y = img->opix_y - offset_y;

for (pos = 0; pos < max_pos; pos++)
{
 if (ref_x == spiral_search_x[pos] &&

ref_y == spiral_search_y[pos])
  {
   pos_00[list][ref] = pos;
   break;
  }

}

  }



//===== loop over search range (spiral search): get blockwise SAD =====
**// =====THIS IS THE PART WHERE NESTED FOR STARTS=====**

for (pos = 0; pos < max_pos; pos++) // OUTERMOST FOR LOOP

{
abs_y = offset_y + spiral_search_y[pos];
abs_x = offset_x + spiral_search_x[pos];

if (range_partly_outside)
{
  if (abs_y >= 0 && abs_y <= max_height && abs_x >= 0 && abs_x <= max_width )
     {
      PelYline_11 = FastLine16Y_11;
     }

  else
     {
      PelYline_11 = UMVLine16Y_11;
     }
 }

orgptr = orig_blocks;
bindex = 0;

for (blky = 0; blky < 4; blky++)    // SECOND FOR LOOP
{
 LineSadBlk0 = LineSadBlk1 = LineSadBlk2 = LineSadBlk3 = 0;

   for (y = 0; y < 4; y++)    //INNERMOST FOR LOOP WHICH I WANT TO PARALLELIZE
   {
     refptr = PelYline_11 (ref_pic, abs_y++, abs_x, img_height, img_width);

      LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];

      LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];

      LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];

      LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
     }

    block_sad[bindex++][pos] = LineSadBlk0;
    block_sad[bindex++][pos] = LineSadBlk1;
    block_sad[bindex++][pos] = LineSadBlk2;
    block_sad[bindex++][pos] = LineSadBlk3;

  }

 }


//===== combine SAD for larger block types =====

SetupLargerBlocks (list, ref, max_pos);

//===== set flag marking that search setup have been done =====

search_setup_done[list][ref] = 1;

}

#endif // _FAST_FULL_ME_

      


I rewrote the code to try and resolve dependencies in the innermost loop, i.e. for (y = 0; y <4; y ++) and many lines of LineSadBlk. Please comment if this is wrong. I think refptr and orgptr are parsed by this and dependencies are resolved, but LineSadBlk0,1,2,3 still have dependencies, as if we were running the first and second iterations in parallel, the LineSadBlk0,1,2,3 value would be taken by the thread. How to solve this please.

/*!
***********************************************************************
* \brief
*    Setup the fast search for an macroblock
***********************************************************************
*/
void SetupFastFullPelSearch (short ref, int list)  // <--  reference frame parameter,   list0 or 1
{
 short   pmv[2];
 pel_t   orig_blocks[256];
 //pel_t   *orgptr, *refptr[4];
 pel_t *orgptr[4],*refptr[4];  //defined by me new
 int     offset_x, offset_y, x, y, range_partly_outside, ref_x, ref_y, pos, abs_x, abs_y, bindex, blky;
 int     LineSadBlk0, LineSadBlk1, LineSadBlk2, LineSadBlk3;
 int     max_width, max_height;
 int     img_width, img_height;

 StorablePicture *ref_picture;
 pel_t   *ref_pic;

 int**   block_sad     = BlockSAD[list][ref][7];
 int     search_range  = max_search_range[list][ref];
 int     max_pos       = (2*search_range+1) * (2*search_range+1);

 int     list_offset   = ((img->MbaffFrameFlag)&&(img->mb_data[img->current_mb_nr].mb_field))? img->current_mb_nr%2 ? 4 : 2 : 0;

 int     apply_weights = ( (active_pps->weighted_pred_flag && (img->type == P_SLICE || img->type == SP_SLICE)) ||
                        (active_pps->weighted_bipred_idc && (img->type == B_SLICE)));


 ref_picture     = listX[list+list_offset][ref];

 //===== Use weighted Reference for ME ====
 if (apply_weights && input->UseWeightedReferenceME)
 ref_pic       = ref_picture->imgY_11_w;
 else
 ref_pic       = ref_picture->imgY_11;

 max_width     = ref_picture->size_x - 17;
 max_height    = ref_picture->size_y - 17;

 img_width     = ref_picture->size_x;
 img_height    = ref_picture->size_y;

 //===== get search center: predictor of 16x16 block =====
 SetMotionVectorPredictor (pmv, enc_picture->ref_idx, enc_picture->mv, ref, list, 0, 0, 16, 16);        //call 1
 search_center_x[list][ref] = pmv[0] / 4;
 search_center_y[list][ref] = pmv[1] / 4;

 if (!input->rdopt)
 {
  //--- correct center so that (0,0) vector is inside ---
  search_center_x[list][ref] = max(-search_range, min(search_range, search_center_x[list][ref]));
  search_center_y[list][ref] = max(-search_range, min(search_range,    search_center_y[list][ref]));
  }

  search_center_x[list][ref] += img->opix_x;
  search_center_y[list][ref] += img->opix_y;

  offset_x = search_center_x[list][ref];
  offset_y = search_center_y[list][ref];


// orgptr=orig_blocks;
orgptr[0]= orig_blocks          //all org pointers defined orig blocks
orgptr[1]= orig_blocks;
orgptr[2]= orig_blocks;
orgptr[3]= orig_blocks;



   //===== copy original block for fast access =====
  for   (y = img->opix_y; y < img->opix_y+16; y++)
  for (x = img->opix_x; x < img->opix_x+16; x++)
{         
//*orgptr++ = imgY_org [y][x];
*(orgptr[0])++ = imgY_org [y][x];                   // img stored in all orgptr
*(orgptr[1])++ = imgY_org [y][x];
*(orgptr[2])++ = imgY_org [y][x];
*(orgptr[3])++ = imgY_org [y][x];
}

   //===== check if whole search range is inside image =====
   if (offset_x >= search_range && offset_x <= max_width  - search_range &&
  offset_y >= search_range && offset_y <= max_height - search_range   )
   {
    range_partly_outside = 0; PelYline_11 = FastLine16Y_11;     //search range is fully inside image
    }
   else
   {
    range_partly_outside        //search range is partly outside image
    }

     //===== determine position of (0,0)-vector =====
   if (!input->rdopt)
   {
    ref_x = img->opix_x - offset_x;
    ref_y = img->opix_y - offset_y;

    for (pos = 0; pos < max_pos; pos++)
    {
     if (ref_x == spiral_search_x[pos] &&
      ref_y == spiral_search_y[pos])
      {
       pos_00[list][ref] = pos;
       break;
      }
    }
   }

   //===== loop over search range (spiral search): get blockwise SAD =====
  for (pos = 0; pos < max_pos; pos++)
  {
   abs_y = offset_y + spiral_search_y[pos];
   abs_x = offset_x + spiral_search_x[pos];

    if (range_partly_outside)
    {
     if (abs_y >= 0 && abs_y <= max_height &&
      abs_x >= 0 && abs_x <= max_width    )
      {
       PelYline_11 = FastLine16Y_11;            //call 2
      }
      else
      {
       PelYline_11 = UMVLine16Y_11;                     //call 3
      }
     }

    //orgptr=orig_blocks;
      orgptr[0]=orig_blocks;
  orgptr[1]=orgptr[0]+16;
  orgptr[2]=orgptr[1]+16;
  orgptr[3]=orgptr[2]+16;
      bindex = 0;


    for (blky = 0; blky < 4; blky++)
    {
      LineSadBlk0 = LineSadBlk1 = LineSadBlk2 = LineSadBlk3 = 0;

       // i added the following to take refptr out of loop
  refptr[0] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width);           //call either 2 or 3
  abs_y++;
  refptr[1] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width);           //call either 2 or 3
   abs_y++;
  refptr[2] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width);           //call either 2 or 3
   abs_y++;
   refptr[3] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width);          //call either 2 or 3
   abs_y++;

omp_set_num_threads(4);
#pragma omp parallel for reduction(+:LineSadBlk0,LineSadBlk1,LineSadBlk2,LineSadBlk3)
     for (y = 0; y < 4; y++)
     {

{
LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    }


    {
    LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
}


    {
    LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
}


    {
    LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
}
  }

  }


  block_sad[bindex++][pos] = LineSadBlk0;
  block_sad[bindex++][pos] = LineSadBlk1;
  block_sad[bindex++][pos] = LineSadBlk2;
  block_sad[bindex++][pos] = LineSadBlk3;

  }
 }





 //===== combine SAD for larger block types =====
  SetupLargerBlocks (list, ref, max_pos);                          //call4


  //===== set flag marking that search setup have been done =====
  search_setup_done[list][ref] = 1;
  }
  #endif // _FAST_FULL_ME_

      

+2


source to share





All Articles