Issue BINning results with OpenACC

Hi Support,

Hope you’re all well :-)

I’m having an issue using OpenACC to bin results. I want to use OpenACC to copy in my “bins” array to the device, increment the appropriate bins and then copy the bins back to host and print the number of hits in each bin. To do this I’ve modified the vector addition code to increment 2 bins (one for when the resultant vector array has values <= 0.2 and the other for when results are >= 0.8). The code is attached below.

On compilation, the code seems to compile but with the warning on line 17 (the line where the bins FOR loop begins):
Complex Loop carried dependence of ‘*(bins)’ prevents parallelization. Loop carried dependence due to exposed use of bins[0:2] prevents parallelization.

I thought this was a matter of * restricting the bins pointer but the problem still persists after this. I was wondering if you had any input on what could be the cause of this? Any assistance would be greatly appreciated.

Thank you and kind regards,

Paul

//Vector Addition using OpenACC

// Include header files
#include <accelmath.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Create Function Launch Kernel on GPU. 
void vecaddGPU( float *restrict r, float *a, float *b, int n, int *restrict bins[2]){
	#pragma acc kernels loop copyin(a[0:n],b[0:n]) copyout(r[0:n])
	for( int i = 0; i < n; ++i ){
		r[i] = a[i] + b[i];
	}
	// extra loop to bin array values
	#pragma acc kernels loop copy(bins[2])
	for( int i = 0; i < n; ++i ){
		if( r[i] <= 0.2 ){
			bins[0] = bins[0]++; //Test Floor fuction
		}
		else if ( r[i] >= 0.8 ){
			bins[1] = bins[1]++;
		}
	}	
}

int main(){
	// Declare variables
	int n=0; /* MAXIMUM vector length is n = 80000000 before out of memory*/
	float * a; /* input vector 1 */
	float * b; /* input vector 2 */
	float * r; /* output vector */
	int *bins[2] = {0}; //-------------------------------------------------->>>>>bin array for GPU
	int i;
	
	//Select number of iterations to evaluate
	printf("Enter the number of iterations: ");
	scanf("%llu", &n);
	
	// Step 6: Allocate memory for host vectors
	a = (float*)malloc( n*sizeof(float) );
	b = (float*)malloc( n*sizeof(float) );
	r = (float*)malloc( n*sizeof(float) );
	bins[2] = (int*)malloc(sizeof(int));//-------------------------------------------------->>>>>bin array memory allocated

	// Seed random numbers.
	srand(1);
	
	// Initialise Vectors to have random numbers. 
	for( i = 0; i < n; ++i ){
		a[i] = (float)rand()/(float)RAND_MAX;
		b[i] = (float)rand()/(float)RAND_MAX;
	}
	
	// Calculate resulant vector on th GPU.
	vecaddGPU( r, a, b, n, bins);//-------------------------------------------------->>>>>bin array placed into function

	// Print results
	for (i=n-11; i<n; i++){
		printf("a[%d] = %f\tb[%d] = %f\tr[%d] = %f\n",i,a[i],i,b[i],i,r[i]);
	}
	
	// Exit Program
	getch();
	
	return 0;
}

__

Hi Paul,

I thought this was a matter of * restricting the bins pointer but the problem still persists after this.

Using restrict tells the compile that pointers to different arrays don’t overlap so it doesn’t need to worry about aliasing issues. Here, there is a true dependency on the bins arrays since multiple threads could access the same elements at the same time leading to a race condition.

What you want here is to turn “bins” into two scalar reduction variables. See the code below. Note that I also updated the code to use a data region so that r isn’t being copied in between the two kernels.

Hope this helps,
Mat


% cat bins.c 
//Vector Addition using OpenACC

// Include header files
#include <accelmath.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Create Function Launch Kernel on GPU.
void vecaddGPU( float *restrict r, float *a, float *b, int n, int *restrict bins[2]){
	
   int bins0, bins1;

   #pragma acc data copyin(a[0:n],b[0:n]) copyout(r[0:n])
{
   #pragma acc kernels loop 
   for( int i = 0; i < n; ++i ){
      r[i] = a[i] + b[i];
   }
   // extra loop to bin array values
	
   bins0=0;
   bins1=0;

   #pragma acc kernels loop reduction(+:bins0,bins1)
   for( int i = 0; i < n; ++i ){
      if( r[i] <= 0.2 ){
         bins0++; //Test Floor fuction
      }
      else if ( r[i] >= 0.8 ){
         bins1++;
      }
   }
   bins[0] += bins0;
   bins[1] += bins1;
}   
}

int main(){
   // Declare variables
   int n=0; /* MAXIMUM vector length is n = 80000000 before out of memory*/
   float * a; /* input vector 1 */
   float * b; /* input vector 2 */
   float * r; /* output vector */
   int *bins[2] = {0}; //-------------------------------------------------->>>>>bin array for GPU
   int i;
   
   //Select number of iterations to evaluate
   printf("Enter the number of iterations: ");
   scanf("%llu", &n);
   
   // Step 6: Allocate memory for host vectors
   a = (float*)malloc( n*sizeof(float) );
   b = (float*)malloc( n*sizeof(float) );
   r = (float*)malloc( n*sizeof(float) );
   bins[2] = (int*)malloc(sizeof(int));//-------------------------------------------------->>>>>bin array memory allocated

   // Seed random numbers.
   srand(1);
   
   // Initialise Vectors to have random numbers.
   for( i = 0; i < n; ++i ){
      a[i] = (float)rand()/(float)RAND_MAX;
      b[i] = (float)rand()/(float)RAND_MAX;
   }
   
   // Calculate resulant vector on th GPU.
   vecaddGPU( r, a, b, n, bins);//-------------------------------------------------->>>>>bin array placed into function

   // Print results
   for (i=n-11; i<n; i++){
      printf("a[%d] = %f\tb[%d] = %f\tr[%d] = %f\n",i,a[i],i,b[i],i,r[i]);
   }
   
   // Exit Program
   //getch();
   
   return 0;
} 
% pgcc -fast -acc -Minfo=accel bins.c 
vecaddGPU:
     16, Generating copyout(r[0:n])
         Generating copyin(b[0:n])
         Generating copyin(a[0:n])
     18, Generating present_or_copyout(r[0:n])
         Generating present_or_copyin(a[0:n])
         Generating present_or_copyin(b[0:n])
         Generating NVIDIA code
         Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
         Generating compute capability 3.0 binary
     19, Loop is parallelizable
         Accelerator kernel generated
         19, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     24, Generating present_or_copyout(r[0:n])
         Generating NVIDIA code
         Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
         Generating compute capability 3.0 binary
     25, Loop is parallelizable
         Accelerator kernel generated
         25, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
% a.out 
Enter the number of iterations: 1024
a[1013] = 0.094073	b[1013] = 0.048532	r[1013] = 0.142604
a[1014] = 0.551681	b[1014] = 0.594779	r[1014] = 1.146460
a[1015] = 0.726187	b[1015] = 0.637733	r[1015] = 1.363919
a[1016] = 0.775665	b[1016] = 0.152610	r[1016] = 0.928275
a[1017] = 0.684799	b[1017] = 0.082568	r[1017] = 0.767366
a[1018] = 0.980315	b[1018] = 0.572763	r[1018] = 1.553078
a[1019] = 0.135545	b[1019] = 0.032184	r[1019] = 0.167728
a[1020] = 0.449862	b[1020] = 0.717227	r[1020] = 1.167089
a[1021] = 0.815991	b[1021] = 0.515712	r[1021] = 1.331704
a[1022] = 0.504956	b[1022] = 0.565681	r[1022] = 1.070637
a[1023] = 0.819788	b[1023] = 0.742802	r[1023] = 1.562590

Note, that the code will be more efficient if you fuse the two loops:

   #pragma acc kernels loop reduction(+:bins0,bins1)  copyin(a[0:n],b[0:n]) copyout(r[0:n])
   for( int i = 0; i < n; ++i ){
      r[i] = a[i] + b[i];
      if( r[i] <= 0.2 ){
         bins0++; //Test Floor fuction
      }
      else if ( r[i] >= 0.8 ){
         bins1++;
      }
   }

Thanks for the quick replay Mat, I really appreciate it. I have a quick question though.

I added some print statements at the bottom of your modified code to see the bin results as follows:

 // Print results 
   for (i=n-11; i<n; i++){ 
      printf("a[%d] = %f\tb[%d] = %f\tr[%d] = %f\n",i,a[i],i,b[i],i,r[i]); 
   } 
   printf("Number of values <= 0.2 is:	%d\n", bins[0]);
   printf("Number of values >= 0.8 is:	%d\n", bins[1]);

When I do this I get a number of bin hits that exceeds the total number of iterations used for the vector addition, which is incorrect, For example specifying 100 iterations gives me a bins[0] result of 8 and a bins[1] result of 272 (i.e. combined values should be less than 100).

The issue appears to be located in the #pragma region where you use and augmented assignment to assign bins0 and bins1 to their respected arrays:

   bins[0] += bins0; 
   bins[1] += bins1;

If I replace the “+=” with just a standard “=”. The results appear correct but I get a "PGC-W-0095-Typecast require for this conversion" warning at these lines.

The datatype never changes for bins and bins during computation so I was wondering why this warning appears and would it be OK to ignore? If not would you know of a workaround to avoid this?

Apologies again for the trouble. Thanks again and kind regards,

Paul

The datatype never changes for bins and bins during computation so I was wondering why this warning appears and would it be OK to ignore?

Sorry, I was focusing on the accelerator loop and missed this bug in your program. If you go back to your original program, you’ll see the same issue when compiled with gcc or pgcc. “bins” is really an pointer array but you’re using it as and array of ints. An int is a different size data type than a pointer hence the typecast. Also note that bins is a 2 element pointer arrays, but you’re mallocing bins[2] which is one off the end of the array.

If not would you know of a workaround to avoid this?

Personally, I’d just make bins an int array. I’m not sure why you’re mallocing bins[2] since it’s not used. Maybe you meant to allocate each bin element to a single int? If so, see version 2 below where you need to dereference each bin element. It’s overkill though since they can be straight ints.

Version 1 where bins is an int array

% cat bins.c 
//Vector Addition using OpenACC

// Include header files
#ifdef _OPENACC
#include <accelmath.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Create Function Launch Kernel on GPU.
void vecaddGPU( float *restrict r, float *a, float *b, int n, int *restrict bins){
	
   size_t bins0, bins1;
   bins0=0;
   bins1=0;
   #pragma acc data copyin(a[0:n],b[0:n]) copyout(r[0:n])
{
   #pragma acc kernels loop
   for( int i = 0; i < n; ++i ){
      r[i] = a[i] + b[i];
   }
   #pragma acc kernels loop reduction(+:bins0,bins1) 
   for( int i = 0; i < n; ++i ){
      if( r[i] <= 0.2 ){
         bins0++; //Test Floor fuction
      }
      else if ( r[i] >= 0.8 ){
         bins1++;
      }
   }
}
   bins[0] += bins0;
   bins[1] += bins1;
}

int main(){
   // Declare variables
   int n=0; /* MAXIMUM vector length is n = 80000000 before out of memory*/
   float * a; /* input vector 1 */
   float * b; /* input vector 2 */
   float * r; /* output vector */
   int bins[2] = {0}; //-------------------------------------------------->>>>>bin array for GPU
   int i;
   
   //Select number of iterations to evaluate
   printf("Enter the number of iterations: ");
   scanf("%llu", &n);
   // Step 6: Allocate memory for host vectors
   a = (float*)malloc( n*sizeof(float) );
   b = (float*)malloc( n*sizeof(float) );
   r = (float*)malloc( n*sizeof(float) );
//   bins[2] = (int*)malloc(sizeof(int));//-------------------------------------------------->>>>>bin array memory allocated

   // Seed random numbers.
   srand(1);
   
   // Initialise Vectors to have random numbers.
   for( i = 0; i < n; ++i ){
      a[i] = (float)rand()/(float)RAND_MAX;
      b[i] = (float)rand()/(float)RAND_MAX;
   }
   
   // Calculate resulant vector on th GPU.
   vecaddGPU( r, a, b, n, bins);//-------------------------------------------------->>>>>bin array placed into function

   // Print results
   for (i=n-11; i<n; i++){
      printf("a[%d] = %f\tb[%d] = %f\tr[%d] = %f\n",i,a[i],i,b[i],i,r[i]);
   }
   
   // Exit Program
   //getch();
  
   
   printf("Number of values <= 0.2 is:   %d\n", bins[0]);
   printf("Number of values >= 0.8 is:   %d\n", bins[1]); 
 
   return 0;
} 

% gcc bins.c -std=c99
% a.out
Enter the number of iterations: 100
a[89] = 0.913027	b[89] = 0.819695	r[89] = 1.732722
a[90] = 0.359095	b[90] = 0.552485	r[90] = 0.911580
a[91] = 0.579430	b[91] = 0.452576	r[91] = 1.032006
a[92] = 0.687387	b[92] = 0.099640	r[92] = 0.787027
a[93] = 0.530808	b[93] = 0.757294	r[93] = 1.288102
a[94] = 0.304295	b[94] = 0.992228	r[94] = 1.296524
a[95] = 0.576971	b[95] = 0.877614	r[95] = 1.454585
a[96] = 0.747809	b[96] = 0.628910	r[96] = 1.376719
a[97] = 0.035421	b[97] = 0.747803	r[97] = 0.783224
a[98] = 0.833239	b[98] = 0.925377	r[98] = 1.758615
a[99] = 0.873271	b[99] = 0.831038	r[99] = 1.704309
Number of values <= 0.2 is:   1
Number of values >= 0.8 is:   76
% pgcc bins.c -acc
% a.out
Enter the number of iterations: 100
a[89] = 0.913027	b[89] = 0.819695	r[89] = 1.732722
a[90] = 0.359095	b[90] = 0.552485	r[90] = 0.911580
a[91] = 0.579430	b[91] = 0.452576	r[91] = 1.032006
a[92] = 0.687387	b[92] = 0.099640	r[92] = 0.787027
a[93] = 0.530808	b[93] = 0.757294	r[93] = 1.288102
a[94] = 0.304295	b[94] = 0.992228	r[94] = 1.296524
a[95] = 0.576971	b[95] = 0.877614	r[95] = 1.454585
a[96] = 0.747809	b[96] = 0.628910	r[96] = 1.376719
a[97] = 0.035421	b[97] = 0.747803	r[97] = 0.783224
a[98] = 0.833239	b[98] = 0.925377	r[98] = 1.758615
a[99] = 0.873271	b[99] = 0.831038	r[99] = 1.704309
Number of values <= 0.2 is:   1
Number of values >= 0.8 is:   76

Version 2 where bins in an array of int pointers

% cat bins2.c 
//Vector Addition using OpenACC

// Include header files
#ifdef _OPENACC
#include <accelmath.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Create Function Launch Kernel on GPU.
void vecaddGPU( float *restrict r, float *a, float *b, int n, int *restrict bins[2]){
	
   size_t bins0, bins1;
   bins0=0;
   bins1=0;
   #pragma acc data copyin(a[0:n],b[0:n]) copyout(r[0:n])
{
   #pragma acc kernels loop
   for( int i = 0; i < n; ++i ){
      r[i] = a[i] + b[i];
   }
   #pragma acc kernels loop reduction(+:bins0,bins1) 
   for( int i = 0; i < n; ++i ){
      if( r[i] <= 0.2 ){
         bins0++; //Test Floor fuction
      }
      else if ( r[i] >= 0.8 ){
         bins1++;
      }
   }
}
   *bins[0] += bins0;
   *bins[1] += bins1;
}

int main(){
   // Declare variables
   int n=0; /* MAXIMUM vector length is n = 80000000 before out of memory*/
   float * a; /* input vector 1 */
   float * b; /* input vector 2 */
   float * r; /* output vector */
   int * bins[2] = {0}; //-------------------------------------------------->>>>>bin array for GPU
   int i;
   
   //Select number of iterations to evaluate
   printf("Enter the number of iterations: ");
   scanf("%llu", &n);
   // Step 6: Allocate memory for host vectors
   a = (float*)malloc( n*sizeof(float) );
   b = (float*)malloc( n*sizeof(float) );
   r = (float*)malloc( n*sizeof(float) );
   bins[0] = (int*)malloc(sizeof(int));//-------------------------------------------------->>>>>bin array memory allocated
   bins[1] = (int*)malloc(sizeof(int));//-------------------------------------------------->>>>>bin array memory allocated

   // Seed random numbers.
   srand(1);
   
   // Initialise Vectors to have random numbers.
   for( i = 0; i < n; ++i ){
      a[i] = (float)rand()/(float)RAND_MAX;
      b[i] = (float)rand()/(float)RAND_MAX;
   }
   
   // Calculate resulant vector on th GPU.
   vecaddGPU( r, a, b, n, bins);//-------------------------------------------------->>>>>bin array placed into function

   // Print results
   for (i=n-11; i<n; i++){
      printf("a[%d] = %f\tb[%d] = %f\tr[%d] = %f\n",i,a[i],i,b[i],i,r[i]);
   }
   
   // Exit Program
   //getch();
  
   
   printf("Number of values <= 0.2 is:   %d\n", *bins[0]);
   printf("Number of values >= 0.8 is:   %d\n", *bins[1]); 
 
   return 0;
} 

% pgcc bins2.c -acc ; a.out
Enter the number of iterations: 100
a[89] = 0.913027	b[89] = 0.819695	r[89] = 1.732722
a[90] = 0.359095	b[90] = 0.552485	r[90] = 0.911580
a[91] = 0.579430	b[91] = 0.452576	r[91] = 1.032006
a[92] = 0.687387	b[92] = 0.099640	r[92] = 0.787027
a[93] = 0.530808	b[93] = 0.757294	r[93] = 1.288102
a[94] = 0.304295	b[94] = 0.992228	r[94] = 1.296524
a[95] = 0.576971	b[95] = 0.877614	r[95] = 1.454585
a[96] = 0.747809	b[96] = 0.628910	r[96] = 1.376719
a[97] = 0.035421	b[97] = 0.747803	r[97] = 0.783224
a[98] = 0.833239	b[98] = 0.925377	r[98] = 1.758615
a[99] = 0.873271	b[99] = 0.831038	r[99] = 1.704309
Number of values <= 0.2 is:   1
Number of values >= 0.8 is:   76

I see what you mean. That makes perfect sense!

Thanks again for you’re help Mat.

I hope all is well :-)

Paul