Parallelizing stationary wavelet transform using OpenACC

Thank you Mat, that solved the problem.

Hi Mat,

Now that the wavelet transform is working asynchronously, I added the next step to the parallelized program, which consists of a simple energy detector at each level of the wavelet transform. The sequential version of the program works fine and also the parallelized version works fine as long as I do not use async clause. If I use the async clause with only one queue the program returns consistent wrong results. If I use the async clause with multiple queues the program returns different wrong results each time. This is my code:

/*
 * Test file for modwt
*/

#include <omp.h>
#include <sndfile.h>
#include <iostream>
#include "wavelib/wavelib.h" //wavelib RAFAT - C wavelet library
#include <fstream>

using namespace std;

// return the value of the maximum value of the double * in
double maxvvACC(double* in, int sz) {
    int i;
    int index = 0;
    double val = in[0];
    for (i = 1; i < sz; i++) {
        if (in[i] > val) {
            val = in[i];
            index = i;
        }
    }
    return in[index];
}

// return the value of the manimum value of the double * in
double minvvACC(double* in, int sz) {
    int i;
    int index = 0;
    double val = in[0];
    for (i = 1; i < sz; i++) {
        if (in[i] < val) {
            val = in[i];
            index = i;
        }
    }
    return in[index];
}

// Return the mean of double array "in"
#pragma acc routine vector
double meanvACC(double* in, int sz) {
    
    double sum = 0;
    #pragma acc loop reduction(+:sum)
    for(int i = 0;i < sz; i++) {
	    sum = sum + in[i];
    }

    return sum / sz;
}

// Return the mean of int array "in"
//TODO parallelize
double meanvACCint(int* in, int sz) {
    
    double sum = 0;
    for(int i = 0;i < sz; i++) {
	sum = sum + in[i];
    }

    return sum / sz;
}

//Get a sub-vector of array "x" from index "i1" to "i2"
//and store it in array "y"
#pragma acc routine vector
void getSubvACC(double* x, int szX, int i1, int i2, double *y, int szY){

    if ((i2 >= i1) && (i2 < szX) && (i1 >= 0) && ((i2-i1+1) <= szY)) { //Parameters check
        #pragma acc loop vector
        for(int i = i1; i <= i2; i++){
            y[i-i1] = x[i];
        }
    }
}

#pragma acc routine vector
void modwtACC2(int len_X, int level_N, int len_filt, double * lpd, int lpd_len, double * hpd, double *inp, double * cA, double * cD, double * filt,double * paramsOUT) {
	
  int i;
  int J;
  int temp_len;
  int iter;
  int M;
  int lenacc;

	temp_len = len_X;
	J = level_N;
	M = 1;

#pragma acc loop vector 
	for (i = 0; i < temp_len; i++) {
		paramsOUT[i] = inp[i];
	}

  lenacc = (J + 1) * temp_len; //levLens[J + 1];
	for (iter = 0; iter < J; iter++) {
		lenacc = lenacc - temp_len;
		if (iter > 0) {
			M = 2 * M;
		}

//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
    int l;
    int i2;
    int t;
    double s;

    s = sqrt(2.0);
    #pragma acc loop vector
	  for (i2 = 0; i2 < lpd_len; i2++) {
		  filt[i2] = double(lpd[i2] / s);
		  filt[lpd_len + i2] = double(hpd[i2]  / s);
    }

#pragma acc loop vector //check if this is really parallelizable
	  for (i2 = 0; i2 < temp_len; i2++) {
		  t = i2;
		  cA[i2] = filt[0] * paramsOUT[t]; //TODO  RESTORE!
		  cD[i2] = filt[lpd_len] * paramsOUT[t]; // TODO RESTORE!
      for (l = 1; l < lpd_len; l++) {
			  t = t - M;
			  while (t >= temp_len) {
				  t = t - temp_len;
			  }
			  while (t < 0) {
				  t = t + temp_len;
			  }

			  cA[i2] =  cA[i2] + filt[l] * paramsOUT[t];
			  cD[i2] =  cD[i2] + filt[lpd_len + l] * paramsOUT[t];

      }
    }
//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
#pragma acc loop vector
    for (i = 0; i < temp_len; i++) {
		    paramsOUT[i] = cA[i];
		    paramsOUT[lenacc + i] = cD[i];
		}
	}

}

//Compute the sample autocorrelation of vector "in" from 
//first lag "fl" to last lag "ll"
#pragma acc routine vector
void autocorrvACC(double* in, int szIn, int fl, int ll, double * out, int szOut){
  
    if ((ll-fl+1) <= szOut) {

        double mean = meanvACC(in,szIn);  	//Sample mean 
        double var = 0;		//Sample variance

        //Compute sample variance
        #pragma acc loop reduction(+:var)
        for (int i = 0; i < szIn; i++) {
	        var += (in[i]-mean)*(in[i]-mean);
        }
        var = var / szIn;

        //Compute sample-ACF
        #pragma acc loop vector
        for(int i = fl;i <= ll; i++) {// Lag loop	
	        out[i-fl] = 0;

	        for(int j = i;j < szIn; j++) { //Dot product
	          out[i-fl] += (in[j] - mean) * (in[j-i] - mean); 
          }
	        out[i-fl] = out[i-fl] / (var * szIn);
        }
    }
}

#pragma acc routine vector
void powvACC(double * in, int szIn, int n, double * out, int szOut) {

    if (szIn <= szOut) {
        #pragma acc loop vector
        for(int i = 0;i < szIn; i++) {
	        out[i] = pow(in[i], n);
        }
    }
}



int main (int argc, char *argv[]) {
    
  double gtBeg = omp_get_wtime(); //Global time
    
  //Variables***************************************************
  double * x;                   	//noisy signal
  int xSz;                  //size of variable x
  double fs;                 	//sample frequency (Hz)
  double fc = 2000;          	//cut-off frequency (Hz)
  double wsz = 3;             //window size (milliseconds)
  int ln = 4;               	//number of levels
 	double energy;			        //signal energy computed at each level/window
  double startP, endP, durationP; //time 
  //***********************************************************
     
  //Open a WAV*************************************************
  FILE* wavf;
  SF_INFO info;
  char* fileName;
  
  if (argc == 2) {
    fileName = argv[1];
	  printf("\n<<<Opening file: %s>>>\n\n",argv[1]);
  }
  else {
    fileName = "aud/9manatees.wav";
    printf("\n<<<Using default file: aud/9manatees.wav>>>\n\n");
  }

  wavf = fopen(fileName, "r");
    
  if(wavf != NULL){   //Succesful wav read

    SNDFILE* source = sf_open_fd(fileno(wavf), SFM_READ, &info, 1);	

    fs = info.samplerate;
    xSz = info.frames;
    x = new double[xSz];
    sf_read_double(source, x, xSz);
    sf_close(source);
    fclose(wavf);

    //Denoising method begins*****************************************************        
    printf("\nExecuting parallel wavelet... \n\n");		
	      
    //Process constants-------------------
    wave_object family = wave_init("db8");  //wavelet family
    double ZERO_CONST =  pow(10,-14);       //represents minimum signal energy 
    double wszNS = llround(wsz*0.001*fs);  	//window size in # samples
    int wszINT = wszNS;
    //--------------------------------------
    
   	//Initialize Variables-------------------------------------------------
    double alagBegNS = round(0.21 *0.001*fs);     //autocorrelation lag beggining
    double alagEndNS = round(1.25 *0.001*fs);     //autoorrelation lag end
 		double * inter_vec = new double[xSz];	//intermediate output vector
    double dt = 1. / fs;		//time step in seconds
  	double lastIdx =0;		//last index of processed portion of the signal
  	wt_object wt = wt_init(family,"modwt", wszNS, ln); //undecimated wavelet transform object
  
  	//DEBUG - Print basic info -----------------------------------
  	printf("\nProcess constants ----------------- \n");
    printf("Signal length: %d\n", xSz);
  	printf("wszNS: %g samples\n", wszNS);
    printf("Alag in number of smaples: [%f,%f]\n",alagBegNS,alagEndNS);
  	printf("----------------------------------- \n\n");
  	//--------------------------------------------------------------------	

  	//Allocate all memory before OpenACC region
    
    //Size of vectors in number of samples 	
  	int svSz = wszINT;     //size of temporal subvector
    int wsvSz = wszINT*(ln+1); //size of wavelet coefficients of the temporal subvector
  	int wvSz = xSz*(ln+1); //size of wavelet coefficients of the entire signal
    int avSz = alagEndNS - alagBegNS + 1; //size of autocorrelation vector
    int tfbSz = wszINT;
    
    //Size of blocks and windows
    int bSz = 128; //block size in number of windows
    int nWs = floor(xSz/wszINT);  //number of windows
    int nBlocks = ceil( floor(xSz/wszINT) / bSz );//number of blocks 
    int resWsz = nWs%bSz; //residual of the # of windows divided by the block size
    
    
  	//OpenACC variables-------------------------------------------------------------------------------------
  	double * restrict sub_vecACC = new double[svSz]; //a temporal subvector of the input signal      
  	double * restrict inter_vecACC = x; //inter_vec; //a pointer to the original input //TODO RESTORE!!
    double * restrict w_vecACC = new double[wvSz]; //UDWT of the entire signal
    double * restrict tfbACC = new double[tfbSz];   //a temporal vector to store one frequency channel of sub-vector sub_vecACC
    printf("\nDebug checkpoint #-1\n");
    double * restrict ACF_vecACC = new double[avSz] ;        //vector that store the autocorrelation function result         
    printf("\nDebug checkpoint #0\n");
    double ** restrict rms_vecACC = new double*[ln+1];       //rms value matrix of ACF at each level of each time window
    //------------------------------------------------------------------------------------------------------
     
     	printf("\nDebug checkpoint #1\n");
    //Initialization of matrix  
    for(int i = 0; i < ln+1; i++){
      rms_vecACC[i] = new double[nWs];
      /*for (int j = 0; j < nWs; j++) { Not necessary
        rms_vecACC[i][j] = 0;
      }*/
    }
     
    //Initialization of wavelet coefficient's vector 
    for (int yy = 0; yy < xSz*(ln+1); yy++){
      w_vecACC[yy] = 0;
    }

    //Input parameters of modwtACC2-------------------------------------
    int len_X = wt->siglength;          //Length of input signal X
    int level_N = wt->J;                //Number of levels
    int len_filt = wt->wave->filtlength;//Wave-filter length     
    int * levLens = wt->length;         //Length of each level
    double * lpd = wt->wave->lpd;       //Low Pass Details   
    int lpd_len = wt->wave->lpd_len;    //lpd length     
    double * hpd = wt->wave->hpd;       //High Pass Details   
    //Dynamic memory inputs: allocate and free memory outside ACC-loop 
    double * cA = new double[len_X]; //free(cA);
    double * cD = new double[len_X]; //free(cD);
    double * filt = new double[2 * lpd_len]; //free(filt)
    //Output parameters:
    double * paramsOUT = new double[wsvSz];        
    //------------------------------------------------------------------

    
    //----------------------------------------------------------------
    printf("\nBEFORE ACC REGION ----------\n");
    printf("Number of windows: %d\n",nWs);
    printf("Number of blocks: %d\n",nBlocks);
    printf("Residual of windows: %d\n",resWsz);
    printf("w_vecACC size: %d\n", xSz*(ln+1));
    printf("w_vecACC update operation's size %d\n", (xSz - wszINT)*(ln+1));
    printf("lpd length: %d\n", lpd_len);
    printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
    printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
    printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
    printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
    
    //Set levLens before cycle-------------------------
    levLens[level_N +1]= (level_N +1)*len_X;
    for (int i = 0; i < level_N+1; i++) {
        levLens[i] = len_X;
    }   
    //--------------------------------------------------       
    printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));        
    //----------------------------------------------------------------------------


    //DEBUG - READ ORIGINAL WAVELET COEFFICIENTS----------------------------------------
    ifstream myFile; //DEBUG CODE
	      myFile.open ("sweepEner.txt"); //DEBUG CODE
    int lineCnt = 0;
    string line;
    while (getline(myFile,line))
        ++lineCnt;
    myFile.clear();
    myFile.seekg(0,ios::beg);

    double * fileCoef = new double[lineCnt];
    for (int i=0; i <  lineCnt; i++) {
         myFile >>  fileCoef[i]; //DEBUG CODE
    }
	      myFile.close(); //DEBUG CODE
    
    //----------------------------------------------------------------------------------
    // DEBUG - WRITE WAVELET COEFFICIENTS-----------------------------------------------
    	ofstream myfile2;
      myfile2.open ("sweep2Ener.txt"); //DEBUG CODE*/
      startP = omp_get_wtime();
    //----------------------------------------------------------------------------------

#pragma acc data copyin(hpd[:lpd_len],lpd[:lpd_len]), copyout(rms_vecACC[:ln+1][:nWs]), create(w_vecACC[:wvSz],inter_vecACC[:xSz]) //**ASYNC COPYIN**          
//#pragma acc data copyin(inter_vecACC[:xSz],hpd[:lpd_len],lpd[:lpd_len]), create(w_vecACC[:wvSz])
{
          for (int q = 0; q < nBlocks; q++) { //For each block of windows
            int begI = q * bSz * wszINT; //block beggining
            int endI = min(begI + (bSz * wszINT),((nBlocks-1) * bSz * wszINT) + (resWsz * wszINT)); //block end
            /*printf("wszINT: %d\n", wszINT);
            printf("wszNS: %f\n", wszNS);
            printf("begI: %d\n", begI);
            printf("endI: %d\n", endI);
            printf("wsvSz: %d\n", wsvSz);*/

#pragma acc update device(inter_vecACC[begI:endI-begI]) async//(q%4) //**ASYNC COPYIN**             
#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:wsvSz],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) async//(q%4) //**ASYNC COPYIN**             
//#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:wsvSz],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) async(q%4) //num_gangs(2056),num_workers(1),vector_length(32) 
            for (int i = begI ;i < endI; i = i + wszINT) { //For each window compute the ACF-rms value in the wavelet domain
      		    
              int val = i + wszINT -1; //added to avoid a pgi compiler's bug
              //Get subvector	(window)	
        		  getSubvACC(inter_vecACC, xSz,i,val,sub_vecACC, svSz);
  
        		  //Apply UDWT to window
              modwtACC2(len_X,level_N,len_filt,lpd,lpd_len,hpd,sub_vecACC,cA,cD,filt,paramsOUT);
                            
        		  //Store UDWT coefficients
              #pragma acc loop
              for (int j=0; j < wsvSz; j++) {
  	    	      w_vecACC[i*(ln+1) + j] = paramsOUT[j];
   	          }
               
              //ACF and rms value computation for each level
              #pragma acc loop private(tfbACC[:tfbSz],ACF_vecACC[:avSz],energy,i)            
              for (int j=0;j < ln+1; j++) {
          		    
      		      energy = 0;
          			
                #pragma acc loop reduction(+:energy)
          		  for (int k=0; k < wszINT; k++) { //Obtain level j's coefs and estimate signal energy		
          			  tfbACC[k] = paramsOUT[k + (j*wszINT)];
          			  energy += pow(tfbACC[k],2); 
          		  }         
          		  energy = sqrt(energy / wszNS);  //signal energy in window "i" and level "j"
                rms_vecACC[j][i/wszINT] = energy;
          		  /*if (energy < ZERO_CONST) { //if minimun energy is not reached, asign zero score
          			  rms_vecACC[j][i/wszINT] = 0; //zero score to the actual window and level
          		  }
          		  else { 					                                                
                  autocorrvACC(tfbACC,tfbSz,alagBegNS,alagEndNS, ACF_vecACC,avSz); //TODO change parameters
			            powvACC(ACF_vecACC,avSz,2,ACF_vecACC,avSz);		
	                rms_vecACC[j][i/wszINT] = sqrt(meanvACC(ACF_vecACC,avSz)); //store rms value of ACF on [alag[0] alag[1]] range                                   
          		  }*/
   		        }
            }     				                                  
          //Update block of data
          #pragma acc update self(w_vecACC[begI*(ln+1):(endI-begI)*(ln+1)]) async//(q%4) //,rms_vecACC[:ln+1][begI/wszINT:(endI-begI)/wszINT]
          }
}
          #pragma acc wait //ASYNC wait  
       	endP = omp_get_wtime();
 	      durationP = endP - startP;
        
        printf("\nAFTER ACC REGION ----------\n");
        printf("Parallel execution time of OpenACC region: %f seconds\n", durationP);                            
        printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
        printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
        printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
        printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
        printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));            

        printf("w_vecACC max value is: %f\n", maxvvACC(w_vecACC, wvSz));
        printf("w_vecACC min value is: %f\n", minvvACC(w_vecACC, wvSz));	    
        printf("w_vecACC mean value is: %f\n", meanvACC(w_vecACC, wvSz));
         printf("line_Cnt is: %d and rms_vecACC size is: %d\n",lineCnt, nWs*(ln+1));         

        //Compare wavelet coefficients----------------
        double mae = 0;
        double sigSum = 0;
        double sigSum2 = 0;
        /*for (int i = 0; i < wvSz; i++) { //Comparison for 1 dimensional vectors
           myfile2 << w_vecACC[i] << "\n"; 
           mae += fabs(fileCoef[i] - w_vecACC[i]);
           sigSum += fabs(fileCoef[i]);
           sigSum2 += fabs(w_vecACC[i]);
        }*/
        for (int j = 0;j<ln+1; j++) {
          for (int i = 0;i<nWs; i++) { //Comparison for 2 dimensional vectors
            myfile2 << rms_vecACC[j][i] << "\n"; 
            mae += fabs(fileCoef[j*nWs + i] -  rms_vecACC[j][i]);
            sigSum += fabs(fileCoef[j*nWs + i]);
            sigSum2 += fabs(rms_vecACC[j][i]);
          }
        }
        
        sigSum = sigSum / lineCnt;
        sigSum2 = sigSum2 / lineCnt;
        mae = mae / lineCnt;
        printf("MAE: %f\n", mae);
        printf("Original SigSum: %f\n", sigSum);
        printf("Obtained SigSum: %f\n", sigSum2);
        //--------------------------------------------
        myfile2.close();
        
        //Free used memory in ACC cycle
        delete [] fileCoef;
        
        //TODO delete rms structure
        for (int i = 0;i < ln+1; i++){
          delete [] rms_vecACC[i];
        }
        delete [] rms_vecACC;
        delete [] tfbACC;
        delete [] inter_vec;
        delete [] sub_vecACC;
        delete [] paramsOUT; 
        delete [] cA;
        delete [] cD;
        delete [] filt;
        delete [] w_vecACC;
      
        wt_free(wt);
               
        //---------------------------------------------------------------------------
        //---------------------------------------------------------------------------

        delete[] x;

    }
    double gtEnd = omp_get_wtime(); //Global time
    double gtDur = gtEnd - gtBeg;
    printf("Global execution time: %f seconds\n", gtDur);                            
       
    return 0;
}

I’m executing the program with the same previously used audio file “audiocheck.net_hdsweep_1Hz_48000Hz_-3dBFS_30s.wav”, so the execution line is something like:

./pgimexa2 sweep.wav

And the compilation line also remains the same:

pgc++ -lsndfile -fast -ta=tesla -Minfo=accel pgimexa.2.cpp -o pgimexa2 libwavelib2.a

To compare if the obtained results are fine just download the text file sweepEner.txt at my GitHub: AMCM/sweepEner.txt at master · j-castro/AMCM · GitHub. So, if everything is fine the printed Mean Absolute Error (MAE) should be zero always. So finally the question is: Am I doing something wrong when executing asynchronously this code or is this some kind of compiler bug or something?

For good or bad, I’m not able to recreate the error. Granted I’m running on a P100, but I doubt it would be different than a K40.

Here’s my output below. I do see a MAE of 0.0 and a diff of sweep2Ener.txt to one that I created with a CPU only run match.

Note that I did download the sweepEner.txt file.

Did I miss something?

% grep async pgimexa.2.cpp                                                             #pragma acc update device(inter_vecACC[begI:endI-begI]) async(q%4) //**ASYNC COPYIN**
 #pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:wsvSz],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) async(q%4) //**ASYNC COPYIN**
 //#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:wsvSz],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) asyn (q%4) //num_gangs(2056),num_workers(1),vector_length(32)
           #pragma acc update self(w_vecACC[begI*(ln+1):(endI-begI)*(ln+1)]) async(q%4) //,rms_vecACC[:ln+1][begI/wszINT:(endI-begI)/wszINT]
% pgc++ -L../libsndfile/lib/ -I../libsndfile/include -I. -lsndfile -fast -ta=tesla:cc60 pgimexa.2.cpp -o pgimexa2 libwavelib.a -w -V17.4
pgimexa.2.cpp:
% pgc++ -L../libsndfile/lib/ -I../libsndfile/include -I. -lsndfile -fast -ta=tesla:cc60 pgimexa.2.cpp -o pgimexa2 libwavelib.a -w -V17.4
pgimexa.2.cpp:
% pgimexa2 audiocheck.net_hdsweep_1Hz_48000Hz_-3dBFS_30s.wav                          
<<<Opening file: audiocheck.net_hdsweep_1Hz_48000Hz_-3dBFS_30s.wav>>>


Executing parallel wavelet...


Process constants -----------------
Signal length: 2880001
wszNS: 288 samples
Alag in number of smaples: [20.000000,120.000000]
-----------------------------------


Debug checkpoint #-1

Debug checkpoint #0

Debug checkpoint #1

BEFORE ACC REGION ----------
Number of windows: 10000
Number of blocks: 79
Residual of windows: 16
w_vecACC size: 14400005
w_vecACC update operation's size 14398565
lpd length: 16
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000

AFTER ACC REGION ----------
Parallel execution time of OpenACC region: 0.804450 seconds
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
w_vecACC max value is: 0.937597
w_vecACC min value is: -0.937109
w_vecACC mean value is: 0.000041
line_Cnt is: 50000 and rms_vecACC size is: 50000
MAE: 0.000000
Original SigSum: 0.127271
Obtained SigSum: 0.127271
Global execution time: 0.968645 seconds

Accelerator Kernel Timing data
    Timing may be affected by asynchronous behavior
    set PGI_ACC_SYNCHRONOUS to 1 to disable async() clauses
/local/home/colgrove/wavelib/src/pgimexa.2.cpp
  main  NVIDIA  devicenum=0
    time(us): 44,176
    367: data region reached 2 times
        34: kernel launched 5 times
            grid: [1]  block: [128]
             device time(us): total=11 max=3 min=2 avg=2
            elapsed time(us): total=690 max=581 min=22 avg=138
        367: data copyin transfers: 2
             device time(us): total=31 max=24 min=7 avg=15
        421: data copyout transfers: 5
             device time(us): total=63 max=15 min=12 avg=12
    370: update directive reached 79 times
        370: data copyin transfers: 79
             device time(us): total=2,522 max=42 min=13 avg=31
    370: compute region reached 79 times
        370: kernel launched 79 times
            grid: [16-128]  block: [32]
             device time(us): total=32,036 max=412 min=392 avg=405
            elapsed time(us): total=34,059 max=459 min=421 avg=431
    420: update directive reached 79 times
        420: data copyout transfers: 79
             device time(us): total=9,513 max=134 min=22 avg=120

-Mat

It looks it is ok, what is your compilation’s output? my compilation’s output is:

pgc++ -lsndfile -fast -ta=tesla -Minfo=accel pgimexa.2.cpp -o pgimexa2 libwavelib2.a
pgimexa.2.cpp:
"wavelib/wavefilt.h", line 31: warning: last line of file ends without a
          newline
  #endif /* WAVEFILT_H_ */
                          ^

"wavelib/cwtmath.h", line 13: warning: allowing all exceptions is incompatible
          with previous function "gamma" (declared at line 265 of
          "/usr/include/bits/mathcalls.h")
  double gamma(double x);
                        ^

"pgimexa.2.cpp", line 217: warning: conversion from a string literal to "char
          *" is deprecated
      fileName = "aud/9manatees.wav";
               ^

"pgimexa.2.cpp", line 238: warning: conversion from a string literal to "char
          *" is deprecated
      wave_object family = wave_init("db8");  //wavelet family
                                     ^

"pgimexa.2.cpp", line 250: warning: conversion from a string literal to "char
          *" is deprecated
        wt_object wt = wt_init(family,"modwt", wszNS, ln); //undecimated wavelet transform object
                                      ^

"pgimexa.2.cpp", line 248: warning: variable "dt" was declared but never
          referenced
      double dt = 1. / fs;              //time step in seconds
             ^

"pgimexa.2.cpp", line 249: warning: variable "lastIdx" was declared but never
          referenced
        double lastIdx =0;              //last index of processed portion of the signal
               ^

"pgimexa.2.cpp", line 200: warning: variable "fc" was declared but never
          referenced
    double fc = 2000;           //cut-off frequency (Hz)
           ^

meanvACC(double *, int):
     43, Generating Tesla code
         47, #pragma acc loop vector /* threadIdx.x */
     47, Loop is parallelizable
getSubvACC(double *, int, int, int, double *, int):
     69, Generating Tesla code
         73, #pragma acc loop vector /* threadIdx.x */
     73, Loop is parallelizable
modwtACC2(int, int, int, double *, int, double *, double *, double *, double *, double *, double *):
     80, Generating Tesla code
         94, #pragma acc loop vector /* threadIdx.x */
         99, #pragma acc loop seq
        114, #pragma acc loop vector /* threadIdx.x */
        120, #pragma acc loop vector /* threadIdx.x */
        124, #pragma acc loop seq
        126, #pragma acc loop seq
        129, #pragma acc loop seq
        141, #pragma acc loop vector /* threadIdx.x */
     94, Loop is parallelizable
     99, Loop carried scalar dependence for M at line 102,125,80
    114, Loop is parallelizable
    120, Loop is parallelizable
    124, Loop carried scalar dependence for t at line 125,80
         Complex loop carried dependence of paramsOUT->,filt->,cA->,cD-> prevents parallelization
         Loop carried reuse of cD->,cA-> prevents parallelization
    127, Accelerator restriction: induction variable live-out from loop: t
    128, Accelerator restriction: induction variable live-out from loop: t
    131, Accelerator restriction: induction variable live-out from loop: t
    141, Loop is parallelizable
autocorrvACC(double *, int, int, int, double *, int):
    152, Generating Tesla code
        161, #pragma acc loop vector /* threadIdx.x */
        168, #pragma acc loop vector /* threadIdx.x */
        171, #pragma acc loop seq
    161, Loop is parallelizable
    168, Loop is parallelizable
    171, Complex loop carried dependence of in->,out-> prevents parallelization
         Loop carried reuse of out-> prevents parallelization
         Loop carried dependence of out-> prevents parallelization
         Loop carried backward dependence of out-> prevents vectorization
powvACC(double *, int, int, double *, int):
    180, Generating Tesla code
        184, #pragma acc loop vector /* threadIdx.x */
    184, Loop is parallelizable
main:
    366, Generating create(inter_vecACC[:xSz])
         Generating copyin(lpd[:lpd_len])
         Generating create(w_vecACC[:wvSz])
         Generating copyout(rms_vecACC[:ln+1][:nWs])
         Generating copyin(hpd[:lpd_len])
    369, Generating update device(inter_vecACC[begI:endI-begI])
         Accelerator kernel generated
         Generating Tesla code
        379, #pragma acc loop gang /* blockIdx.x */
        390, #pragma acc loop vector(32) /* threadIdx.x */
        396, #pragma acc loop seq
        401, #pragma acc loop vector(32) /* threadIdx.x */
             Generating reduction(+:energy)
    390, Loop is parallelizable
    396, Accelerator restriction: size of the GPU copy of paramsOUT is unknown
         Loop is parallelizable
    401, Loop is parallelizable
    419, Generating update self(w_vecACC[(ln+1)*begI:(ln+1)*(endI-begI)])

Looks exactly the same.

Can you test it with a k40c? Because I’m running the same code and my output is:

./pgimexa2 sweep.wav

<<<Opening file: sweep.wav>>>


Executing parallel wavelet...


Process constants -----------------
Signal length: 2880001
wszNS: 288 samples
Alag in number of smaples: [20.000000,120.000000]
-----------------------------------


Debug checkpoint #-1

Debug checkpoint #0

Debug checkpoint #1

BEFORE ACC REGION ----------
Number of windows: 10000
Number of blocks: 79
Residual of windows: 16
w_vecACC size: 14400005
w_vecACC update operation's size 14398565
lpd length: 16
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000

AFTER ACC REGION ----------
Parallel execution time of OpenACC region: 2.070275 seconds
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
w_vecACC max value is: 0.937597
w_vecACC min value is: -0.937109
w_vecACC mean value is: 0.000041
line_Cnt is: 50000 and rms_vecACC size is: 50000
MAE: 0.000138
Original SigSum: 0.127271
Obtained SigSum: 0.127134
Global execution time: 2.325482 seconds

Also, If I comment out the “async” clause I get correct answers:



./pgimexa2 sweep.wav

<<<Opening file: sweep.wav>>>


Executing parallel wavelet...


Process constants -----------------
Signal length: 2880001
wszNS: 288 samples
Alag in number of smaples: [20.000000,120.000000]
-----------------------------------


Debug checkpoint #-1

Debug checkpoint #0

Debug checkpoint #1

BEFORE ACC REGION ----------
Number of windows: 10000
Number of blocks: 79
Residual of windows: 16
w_vecACC size: 14400005
w_vecACC update operation's size 14398565
lpd length: 16
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000

AFTER ACC REGION ----------
Parallel execution time of OpenACC region: 2.053668 seconds
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
w_vecACC max value is: 0.937597
w_vecACC min value is: -0.937109
w_vecACC mean value is: 0.000041
line_Cnt is: 50000 and rms_vecACC size is: 50000
MAE: 0.000000
Original SigSum: 0.127271
Obtained SigSum: 0.127271
Global execution time: 2.303171 seconds

So, the question is: Is there any problem in the way I’m using the async clause? Otherwise, it seems like the problem is related to the use of k40c GPU.

Did you try running it on a K80 and with the PGI community compiler???

Yes, and did get incorrect answers but I’m still try to determine what’s wrong, at least for some runs. Running the code repeatedly seems to sometimes get correct answers.

Note that I did find if I switched to using CUDA Unified Memory (-ta=tesla:managed), I get correct answers.

Also, if I set the environment varialbe “PGI_ACC_FILL=1” which initialize the GPU memory to zero, I’ll get consistent albeit wrong results (MAE = 0.001981).

Everything seems to point to some type of data race but still not sure where or why.

I’ll continue to review as a background task but I have a major project that I’m working on so it may be a day or so before I can get back to it.

-Mat

Thank you Mat, finally I solved it! I was reading the book: “Parallel Programming with OpenACC” and find out that the wait clause had to be placed inside the data region, so at the end of the OpenACC region I changed this:

.
.
.
//Update block of data 
          #pragma acc update self(w_vecACC[begI*(ln+1):(endI-begI)*(ln+1)]) async//(q%4) //,rms_vecACC[:ln+1][begI/wszINT:(endI-begI)/wszINT] 
          } 
} 
          #pragma acc wait //ASYNC wait
.
.
.

for this:

.
.
.
//Update block of data 
          #pragma acc update self(w_vecACC[begI*(ln+1):(endI-begI)*(ln+1)]) async//(q%4) //,rms_vecACC[:ln+1][begI/wszINT:(endI-begI)/wszINT] 
          } 
          #pragma acc wait //ASYNC wait
} 
.
.
.

and everything worked perfectly and asynchronously!

Ugh, I don’t know why I missed that! Glad you were able to solve it.

-Mat

Hi Mat,

Again I have a strange bug that arises when using openACC. I just added 8 new lines of code in the OpenACC region, where I calculate the autocorrelation function and the RMS value if an energy threshold condition is satisfied:

if (energy < ZERO_CONST) { //if minimun energy is not reached, asign zero score
    rms_vecACC[j][i/wszINT] = 0; //zero score to the actual window and level 
}
else { 					                                                 
    autocorrvACC(tfbACC,tfbSz,alagBegNS,alagEndNS,ACF_vecACC,avSz); //TODO change parameters
    powvACC(ACF_vecACC,avSz,2,ACF_vecACC,avSz);
    rms_vecACC[j][i/wszINT] = sqrt(meanvACC(ACF_vecACC,avSz)); //store rms value of ACF on [alag[0] alag[1]] range                                   
}

This code works if I compile it for multicore or sequential-GPU (using num_gangs(1), vector_length(1)), otherwise, it produces consistent wrong results. To quickly test the correctness of the code I use the files “9manatees.wav” and “manateeRMS.txt” that you can download at my GitHub GitHub - j-castro/AMCM: Automatic manatee count method. So again, the Mean Absolute Error (MAE) should be equal to zero.

This is the complete code:

/*
 * Test file for modwt
*/

#include <omp.h>
#include <sndfile.h>
#include <iostream>
#include "wavelib/wavelib.h" //wavelib RAFAT - C wavelet library
#include <fstream>

using namespace std;

// return the value of the maximum value of the double * in
double maxvvACC(double* in, int sz) {
    int i;
    int index = 0;
    double val = in[0];
    for (i = 1; i < sz; i++) {
        if (in[i] > val) {
            val = in[i];
            index = i;
        }
    }
    return in[index];
}

// return the value of the manimum value of the double * in
double minvvACC(double* in, int sz) {
    int i;
    int index = 0;
    double val = in[0];
    for (i = 1; i < sz; i++) {
        if (in[i] < val) {
            val = in[i];
            index = i;
        }
    }
    return in[index];
}

// Return the mean of double array "in"
#pragma acc routine vector
double meanvACC(double* in, int sz) {
    
    double sum = 0;
    #pragma acc loop reduction(+:sum)
    for(int i = 0;i < sz; i++) {
	    sum = sum + in[i];
    }

    return sum / sz;
}

// Return the mean of int array "in"
//TODO parallelize
double meanvACCint(int* in, int sz) {
    
    double sum = 0;
    for(int i = 0;i < sz; i++) {
	sum = sum + in[i];
    }

    return sum / sz;
}

//Get a sub-vector of array "x" from index "i1" to "i2"
//and store it in array "y"
#pragma acc routine vector
void getSubvACC(double* x, int szX, int i1, int i2, double *y, int szY){

    if ((i2 >= i1) && (i2 < szX) && (i1 >= 0) && ((i2-i1+1) <= szY)) { //Parameters check
        #pragma acc loop vector
        for(int i = i1; i <= i2; i++){
            y[i-i1] = x[i];
        }
    }
}

#pragma acc routine vector
void modwtACC2(int len_X, int level_N, int len_filt, double * lpd, int lpd_len, double * hpd, double *inp, double * cA, double * cD, double * filt,double * paramsOUT) {
	
  int i;
  int J;
  int temp_len;
  int iter;
  int M;
  int lenacc;

	temp_len = len_X;
	J = level_N;
	M = 1;

#pragma acc loop vector 
	for (i = 0; i < temp_len; i++) {
		paramsOUT[i] = inp[i];
	}

  lenacc = (J + 1) * temp_len; //levLens[J + 1];
	for (iter = 0; iter < J; iter++) {
		lenacc = lenacc - temp_len;
		if (iter > 0) {
			M = 2 * M;
		}

//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
    int l;
    int i2;
    int t;
    double s;

    s = sqrt(2.0);
    #pragma acc loop vector
	  for (i2 = 0; i2 < lpd_len; i2++) {
		  filt[i2] = double(lpd[i2] / s);
		  filt[lpd_len + i2] = double(hpd[i2]  / s);
    }

#pragma acc loop vector //check if this is really parallelizable
	  for (i2 = 0; i2 < temp_len; i2++) {
		  t = i2;
		  cA[i2] = filt[0] * paramsOUT[t]; //TODO  RESTORE!
		  cD[i2] = filt[lpd_len] * paramsOUT[t]; // TODO RESTORE!
      for (l = 1; l < lpd_len; l++) {
			  t = t - M;
			  while (t >= temp_len) {
				  t = t - temp_len;
			  }
			  while (t < 0) {
				  t = t + temp_len;
			  }

			  cA[i2] =  cA[i2] + filt[l] * paramsOUT[t];
			  cD[i2] =  cD[i2] + filt[lpd_len + l] * paramsOUT[t];

      }
    }
//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
#pragma acc loop vector
    for (i = 0; i < temp_len; i++) {
		    paramsOUT[i] = cA[i];
		    paramsOUT[lenacc + i] = cD[i];
		}
	}

}

//Compute the sample autocorrelation of vector "in" from 
//first lag "fl" to last lag "ll"
#pragma acc routine vector
void autocorrvACC(double* in, int szIn, int fl, int ll, double * out, int szOut){
  
    if ((ll-fl+1) <= szOut) {

        double mean = meanvACC(in,szIn);  	//Sample mean 
        double var = 0;		//Sample variance

        //Compute sample variance
        #pragma acc loop reduction(+:var)
        for (int i = 0; i < szIn; i++) {
	        var += (in[i]-mean)*(in[i]-mean);
        }
        var = var / szIn;

        //Compute sample-ACF
        #pragma acc loop vector
        for(int i = fl;i <= ll; i++) {// Lag loop	
	        out[i-fl] = 0;

	        for(int j = i;j < szIn; j++) { //Dot product
	          out[i-fl] += (in[j] - mean) * (in[j-i] - mean); 
          }
	        out[i-fl] = out[i-fl] / (var * szIn);
        }
    }
}

#pragma acc routine vector
void powvACC(double * in, int szIn, int n, double * out, int szOut) {

    if (szIn <= szOut) {
        #pragma acc loop vector
        for(int i = 0;i < szIn; i++) {
	        out[i] = pow(in[i], n);
        }
    }
}



int main (int argc, char *argv[]) {
    
  double gtBeg = omp_get_wtime(); //Global time
    
  //Variables***************************************************
  double * x;                   	//noisy signal
  int xSz;                  //size of variable x
  double fs;                 	//sample frequency (Hz)
  double fc = 2000;          	//cut-off frequency (Hz)
  double wsz = 3;             //window size (milliseconds)
  int ln = 4;               	//number of levels
 	double energy;			        //signal energy computed at each level/window
  double startP, endP, durationP; //time 
  //***********************************************************
     
  //Open a WAV*************************************************
  FILE* wavf;
  SF_INFO info;
  char* fileName;
  
  if (argc == 2) {
    fileName = argv[1];
	  printf("\n<<<Opening file: %s>>>\n\n",argv[1]);
  }
  else {
    fileName = "aud/9manatees.wav";
    printf("\n<<<Using default file: aud/9manatees.wav>>>\n\n");
  }

  wavf = fopen(fileName, "r");
    
  if(wavf != NULL){   //Succesful wav read

    SNDFILE* source = sf_open_fd(fileno(wavf), SFM_READ, &info, 1);	

    fs = info.samplerate;
    xSz = info.frames;
    x = new double[xSz];
    sf_read_double(source, x, xSz);
    sf_close(source);
    fclose(wavf);

    //Denoising method begins*****************************************************        
    printf("\nExecuting parallel wavelet... \n\n");		
	      
    //Process constants-------------------
    wave_object family = wave_init("db8");  //wavelet family
    double ZERO_CONST =  pow(10,-14);       //represents minimum signal energy 
    double wszNS = llround(wsz*0.001*fs);  	//window size in # samples
    int wszINT = wszNS;
    //--------------------------------------
    
   	//Initialize Variables-------------------------------------------------
    double alagBegNS = round(0.21 *0.001*fs);     //autocorrelation lag beggining
    double alagEndNS = round(1.25 *0.001*fs);     //autoorrelation lag end
 		double * inter_vec = new double[xSz];	//intermediate output vector
    double dt = 1. / fs;		//time step in seconds
  	double lastIdx =0;		//last index of processed portion of the signal
  	wt_object wt = wt_init(family,"modwt", wszNS, ln); //undecimated wavelet transform object
  
  	//DEBUG - Print basic info -----------------------------------
  	printf("\nProcess constants ----------------- \n");
    printf("Signal length: %d\n", xSz);
  	printf("wszNS: %g samples\n", wszNS);
    printf("Alag in number of smaples: [%f,%f]\n",alagBegNS,alagEndNS);
  	printf("----------------------------------- \n\n");
  	//--------------------------------------------------------------------	

  	//Allocate all memory before OpenACC region
    
    //Size of vectors in number of samples 	
  	int svSz = wszINT;     //size of temporal subvector
    int wsvSz = wszINT*(ln+1); //size of wavelet coefficients of the temporal subvector
  	int wvSz = xSz*(ln+1); //size of wavelet coefficients of the entire signal
    int avSz = alagEndNS - alagBegNS + 1; //size of autocorrelation vector
    int tfbSz = wszINT;
    
    //Size of blocks and windows
    int bSz = 128; //block size in number of windows
    int nWs = floor(xSz/wszINT);  //number of windows
    int nBlocks = ceil( floor(xSz/wszINT) / bSz );//number of blocks 
    int resWsz = nWs%bSz; //residual of the # of windows divided by the block size
    
    
  	//OpenACC variables-------------------------------------------------------------------------------------
  	double * restrict sub_vecACC = new double[svSz]; //a temporal subvector of the input signal      
  	double * restrict inter_vecACC = x; //inter_vec; //a pointer to the original input //TODO RESTORE!!
    double * restrict w_vecACC = new double[wvSz]; //UDWT of the entire signal
    double * restrict tfbACC = new double[tfbSz];   //a temporal vector to store one frequency channel of sub-vector sub_vecACC
    printf("\nDebug checkpoint #-1\n");
    double * restrict ACF_vecACC = new double[avSz] ;        //vector that store the autocorrelation function result         
    printf("\nDebug checkpoint #0\n");
    double ** restrict rms_vecACC = new double*[ln+1];       //rms value matrix of ACF at each level of each time window
    //------------------------------------------------------------------------------------------------------
     
     	printf("\nDebug checkpoint #1\n");
    //Initialization of matrix  
    for(int i = 0; i < ln+1; i++){
      rms_vecACC[i] = new double[nWs];
      /*for (int j = 0; j < nWs; j++) { Not necessary
        rms_vecACC[i][j] = 0;
      }*/
    }
     
    //Initialization of wavelet coefficient's vector 
    for (int yy = 0; yy < xSz*(ln+1); yy++){
      w_vecACC[yy] = 0;
    }

    //Input parameters of modwtACC2-------------------------------------
    int len_X = wt->siglength;          //Length of input signal X
    int level_N = wt->J;                //Number of levels
    int len_filt = wt->wave->filtlength;//Wave-filter length     
    int * levLens = wt->length;         //Length of each level
    double * lpd = wt->wave->lpd;       //Low Pass Details   
    int lpd_len = wt->wave->lpd_len;    //lpd length     
    double * hpd = wt->wave->hpd;       //High Pass Details   
    //Dynamic memory inputs: allocate and free memory outside ACC-loop 
    double * cA = new double[len_X]; //free(cA);
    double * cD = new double[len_X]; //free(cD);
    double * filt = new double[2 * lpd_len]; //free(filt)
    //Output parameters:
    double * paramsOUT = new double[wsvSz];        
    //------------------------------------------------------------------

    
    //----------------------------------------------------------------
    printf("\nBEFORE ACC REGION ----------\n");
    printf("Number of windows: %d\n",nWs);
    printf("Number of blocks: %d\n",nBlocks);
    printf("Residual of windows: %d\n",resWsz);
    printf("w_vecACC size: %d\n", xSz*(ln+1));
    printf("w_vecACC update operation's size %d\n", (xSz - wszINT)*(ln+1));
    printf("lpd length: %d\n", lpd_len);
    printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
    printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
    printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
    printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
    
    //Set levLens before cycle-------------------------
    levLens[level_N +1]= (level_N +1)*len_X;
    for (int i = 0; i < level_N+1; i++) {
        levLens[i] = len_X;
    }   
    //--------------------------------------------------       
    printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));        
    //----------------------------------------------------------------------------


    //DEBUG - READ ORIGINAL WAVELET COEFFICIENTS----------------------------------------
    ifstream myFile; //DEBUG CODE
	      myFile.open ("manateeRMS.txt"); //DEBUG CODE
    int lineCnt = 0;
    string line;
    while (getline(myFile,line))
        ++lineCnt;
    myFile.clear();
    myFile.seekg(0,ios::beg);

    double * fileCoef = new double[lineCnt];
    for (int i=0; i <  lineCnt; i++) {
         myFile >>  fileCoef[i]; //DEBUG CODE
    }
	      myFile.close(); //DEBUG CODE
    
    //----------------------------------------------------------------------------------
    // DEBUG - WRITE WAVELET COEFFICIENTS-----------------------------------------------
    	ofstream myfile2;
      myfile2.open ("manatee2RMS.txt"); //DEBUG CODE*/
      startP = omp_get_wtime();
    //----------------------------------------------------------------------------------

#pragma acc data copyin(hpd[:lpd_len],lpd[:lpd_len]), copyout(rms_vecACC[:ln+1][:nWs]), create(w_vecACC[:wvSz],inter_vecACC[:xSz]) //**ASYNC COPYIN**          
//#pragma acc data copyin(inter_vecACC[:xSz],hpd[:lpd_len],lpd[:lpd_len]), create(w_vecACC[:wvSz])
{
          for (int q = 0; q < nBlocks; q++) { //For each block of windows
            int begI = q * bSz * wszINT; //block beggining
            int endI = min(begI + (bSz * wszINT),((nBlocks-1) * bSz * wszINT) + (resWsz * wszINT)); //block end
            /*printf("wszINT: %d\n", wszINT);
            printf("wszNS: %f\n", wszNS);
            printf("begI: %d\n", begI);
            printf("endI: %d\n", endI);
            printf("wsvSz: %d\n", wsvSz);*/

#pragma acc update device(inter_vecACC[begI:endI-begI]) async (q%4) //**ASYNC COPYIN**             
#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:wsvSz],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) async (q%4) //**ASYNC COPYIN** //num_gangs(1), vector_length(1)          
//#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:wsvSz],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) async(q%4) //num_gangs(2056),num_workers(1),vector_length(32) 
            for (int i = begI ;i < endI; i = i + wszINT) { //For each window compute the ACF-rms value in the wavelet domain
      		    
              int val = i + wszINT -1; //added to avoid a pgi compiler's bug
              //Get subvector	(window)	
        		  getSubvACC(inter_vecACC, xSz,i,val,sub_vecACC, svSz);
  
        		  //Apply UDWT to window
              modwtACC2(len_X,level_N,len_filt,lpd,lpd_len,hpd,sub_vecACC,cA,cD,filt,paramsOUT);
                            
        		  //Store UDWT coefficients
              #pragma acc loop
              for (int j=0; j < wsvSz; j++) {
  	    	      w_vecACC[i*(ln+1) + j] = paramsOUT[j];
   	          }
               
              //ACF and rms value computation for each level
              #pragma acc loop private(tfbACC[:tfbSz],ACF_vecACC[:avSz],energy,i)            
              for (int j=0;j < ln+1; j++) {
          		    
      		      energy = 0;
          			
                #pragma acc loop reduction(+:energy)
          		  for (int k=0; k < wszINT; k++) { //Obtain level j's coefs and estimate signal energy		
          			  tfbACC[k] = paramsOUT[k + (j*wszINT)];
          			  energy += pow(tfbACC[k],2); 
          		  }         
          		  energy = sqrt(energy / wszNS);  //signal energy in window "i" and level "j"
                  
          		  if (energy < ZERO_CONST) { //if minimun energy is not reached, asign zero score
          			  rms_vecACC[j][i/wszINT] = 0; //zero score to the actual window and level 
          		  }
          		  else { 					                                                
                  autocorrvACC(tfbACC,tfbSz,alagBegNS,alagEndNS, ACF_vecACC,avSz); //TODO change parameters
			            powvACC(ACF_vecACC,avSz,2,ACF_vecACC,avSz);		
	                rms_vecACC[j][i/wszINT] = sqrt(meanvACC(ACF_vecACC,avSz)); //store rms value of ACF on [alag[0] alag[1]] range                                   
          		  }
   		        }
            }     				                                  
          //Update block of data
          #pragma acc update self(w_vecACC[begI*(ln+1):(endI-begI)*(ln+1)]) async (q%4) //,rms_vecACC[:ln+1][begI/wszINT:(endI-begI)/wszINT]
          }
          #pragma acc wait //ASYNC wait 
}
  
       	endP = omp_get_wtime();
 	      durationP = endP - startP;
        
        printf("\nAFTER ACC REGION ----------\n");
        printf("Parallel execution time of OpenACC region: %f seconds\n", durationP);                            
        printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
        printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
        printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
        printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
        printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));            

        printf("w_vecACC max value is: %f\n", maxvvACC(w_vecACC, wvSz));
        printf("w_vecACC min value is: %f\n", minvvACC(w_vecACC, wvSz));	    
        printf("w_vecACC mean value is: %f\n", meanvACC(w_vecACC, wvSz));
        //printf("line_Cnt is: %d and rms_vecACC size is: %d\n",lineCnt, nWs*(ln+1));         

        //Compare wavelet coefficients----------------
        double mae = 0;
        double sigSum = 0;
        double sigSum2 = 0;
        /*for (int i = 0; i < wvSz; i++) { //Comparison for 1 dimensional vectors
           myfile2 << w_vecACC[i] << "\n"; 
           mae += fabs(fileCoef[i] - w_vecACC[i]);
           sigSum += fabs(fileCoef[i]);
           sigSum2 += fabs(w_vecACC[i]);
        }*/
        for (int j = 0;j<ln+1; j++) {
          for (int i = 0;i<nWs; i++) { //Comparison for 2 dimensional vectors
            myfile2 << rms_vecACC[j][i] << "\n"; 
            mae += fabs(fileCoef[j*nWs + i] -  rms_vecACC[j][i]);
            sigSum += fabs(fileCoef[j*nWs + i]);
            sigSum2 += fabs(rms_vecACC[j][i]);
          }
        }
        
        sigSum = sigSum / lineCnt;
        sigSum2 = sigSum2 / lineCnt;
        mae = mae / lineCnt;
        printf("MAE: %f\n", mae);
        printf("Original SigSum: %f\n", sigSum);
        printf("Obtained SigSum: %f\n", sigSum2);
        //--------------------------------------------
        myfile2.close();
        
        //Free used memory in ACC cycle
        delete [] fileCoef;// */
        
        //TODO delete rms structure
        for (int i = 0;i < ln+1; i++){
          delete [] rms_vecACC[i];
        }
        delete [] rms_vecACC;
        delete [] tfbACC;
        delete [] inter_vec;
        delete [] sub_vecACC;
        delete [] paramsOUT; 
        delete [] cA;
        delete [] cD;
        delete [] filt;
        delete [] w_vecACC;
      
        wt_free(wt);
               
        //---------------------------------------------------------------------------
        //---------------------------------------------------------------------------

        delete[] x;

    }
    double gtEnd = omp_get_wtime(); //Global time
    double gtDur = gtEnd - gtBeg;
    printf("Global execution time: %f seconds\n", gtDur);                            
       
    return 0;
}

Looks to be a problem with autocorrvACC calling meanvACC. If I change meanvACC to be a “seq” routine, then the MAE is zero.

It’s legal OpenACC to have meanvACC be vector, but we may have a compiler issue due to the reduction. I’ll get it reported to engineering.

-Mat

Hi Mat,

Thank you for all your help, I’m almost finished with this parallelization task, but now I have another bug, which I suspect can be a compiler issue. I wrap all my code in my Github: AMCM/pgi at master · j-castro/AMCM · GitHub to be easier to compile for you. So just download all the sources and files and compile

make pgimexa2

and then run the executable

./pgimexa2

(You may need to recompile RAFAT’s library (libwavelib2.a)).

So, if I disable OpenACC directives it works perfectly and the MAE is equal to zero. If I enable OpenACC directives I have a execution problem:

 ./pgimexa2

<<<Using default file: aud/9manatees.wav>>>

Parallel execution time of OpenACC region #1: 0.229746 seconds
Parallel execution time of OpenACC region #2: 0.069345 seconds
call to cuMemcpyDtoHAsync returned error 700: Illegal address during kernel execution
call to cuMemFreeHost returned error 700: Illegal address during kernel execution

In the third and last stage of the program:

//3- Wavelet thresholding ----------------------------------------------------------------------------------------------------------
#pragma acc data copyin(hpd[:lpd_len],lpd[:lpd_len],rms_vecACC[:ln+1][:nWs]), copyout(y[:xSz]), present(w_vecACC) //**ASYNC COPYIN**          
{    	
    for (int q = 0; q < nBlocks; q++) { //For each block of windows
      int begI = q * bSz * wszINT; //block beggining
      int endI = min(begI + (bSz * wszINT),((nBlocks-1) * bSz * wszINT) + (resWsz * wszINT)); //block end
#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:wsvSz],temp_vecACC3[:svSz],filt[:2*lpd_len],tempX[len_X])        
      for (int i = begI ;i < endI; i = i + wszINT) { //For each window compute the ACF-rms value in the wavelet domain    
	    
        val1 = i*(ln+1);           //added to avoid a pgi compiler's bug
        val2 = (i+wszINT)*(ln+1)-1; //added to avoid a pgi compiler's bug
        
  	    //3.1 Get UDWT coefficients
   		  getSubvACCpgi(w_vecACC, wvSz,val1,val2,paramsOUT, wsvSz);       
  		
        val1 = ln*wszINT;
        val2 = ln*wszINT + wszINT-1;
   	    
         //3.2 Compute Universal threshold
        getSubvACCpgi(paramsOUT, wsvSz,val1,val2,sub_vecACC, svSz); //Obtain highest level coefficients                  	       
        univ = madvACC(sub_vecACC, svSz, temp_vecACC3) * sqrt(2*log(wszNS)); // 0.6745 decide if using this factor
  		
  	    //3.3 Apply soft thresholding on wavelet coefficients
  	    for (int j=0; j < ln+1; j++) { //for each level
  		
  		    //get all wavelet coef. from level j
          #pragma acc loop 
  		    for (int k=0; k < wszINT; k++)
  		      sub_vecACC[k] = paramsOUT[k + (j*wszINT)];
  				
  		    //Determine threshold to apply
  		    if (rms_vecACC[j][i/wszINT] == 0) {//if belongs to class 0
  		      multv2ACC(sub_vecACC,svSz,0); //set all coefficients to zero (infinite threshold)
  		    }
  		    else {
            softThresvACC(sub_vecACC,svSz,univ); //apply soft thresholding rule using universal threshold
  		    }
  
  		    //Copyback all thresholded wavelet coef. from level j to sub_vector
          #pragma acc loop    
  		    for (int k=0; k < wszINT; k++)
  		      paramsOUT[k + (j*wszINT)] = sub_vecACC[k];
  
  	    }		
     
  	    //3.4 Apply Inverse UDWT
        #pragma acc loop   
        for (int j=0; j < svSz; j++) { //Copy first level coefficients to sub_vecACC (Initialization for imodwtACC)
  		    sub_vecACC[j] = paramsOUT[j];
  	    }
  	    
        imodwtACC(len_X,level_N, lpd,lpd_len, hpd, paramsOUT, filt, tempX, sub_vecACC); //Inverse UDWT		
  	    
  	    //concatenate result to output vector
        #pragma acc loop   
  	    for (int j=0; j< svSz;j++) {
  		    y[i+j] = sub_vecACC[j];
  	    }
  	    
  	    lastIdx = i + svSz;
      }
	  }
}
	//-----------------------------------------------------------------------------------------------------------------------------------------------
 #pragma acc exit data delete(w_vecACC)

So is there a compiler bug or I am missing something???

By the way, if I compile it for multicore it works correctly:

./pgimexa2

<<<Using default file: aud/9manatees.wav>>>

Parallel execution time of OpenACC region #1: 0.370766 seconds
Parallel execution time of OpenACC region #2: 0.078671 seconds
Parallel execution time of OpenACC region #3: 0.296613 seconds
MAE: 0.000000
Original SigSum: 0.004336
Obtained SigSum: 0.004336
Global execution time: 2.152441 seconds

I was able to recreate the illegal address error in 17.10 but it seems to go away in 18.1. 18.1’s MAE is a bit off but this might be an issue with the program since if I use a vector_length==1, then the MAE is zero. I’m a bit swamped right now, but I’ll try to find some time to track down why using vector parallelism the code gets wrong answers.

On a side note, I ran your code through valgrind and noticed that you have an out-of-bounds read access in vec.cpp at lines 1225 and 1127. The value of “i” can be zero here so accessing “f.v[i-1]” is off the beginning of the array. I highly doubt it’s causing the MAE wrong answers, but I thought I would point it out.

                         x0 = f.v[i-1];  // line 1225
                        x1 = f.v[i];
                        y0 = g.v[i-1];
                        y1 = g.v[i];
                        y.v[a] = y0 + (((x.v[a]- x0)*y1 -(x.v[a] -x0)*y0)/(x1-x0));//Formula de interpolacion lineal

-Mat

Thank you Mat, I solved the issue with the MAE a bit off by adjusting the value of variable “lastIdx”, so I already update pgimexa.2.cpp on the Github with this change. Also thank you for your note on vec.cpp! I hope you can help me ass soon as you can by tracking the illegal address error but of course I understand you’re busy.

Hey Mat, did you have a chance to look at the execution error?

I did but wasn’t able to determine the root cause. Though since the error no longer occurs in 18.1, I assume it was some compiler issue that has since been fixed.

Were you able to try using 18.1?

-Mat

I don’t have access to compiler 18.1 since the latest community edition version is 17.1. Is there any way to have at least a trial version of 18.1 to test it?