Parallelizing stationary wavelet transform using OpenACC

Hi,

I modified a function from the wavelib library GitHub - rafat/wavelib: C Implementation of 1D and 2D Wavelet Transforms (DWT,SWT and MODWT) along with 1D Wavelet packet Transform and 1D Continuous Wavelet Transform. to parallelize the 1D Maximal Overlap Discrete Wavelet Transform (modwt) using OpenACC directives (I named it modwtACC2()).
The parallelized method works fine when I compile the code for a multicore (I’m using a Quadcore Intel(R) Xeon(R) CPU E3-1225 v5). Also it works fine if I compile it for a tesla k40c but only if I use a vector_length or num_workers of 32 or less, otherwise it produces wrong results (incorrect wavelet coefficients). However, there is no problem with the num_gangs value, it can be increased without producing wrong values, but as long as the vector_length and num_workers product is 32 or less, otherwise it produces wrong values again. The code region I’m parallelizing is embarrassingly parallel and the error is also consistent between multiple executions with the same parameters.

I’m using:

compiler: pgc++ 17.4-0 64-bit target on x86-64 Linux -tp haswell

CUDA Version 9.0.176

Accelerator: Tesla k40c

For simplicity and to reproduce the minimal error I pasted almost all code in a single source file called mexa.cpp:

/*
 * Test file for modwt
*/

#include <omp.h>
#include <sndfile.h>
#include <iostream>
#include "wavelib/wavelib.h" //wavelib RAFAT - C wavelet library
#include <fstream>

using namespace std;

// return the value of the maximum value of the double * in
double maxvvACC(double* in, int sz) {
    int i;
    int index = 0;
    double val = in[0];
    for (i = 1; i < sz; i++) {
        if (in[i] > val) {
            val = in[i];
            index = i;
        }
    }
    return in[index];
}

// return the value of the minimum value of the double * in
double minvvACC(double* in, int sz) {
    int i;
    int index = 0;
    double val = in[0];
    for (i = 1; i < sz; i++) {
        if (in[i] < val) {
            val = in[i];
            index = i;
        }
    }
    return in[index];
}

// Return the mean of double array "in"
double meanvACC(double* in, int sz) {
    
    double sum = 0;
    for(int i = 0;i < sz; i++) {
	sum = sum + in[i];
    }

    return sum / sz;
}

// Return the mean of int array "in"
double meanvACCint(int* in, int sz) {
    
    double sum = 0;
    for(int i = 0;i < sz; i++) {
	sum = sum + in[i];
    }

    return sum / sz;
}

//Get a sub-vector of array "x" from index "i1" to "i2"
//and store it in array "y"
void getSubvACC(double* x, int szX, int i1, int i2, double *y, int szY){

    if ((i2 >= i1) && (i2 < szX) && (i1 >= 0) && ((i2-i1+1) <= szY)) { //Parameters check

        for(int i = i1; i <= i2; i++){
            y[i-i1] = x[i];
        }
    }
}

/*OpenACC version of modwt() from wavelib library */
#pragma acc routine seq
void modwtACC2(int len_X, int level_N, int len_filt, double * lpd, int lpd_len, double * hpd, double *inp, double * cA, double * cD, double * filt,double * paramsOUT) {
	
  int i;
  int J;
  int temp_len;
  int iter;
  int M;
  int lenacc;

	temp_len = len_X;
	J = level_N;
	M = 1;

	for (i = 0; i < temp_len; i++) {
		paramsOUT[i] = inp[i];
	}

  lenacc = (J + 1) * temp_len; //levLens[J + 1];
	for (iter = 0; iter < J; iter++) {
		lenacc = lenacc - temp_len;
		if (iter > 0) {
			M = 2 * M;
		}
    int l;
    int i2;
    int t;
    double s;

    s = sqrt(2.0);
	  for (i2 = 0; i2 < lpd_len; i2++) {
		  filt[i2] = double(lpd[i2] / s);
		  filt[lpd_len + i2] = double(hpd[i2]  / s);
    }

	  for (i2 = 0; i2 < temp_len; i2++) {
		  t = i2;
		  cA[i2] = filt[0] * paramsOUT[t]; //TODO  RESTORE!
		  cD[i2] = filt[lpd_len] * paramsOUT[t]; // TODO RESTORE!
      for (l = 1; l < lpd_len; l++) {
			  t = t - M;
			  while (t >= temp_len) {
				  t = t - temp_len;
			  }
			  while (t < 0) {
				  t = t + temp_len;
			  }

			  cA[i2] =  cA[i2] + filt[l] * paramsOUT[t];
			  cD[i2] =  cD[i2] + filt[lpd_len + l] * paramsOUT[t];

      }
    }

    for (i = 0; i < temp_len; i++) {
		    paramsOUT[i] = cA[i];
		    paramsOUT[lenacc + i] = cD[i];
		}
	}

}

int main (int argc, char *argv[]) {

   //Variables***************************************************
    double * x;                   	//noisy signal
    int xSz;                  //size of variable x
    double fs;                 	//sample frequency (Hz)
    double fc = 2000;          	//cut-off frequency (Hz)
    double wsz = 3;             //window size (milliseconds)
    int ln = 4;               	//number of levels        
    double startP, endP, durationP; //time 
    //***********************************************************
    

    //Open a WAV---------------------------------------------------------------------
    FILE* wavf;
    SF_INFO info;
    char* fileName;
    
    if (argc == 2) {
        fileName = argv[1];
	printf("\n<<<Opening file: %s>>>\n\n",argv[1]);
    }
    else {
        fileName = "aud/9manatees.wav";
        printf("\n<<<Using default file: aud/9manatees.wav>>>\n\n");
    }

    wavf = fopen(fileName, "r");

    //-------------------------------------------------------------------------------
    if(wavf != NULL){   //Succesful wav read

        SNDFILE* source = sf_open_fd(fileno(wavf), SFM_READ, &info, 1);	

        fs = info.samplerate;
        xSz = info.frames;
        x = new double[xSz];
        sf_read_double(source, x, xSz);
        sf_close(source);

	      fclose(wavf);

        //wavelet call----------------------------------------------------------------
        //----------------------------------------------------------------------------
        
        printf("\nExecuting parallel wavelet... \n\n");		
	      //Process constants-------------------
        wave_object family = wave_init("db8");  //wavelet family
	      double wszNS = llround(wsz*0.001*fs);  	//window size in # samples
        int wszINT = wszNS;
        //--------------------------------------
        
       	//Variables-----------------------------------------------------------
     		double * inter_vec = new double[xSz];	//intermediate output vector
        double dt = 1. / fs;		//time step in seconds
      	double lastIdx =0;		//last index of processed portion of the signal
      	wt_object wt = wt_init(family,"modwt", wszNS, ln); //undecimated wavelet transform object
      
      	//DEBUG - Print Process' constants ---------------------------------
      	printf("\nProcess constants ----------------- \n");
        printf("Signal length: %d\n", xSz);
      	printf("wszNS: %g samples\n", wszNS);
      	printf("----------------------------------- \n\n");
      	//--------------------------------------------------------------------	     	            
       
      	//Ask for all memory first
      
      	//Use double pointers instead of structs of type vec
      	double * restrict sub_vecACC = new double[wszINT]; //a temporal sub-vector of the entire signal      
      	double * restrict inter_vecACC = x; //a pointer to the original input
        double * restrict w_vecACC = new double[xSz*(ln+1)]; //UDWT of the entire signal
        
        for (int yy = 0; yy < xSz*(ln+1); yy++)
           w_vecACC[yy] = 0;
         	
      	int svSz = wszNS;
      	int wvSz = xSz*(ln+1);
       	int ivSz = xSz;

        //Input parameters of modwtACC2:---------------------------------
        int len_X = wt->siglength;          //Length of input signal X
        int level_N = wt->J;                //Number of levels
        int len_filt = wt->wave->filtlength;//Wave-filter length     
        int * levLens = wt->length;         //Length of each level
        double * lpd = wt->wave->lpd;       //Low Pass Details   
        int lpd_len = wt->wave->lpd_len;    //lpd length     
        double * hpd = wt->wave->hpd;       //High Pass Details   
        //Dynamic memory inputs: allocate and free memory outside ACC-loop 
        double * cA = new double[len_X]; //free(cA);
        double * cD = new double[len_X]; //free(cD);
        double * filt = new double[2 * lpd_len]; //free(filt)
        //Output parameters:
        int outLength = wszINT*(ln+1);
        double * paramsOUT = new double[outLength];        


        //----------------------------------------------------------------
        printf("\nBEFORE ACC REGION ----------\n");
        printf("lpd length: %d\n", lpd_len);
        printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
        printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
        printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
        printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
        
        //Set levLens before cycle-------------------------
        levLens[level_N +1]= (level_N +1)*len_X;
        for (int i = 0; i < level_N+1; i++) {
            levLens[i] = len_X;
        }   
        //--------------------------------------------------       
        printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));        
        //----------------------------------------------------------------------------

        //DEBUG - READ ORIGINAL WAVELET COEFFICIENTS----------------------------------------
        ifstream myFile; //DEBUG CODE
	      myFile.open ("xc.txt"); //DEBUG CODE
        int lineCnt = 0;
        string line;
        while (getline(myFile,line))
            ++lineCnt;
        myFile.clear();
        myFile.seekg(0,ios::beg);

        //Read expected coefficients
        double * fileCoef = new double[lineCnt];
        for (int i=0; i <  lineCnt; i++) {
             myFile >>  fileCoef[i]; //DEBUG CODE
        }
	      myFile.close(); //DEBUG CODE

        //----------------------------------------------------------------------------------
        // DEBUG - WRITE WAVELET COEFFICIENTS-----------------------------------------------
        	ofstream myfile2;
          myfile2.open ("mexa.txt"); //DEBUG CODE
        startP = omp_get_wtime();  
        //----------------------------------------------------------------------------------

//OpenACC region	
#pragma acc data copyin(inter_vecACC[:ivSz],hpd[:lpd_len],lpd[:lpd_len]), copyout(w_vecACC[:wvSz]), create(sub_vecACC[:svSz],paramsOUT[:outLength],cA[:len_X],cD[:len_X],filt[:2*lpd_len])
      	{	    
#pragma acc parallel loop private(sub_vecACC[:svSz],paramsOUT[:outLength],cA[:len_X],cD[:len_X],filt[:2*lpd_len]), num_gangs(512), num_workers(1), vector_length(32)
          //2.1 Loop for computing the ACF-rms value in the wavelet domain
          for (int i = 0 ;i <  int(ivSz - wszNS); i = i + int(wszNS)) {
      		
            //Get subvector	(window)	
        		getSubvACC(inter_vecACC, ivSz,i,i + wszNS -1,sub_vecACC, svSz);
  
        		//Apply UDWT to window
            modwtACC2(len_X,level_N,len_filt,lpd,lpd_len,hpd,sub_vecACC,cA,cD,filt,paramsOUT); 
                            
        		//Store UDWT coefficients
            #pragma acc loop
            for (int j=0; j < outLength; j++) {	
  	    	    w_vecACC[i*(ln+1) + j] = paramsOUT[j];
   	        }
      				
     	    }
        }
       	endP = omp_get_wtime();
 	      durationP = endP - startP;
        
        printf("\nAFTER ACC REGION ----------\n");
        printf("Parallel execution time: %f seconds\n", durationP);                            
        printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
        printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
        printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
        printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
        printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));            

        printf("w_vecACC max value is: %f\n", maxvvACC(w_vecACC, wvSz));
        printf("w_vecACC min value is: %f\n", minvvACC(w_vecACC, wvSz));	    
        printf("w_vecACC mean value is: %f\n", meanvACC(w_vecACC, wvSz));
        
        //Compare wavelet coefficients----------------
        double mae = 0;
        double sigSum = 0;
        double sigSum2 = 0;
        for (int i = 0; i < lineCnt; i++) {
           myfile2 << w_vecACC[i] << "\n"; 
           mae += fabs(fileCoef[i] - w_vecACC[i]);
           sigSum += fabs(fileCoef[i]);
           sigSum2 += fabs(w_vecACC[i]);
        }
        sigSum = sigSum / lineCnt;
        sigSum2 = sigSum2 / lineCnt;
        mae = mae / lineCnt;
        printf("MAE: %f\n", mae);
        printf("Original SigSum: %f\n", sigSum);
        printf("Obtained SigSum: %f\n", sigSum2);
        //--------------------------------------------
        myfile2.close();
        
        //Free used memory in ACC cycle
        delete [] fileCoef;
        delete [] sub_vecACC;
        delete [] paramsOUT; 
        delete [] cA;
        delete [] cD;
        delete [] filt;
        delete [] w_vecACC;
      
        wt_free(wt);
               
        //---------------------------------------------------------------------------
        //---------------------------------------------------------------------------

        delete[] x;

    }

    return 0;

}

In this code I’m also reading the original wavelet coefficients from a file and writing the obtained coefficients in another file. The reference coefficients can be obtained executing the program sequentially or in parallel but following the restrictions in the number of workers and size of the vector I said before. So, I really want to know why this error is happening, because even if I change the pragma to a “kernel loop independent” to let the compiler decides the size, it chooses 128 vector-length which also produces erroneous wavelet coefficients. By the way, I’m compiling the program with this line:

pgc++ -lsndfile -lfftw3 -fast -ta=tesla -Minfo=accel -g mexa.cpp -o testACC libwavelib.a

where libwavelib2.a correspond to the built static library from Rafat’s code.

Hi jcastro9999,

I’m trying to reproduce the error but keep getting a runtime when the code calls “wt_init”.

To get the code to build, I needed to comment out “butter.h” and the git repository you have only builds a “libwavelib.a” file. Not sure how to get the “libwavelib2.a” library.

Finally, I’m just using a generic “.wav” file so don’t know if it’s the correct format for your program.

Here’s an example of the error. It’s the same if I use pgc++ or g++ without OpenACC.

 Error - The Signal Can only be iterated -2147483648 times using this wavelet. Exiting

Once I can get this error resolved, I’ll take a look at the wrong answers with OpenACC when using more than 32 vectors.

-Mat

Thank you Mat!

Yes please comment out butter.h and use libwavelib.a.

(Previously, I used butter.h to apply a high-pass filter before processing the signal but I ignore this part in this minimal example because it is not the cause of the error. I also previously modified some functions in the wavelib library to use them with OpenACC directives, but here those functions are already present in the source mexa.cpp, so don’t worry I’ve already compiled this code using the original library: libwavelib.a and the results are the same)

Maybe it is important to mention how I compiled the wavelib library, I create the following Makefile:

#Variables--------------------------------------------------------|
SRCS = *.c
OBJS = $(SRCS:.c=.o)
CC = g++				#C++ compiler
CFLAGS = -c -fopenmp			#Compiler flags
LFLAGS = -lsndfile -fopenmp -lfftw3	#linker flags
#-----------------------------------------------------------------|

#Build using g++--------------------------------------------------|
wavelib: $(SRCS)
	$(CC) $(CFLAGS) $(SRCS)  
#-----------------------------------------------------------------|

#Build using OpenACC compiler and target-specific variable values-|
wavelibACC: CC = pgc++
wavelibACC: CFLAGS = -c -fast -ta=tesla -Minfo=accel
wavelibACC: LFLAGS = -lsndfile -lfftw3 -ta=tesla

wavelibACC: $(SRCS)
	$(CC) $(CFLAGS) $(SRCS)
#-----------------------------------------------------------------|
staticlib:
	ar rcs libwavelib.a *.o
#-----------------------------------------------------------------|

clean: 
	rm -f *.o *.gch

And compile it with pgc++ by using:

make wavelibACC
make staticlib

Then I use the produced libwavelib.a. Finally the wav I use it is a small recording (about 19 seconds) of manatee vocalizations, sampled at 96 kHz, but any wav file not too small or too large should work. So maybe that error is because the audio file is too long, but I’m only guessing.

Still no luck. I’ve tried a variety of .wav files including the samples generated by libsndfile’s example program so don’t think the problem is there.

I’ll try again tomorrow and dig into wavelib’s init routine to see what the error is about.

I believe I know why are you having the error, it is because you are using
wav samples at a different sample frequency, try using wavs sampled at 96 kHz, for example, the chirp tones available in this web page:

The problem is that the method requires a minimum window size in samples (wszNS) in order to calculate all desired levels (ln) of the wavelet transform (modwt()), and wszNS depends on sample frequency (fs).

Thanks! That file worked.

While I’m not sure why the illegal memory address error occurs, I was able to find a good work around by making the routine calls “vector”. This is something you’d want to do anyway since it’s significantly faster that having them run sequentially.

Note that I’m getting NaNs for MAE and SigSum but these occur with the CPU version as well so I ignored them.

% cat mexa.2.cpp
/*
  * Test file for modwt
 */

 #include <omp.h>
 #include <sndfile.h>
// #include <iostream>
 #include "wavelib.h" //wavelib RAFAT - C wavelet library
 #include <fstream>

 using namespace std;

 // return the value of the maximum value of the double * in
 double maxvvACC(double* in, int sz) {
     int i;
     int index = 0;
     double val = in[0];
     for (i = 1; i < sz; i++) {
         if (in[i] > val) {
             val = in[i];
             index = i;
         }
     }
     return in[index];
 }

 // return the value of the minimum value of the double * in
 double minvvACC(double* in, int sz) {
     int i;
     int index = 0;
     double val = in[0];
     for (i = 1; i < sz; i++) {
         if (in[i] < val) {
             val = in[i];
             index = i;
         }
     }
     return in[index];
 }

 // Return the mean of double array "in"
 double meanvACC(double* in, int sz) {

     double sum = 0;
     for(int i = 0;i < sz; i++) {
    sum = sum + in[i];
     }

     return sum / sz;
 }

 // Return the mean of int array "in"
 double meanvACCint(int* in, int sz) {

     double sum = 0;
     for(int i = 0;i < sz; i++) {
    sum = sum + in[i];
     }

     return sum / sz;
 }

 //Get a sub-vector of array "x" from index "i1" to "i2"
 //and store it in array "y"
#pragma acc routine vector
 void getSubvACC(double* x, int szX, int i1, int i2, double *y, int szY){

     if ((i2 >= i1) && (i2 < szX) && (i1 >= 0) && ((i2-i1+1) <= szY)) { //Parameters check
#pragma acc loop vector
         for(int i = i1; i <= i2; i++){
             y[i-i1] = x[i];
         }
     }
 }

 /*OpenACC version of modwt() from wavelib library */
 #pragma acc routine vector
 void modwtACC2(int len_X, int level_N, int len_filt, double * lpd, int lpd_len, double * hpd, double *inp, double * cA, double * cD, double * filt,double * paramsOUT) {

   int i;
   int J;
   int temp_len;
   int iter;
   int M;
   int lenacc;

    temp_len = len_X;
    J = level_N;
    M = 1;

#pragma acc loop vector
    for (i = 0; i < temp_len; i++) {
       paramsOUT[i] = inp[i];
    }

   lenacc = (J + 1) * temp_len; //levLens[J + 1];
    for (iter = 0; iter < J; iter++) {
       lenacc = lenacc - temp_len;
       if (iter > 0) {
          M = 2 * M;
       }
     int l;
     int i2;
     int t;
     double s;

     s = sqrt(2.0);
#pragma acc loop vector
      for (i2 = 0; i2 < lpd_len; i2++) {
         filt[i2] = double(lpd[i2] / s);
         filt[lpd_len + i2] = double(hpd[i2]  / s);
     }

#pragma acc loop vector
      for (i2 = 0; i2 < temp_len; i2++) {
         t = i2;
         cA[i2] = filt[0] * paramsOUT[t]; //TODO  RESTORE!
         cD[i2] = filt[lpd_len] * paramsOUT[t]; // TODO RESTORE!
       for (l = 1; l < lpd_len; l++) {
            t = t - M;
            while (t >= temp_len) {
               t = t - temp_len;
            }
            while (t < 0) {
               t = t + temp_len;
            }

            cA[i2] =  cA[i2] + filt[l] * paramsOUT[t];
            cD[i2] =  cD[i2] + filt[lpd_len + l] * paramsOUT[t];

       }
     }

#pragma acc loop vector
     for (i = 0; i < temp_len; i++) {
           paramsOUT[i] = cA[i];
           paramsOUT[lenacc + i] = cD[i];
       }
    }

 }

 int main (int argc, char *argv[]) {

    //Variables***************************************************
     double * x;                      //noisy signal
     int xSz;                  //size of variable x
     double fs;                    //sample frequency (Hz)
     double fc = 2000;             //cut-off frequency (Hz)
     double wsz = 3;             //window size (milliseconds)
     int ln = 4;                  //number of levels
     double startP, endP, durationP; //time
     //***********************************************************


     //Open a WAV---------------------------------------------------------------------
     FILE* wavf;
     SF_INFO info;
     char* fileName;

     if (argc == 2) {
         fileName = argv[1];
    printf("\n<<<Opening file: %s>>>\n\n",argv[1]);
     }
     else {
         fileName = "aud/9manatees.wav";
         printf("\n<<<Using default file: aud/9manatees.wav>>>\n\n");
     }

     wavf = fopen(fileName, "r");

     //-------------------------------------------------------------------------------
     if(wavf != NULL){   //Succesful wav read

         SNDFILE* source = sf_open_fd(fileno(wavf), SFM_READ, &info, 1);

         fs = info.samplerate;
         xSz = info.frames;
         x = new double[xSz];
         sf_read_double(source, x, xSz);
         sf_close(source);

          fclose(wavf);

         //wavelet call----------------------------------------------------------------
         //----------------------------------------------------------------------------

         printf("\nExecuting parallel wavelet... \n\n");
          //Process constants-------------------
         wave_object family = wave_init("db8");  //wavelet family
          double wszNS = llround(wsz*0.001*fs);     //window size in # samples
         int wszINT = wszNS;
         //--------------------------------------

           //Variables-----------------------------------------------------------
           double * inter_vec = new double[xSz];   //intermediate output vector
         double dt = 1. / fs;      //time step in seconds
          double lastIdx =0;      //last index of processed portion of the signal
          wt_object wt = wt_init(family,"modwt", wszNS, ln); //undecimated wavelet transform object

          //DEBUG - Print Process' constants ---------------------------------
          printf("\nProcess constants ----------------- \n");
          printf("Signal length: %d\n", xSz);
          printf("wszNS: %g samples\n", wszNS);
          printf("----------------------------------- \n\n");
          //--------------------------------------------------------------------

          //Ask for all memory first

          //Use double pointers instead of structs of type vec
          double * restrict sub_vecACC = new double[wszINT]; //a temporal sub-vector of the entire signal
          double * restrict inter_vecACC = x; //a pointer to the original input
         double * restrict w_vecACC = new double[xSz*(ln+1)]; //UDWT of the entire signal

         for (int yy = 0; yy < xSz*(ln+1); yy++)
            w_vecACC[yy] = 0;

          int svSz = wszNS;
          int wvSz = xSz*(ln+1);
           int ivSz = xSz;

         //Input parameters of modwtACC2:---------------------------------
        int len_X = wt->siglength;          //Length of input signal X
         int level_N = wt->J;                //Number of levels
         int len_filt = wt->wave->filtlength;//Wave-filter length
         int * levLens = wt->length;         //Length of each level
         double * lpd = wt->wave->lpd;       //Low Pass Details
         int lpd_len = wt->wave->lpd_len;    //lpd length
         double * hpd = wt->wave->hpd;       //High Pass Details
         //Dynamic memory inputs: allocate and free memory outside ACC-loop
         double * cA = new double[len_X]; //free(cA);
         double * cD = new double[len_X]; //free(cD);
         double * filt = new double[2 * lpd_len]; //free(filt)
         //Output parameters:
         int outLength = wszINT*(ln+1);
         double * paramsOUT = new double[outLength];


         //----------------------------------------------------------------
         printf("\nBEFORE ACC REGION ----------\n");
         printf("lpd length: %d\n", lpd_len);
         printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
         printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
         printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
         printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));

         //Set levLens before cycle-------------------------
         levLens[level_N +1]= (level_N +1)*len_X;
         for (int i = 0; i < level_N+1; i++) {
             levLens[i] = len_X;
         }
         //--------------------------------------------------
         printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));
         //----------------------------------------------------------------------------

         //DEBUG - READ ORIGINAL WAVELET COEFFICIENTS----------------------------------------
         ifstream myFile; //DEBUG CODE
          myFile.open ("xc.txt"); //DEBUG CODE
         int lineCnt = 0;
         string line;
         while (getline(myFile,line))
             ++lineCnt;
         myFile.clear();
         myFile.seekg(0,ios::beg);

         //Read expected coefficients
         double * fileCoef = new double[lineCnt];
         for (int i=0; i <  lineCnt; i++) {
              myFile >>  fileCoef[i]; //DEBUG CODE
         }
          myFile.close(); //DEBUG CODE

         //----------------------------------------------------------------------------------
         // DEBUG - WRITE WAVELET COEFFICIENTS-----------------------------------------------
            ofstream myfile2;
           myfile2.open ("mexa.txt"); //DEBUG CODE
         startP = omp_get_wtime();
         //----------------------------------------------------------------------------------

         printf("Sizes: svSzr=%d outLength=%d len_X=%d lpd_len=%d\n",svSz,outLength,len_X,lpd_len);
         printf("sub_vecACC=%ld paramsOUT=%ld cA=%ld cD=%ld filt=%ld\n", svSz*8,outLength*8,len_X*8,len_X*8,2*lpd_len*8);

 //OpenACC region
 #pragma acc data copyin(inter_vecACC[:ivSz],hpd[:lpd_len],lpd[:lpd_len]), copyout(w_vecACC[:wvSz]), create(sub_vecACC[:svSz],paramsOUT[:outLength],cA[:len_X],cD[:len_X],filt[:2*lpd_len])
          {
 #pragma acc parallel loop private(sub_vecACC[:svSz],paramsOUT[:outLength],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) 
           //2.1 Loop for computing the ACF-rms value in the wavelet domain
           for (int i = 0 ;i <  int(ivSz - wszNS); i = i + int(wszNS)) {

             //Get subvector   (window)
             getSubvACC(inter_vecACC, ivSz,i,i + wszNS -1,sub_vecACC, svSz);

             //Apply UDWT to window
             modwtACC2(len_X,level_N,len_filt,lpd,lpd_len,hpd,sub_vecACC,cA,cD,filt,paramsOUT);

               //Store UDWT coefficients
             #pragma acc loop
             for (int j=0; j < outLength; j++) {
                 w_vecACC[i*(ln+1) + j] = paramsOUT[j];
               }

             }
         }
           endP = omp_get_wtime();
           durationP = endP - startP;

         printf("\nAFTER ACC REGION ----------\n");
         printf("Parallel execution time: %f seconds\n", durationP);
         printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
         printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
         printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
         printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
         printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));

         printf("w_vecACC max value is: %f\n", maxvvACC(w_vecACC, wvSz));
         printf("w_vecACC min value is: %f\n", minvvACC(w_vecACC, wvSz));
         printf("w_vecACC mean value is: %f\n", meanvACC(w_vecACC, wvSz));

         //Compare wavelet coefficients----------------
         double mae = 0;
         double sigSum = 0;
         double sigSum2 = 0;
         for (int i = 0; i < lineCnt; i++) {
            myfile2 << w_vecACC[i] << "\n";
            mae += fabs(fileCoef[i] - w_vecACC[i]);
            sigSum += fabs(fileCoef[i]);
            sigSum2 += fabs(w_vecACC[i]);
         }
         sigSum = sigSum / lineCnt;
         sigSum2 = sigSum2 / lineCnt;
         mae = mae / lineCnt;
         printf("MAE: %f\n", mae);
         printf("Original SigSum: %f\n", sigSum);
         printf("Obtained SigSum: %f\n", sigSum2);
         //--------------------------------------------
         myfile2.close();

         //Free used memory in ACC cycle
         delete [] fileCoef;
         delete [] sub_vecACC;
         delete [] paramsOUT;
         delete [] cA;
         delete [] cD;
         delete [] filt;
         delete [] w_vecACC;

         wt_free(wt);

         //---------------------------------------------------------------------------
         //---------------------------------------------------------------------------

         delete[] x;

     }

     return 0;

 }
% pgc++ -fast mexa.2.cpp -o testACC -I /local/home/colgrove/wavelib/libsndfile/include/ -L/local/home/colgrove/wavelib/libsndfile/lib -I../../src -L/proj/pgi/linux86-64/2017/fftw/fftw-3.3.5/openmpi-2.1.2/lib -w -lsndfile -lfftw3 libwavelib.a -ta=tesla:cc60 -Minfo=accel
mexa.2.cpp:
getSubvACC(double *, int, int, int, double *, int):
     66, Generating Tesla code
         70, #pragma acc loop vector /* threadIdx.x */
     70, Loop is parallelizable
modwtACC2(int, int, int, double *, int, double *, double *, double *, double *, double *, double *):
     78, Generating Tesla code
         92, #pragma acc loop vector /* threadIdx.x */
         97, #pragma acc loop seq
        109, #pragma acc loop vector /* threadIdx.x */
        115, #pragma acc loop vector /* threadIdx.x */
        119, #pragma acc loop seq
        121, #pragma acc loop seq
        124, #pragma acc loop seq
        135, #pragma acc loop vector /* threadIdx.x */
     92, Loop is parallelizable
     97, Loop carried scalar dependence for M at line 100,120,78
    109, Loop is parallelizable
    115, Loop is parallelizable
    119, Loop carried scalar dependence for t at line 120,78
         Complex loop carried dependence of paramsOUT->,filt->,cA->,cD-> prevents parallelization
         Loop carried reuse of cD->,cA-> prevents parallelization
    122, Accelerator restriction: induction variable live-out from loop: t
    123, Accelerator restriction: induction variable live-out from loop: t
    126, Accelerator restriction: induction variable live-out from loop: t
    135, Loop is parallelizable
main:
    285, Generating copyin(hpd[:lpd_len])
         Generating create(filt[:lpd_len*2])
         Generating copyin(inter_vecACC[:ivSz])
         Generating copyout(w_vecACC[:wvSz])
         Generating create(sub_vecACC[:svSz],paramsOUT[:outLength])
         Generating copyin(lpd[:lpd_len])
         Generating create(cA[:len_X],cD[:len_X])
         Accelerator kernel generated
         Generating Tesla code
        288, #pragma acc loop gang /* blockIdx.x */
        298, #pragma acc loop vector(32) /* threadIdx.x */
    298, Loop is parallelizable
% testACC audiocheck.net_hdsweep_1Hz_48000Hz_-3dBFS_30s.wav                                                   
<<<Opening file: audiocheck.net_hdsweep_1Hz_48000Hz_-3dBFS_30s.wav>>>


Executing parallel wavelet...


Process constants -----------------
Signal length: 2880001
wszNS: 288 samples
-----------------------------------


BEFORE ACC REGION ----------
lpd length: 16
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
Sizes: svSzr=288 outLength=1440 len_X=288 lpd_len=16
sub_vecACC=2304 paramsOUT=11520 cA=2304 cD=2304 filt=256

AFTER ACC REGION ----------
Parallel execution time: 0.651188 seconds
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
w_vecACC max value is: 0.937597
w_vecACC min value is: -0.937109
w_vecACC mean value is: 0.000041
MAE: -nan
Original SigSum: -nan
Obtained SigSum: -nan

Great! I got exactly the same values:

w_vecACC max value is: 0.937597
w_vecACC min value is: -0.937109
w_vecACC mean value is: 0.000041

Well I think that vectorizing modwtACC() and getsubvACC it’s a good idea, but, the problem arises when I use a vector_length or number of workers bigger than 32 in the main loop, for example:

#pragma acc parallel loop private(sub_vecACC[:svSz],paramsOUT[:outLength],cA[:len_X],cD[:len_X],filt[:2*lpd_len]), num_gangs(1), num_workers(1), vector_length(128)
          //2.1 Loop for computing the ACF-rms value in the wavelet domain
          for (int i = 0 ;i <  int(ivSz - wszNS); i = i + int(wszNS)) {
      		
            //Get subvector	(window)	
        		getSubvACC(inter_vecACC, ivSz,i,i + wszNS -1,sub_vecACC, svSz);
  
        		//Apply UDWT to window
            modwtACC2(len_X,level_N,len_filt,lpd,lpd_len,hpd,sub_vecACC,cA,cD,filt,paramsOUT);
                            
        		//Store UDWT coefficients
            #pragma acc loop
            for (int j=0; j < outLength; j++) {	
  	    	    w_vecACC[i*(ln+1) + j] = paramsOUT[j];
   	        }
      				
     	    }
        }

In fact, if I do not vectorize modwtACC() and getsubvACC(), the default behavior (if I don’t specify the number of gangs, workers, and vector length) is to make the vector length equal to 128. The problem is that this produces very different results from the previous ones. Therefore, if I set the number of workers or vector length major than 32 I get erroneous results. You can check all of this by first saving the original coefficients (the w_vecACC vector values when using less than 32 workers or vector length) in a file with the code I already provided before ACC region (just change the file name of myfile2 to xc.txt):

// DEBUG - WRITE WAVELET COEFFICIENTS----------------------------------------------- 
  ofstream myfile2; 
  myfile2.open ("xc.txt"); //DEBUG CODE 
  startP = omp_get_wtime();  
//----------------------------------------------------------------------------------

and after ACC region:

for (int i = 0; i < lineCnt; i++) { 
           myfile2 << w_vecACC[i] << "\n"; 
}

And then, re-run the code with a number of workers or vector length major than 32 and compare it with the previous results by changing myfile and myfile2 names:


//DEBUG - READ ORIGINAL WAVELET COEFFICIENTS----------------------------------------
        ifstream myFile; //DEBUG CODE
	      myFile.open ("xc.txt"); //DEBUG CODE
        int lineCnt = 0;
        string line;
        while (getline(myFile,line))
            ++lineCnt;
        myFile.clear();
        myFile.seekg(0,ios::beg);

        double * fileCoef = new double[lineCnt];
        for (int i=0; i <  lineCnt; i++) {
             myFile >>  fileCoef[i]; //DEBUG CODE
        }
	      myFile.close(); //DEBUG CODE
        
        //----------------------------------------------------------------------------------
        // DEBUG - WRITE WAVELET COEFFICIENTS-----------------------------------------------
        	ofstream myfile2;
          myfile2.open ("xc2.txt"); //DEBUG CODE
          startP = omp_get_wtime();  
        //----------------------------------------------------------------------------------

All the code block with title: “READ ORIGINAL WAVELET COEFFICIENTS” should be commented the first time you save the coefficients in a file and also the lines in the For loop after the ACC region that use the coefficients red from myfile (fileCoef vector):


        //Compare wavelet coefficients----------------
        //double mae = 0;
        //double sigSum = 0;
        double sigSum2 = 0;
        for (int i = 0; i < wvSz; i++) {
           myfile2 << w_vecACC[i] << "\n"; 
           //mae += fabs(fileCoef[i] - w_vecACC[i]);
          // sigSum += fabs(fileCoef[i]);
           sigSum2 += fabs(w_vecACC[i]);
        }
        //sigSum = sigSum / lineCnt;
        //sigSum2 = sigSum2 / lineCnt;
        //mae = mae / lineCnt;
        //printf("MAE: %f\n", mae);
       // printf("Original SigSum: %f\n", sigSum);
        //printf("Obtained SigSum: %f\n", sigSum2);
        //--------------------------------------------
        myfile2.close();
        
        //Free used memory in ACC cycle
        //delete [] fileCoef;

Those commented lines compare the median absolute error (MAE) of the first coefficients (run with workers and vector_length minor than 32) and the last ones (run with workers or vector_length major than 32). This MAE value should be zero but it is not!

Hi,

I apply your modifications to my code and the behavior of the error change a little bit:

1- Now in the openACC region the default vector_length is 32 so it works fine, but If I try to make larger the vector_length, the compiler ignores that and set the vector_length to 32 instead.

2- Again if I try to use worker’s parallelism and the product of num_workers x vector_length is major than 32, erroneous results are obtained in w_vecACC

Hope you can check this out, thank you in advance!

1- Now in the openACC region the default vector_length is 32 so it works fine, but If I try to make larger the vector_length, the compiler ignores that and set the vector_length to 32 instead.

Correct, this is the expected behavior. By default, the compiler only uses 32 vectors when there’s a call to a vector routine. Otherwise we have to add extra syncthread calls in the routine which often slow the code down.

There’s a hidden option, “-ta=tesla:gvmode” which will reverse this and use 128 vectors. When I try this, the code does slow down as expected but still gets correct answers.

2- Again if I try to use worker’s parallelism and the product of num_workers x vector_length is major than 32, erroneous results are obtained in w_vecACC

Where are you putting “worker”? Just at the loop or did you change the routines to be “worker” instead of “vector”? And if you did use worker routines, which loop did you apply to worker and which to vector?

Ok thank you for the reply, I still have this two problems:

1 - You said:

There’s a hidden option, “-ta=tesla:gvmode” which will reverse this and use 128 vectors. When I try this, the code does slow down as expected but still gets correct answers

That’s correct, but in my case, the code speeds up a little bit instead. However, the real question here is: If I eliminate the vector directives you added in modwtACC2() and getSubvACC(), Why are erroneous results obtained when using a vector_length > 32?

2 - You said:

Where are you putting “worker”? Just at the loop or did you change the routines to be “worker” instead of “vector”? And if you did use worker routines, which loop did you apply to worker and which to vector?

I’m just putting for example “num_gangs(256)”, “num_workers(32)”, and “vector_length(32)” in the main loop (without specifying any of the openACC directives you added in modwtACC2() and getSubvACC() or other directive):

.
.
.
#pragma acc parallel loop private(sub_vecACC[:svSz],paramsOUT[:outLength],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) num_gangs(512), num_workers(32), vector_length(32)
          //2.1 Loop for computing the ACF-rms value in the wavelet domain
          for (int i = 0 ;i <  int(ivSz - wszNS); i = i + int(wszNS)) {
      		
            //Get subvector	(window)	
        		getSubvACC(inter_vecACC, ivSz,i,i + wszNS -1,sub_vecACC, svSz);
  
        		//Apply UDWT to window
            modwtACC2(len_X,level_N,len_filt,lpd,lpd_len,hpd,sub_vecACC,cA,cD,filt,paramsOUT);
                            
        		//Store UDWT coefficients
            #pragma acc loop
            for (int j=0; j < outLength; j++) {	
  	    	    w_vecACC[i*(ln+1) + j] = paramsOUT[j];
   	        }
      				
     	    }
        }
.
.
.

So, Why erroneous results are obtained when I do not vectorize getSubvACC() and modwtACC2()?, or in other words: Why I obtain wrong results when running getSubvACC() and modwtACC2() sequentially?

Why I obtain wrong results when running getSubvACC() and modwtACC2() sequentially?

Looks to be a problem with inlining. Compiling with either “-O2” or “-fast -Mnoinline -Mnoautoinline” works around the issue. Performance is much slower than the vector version, but does get correct answers.

Thanks!! But does it really get correct answers ??? I added -O2 and also try with -O1 and still get erroneous coefficients. My compiling line is:

pgc++ -lsndfile -fast -O2 -ta=tesla  -Minfo=accel -g mexa.cpp -o mexaACC libwavelib.a

I’m using your original code without the num_gangs and vector_length settings. In this case, it gets correct answers with just -O2. Which version of the source are you using?

% pgc++ mexa.1.cpp -o testACC -I /local/home/colgrove/wavelib/libsndfile/include/ -L/local/home/colgrove/wavelib/libsndfile/lib -I../../src -L//local/home/colgrove/wavelib/fftw/lib -w -lsndfile -lfftw3 libwavelib.a -O2 -ta=tesla:cc60 -Minfo=accel -V17.4
mexa.1.cpp:
getSubvACC(double *, int, int, int, double *, int):
     65, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
modwtACC2(int, int, int, double *, int, double *, double *, double *, double *, double *, double *):
     77, Generating acc routine seq
         Generating Tesla code
main:
    277, Generating copyin(hpd[:lpd_len])
         Generating create(filt[:lpd_len*2])
         Generating copyin(inter_vecACC[:ivSz])
         Generating copyout(w_vecACC[:wvSz])
         Generating create(sub_vecACC[:svSz],paramsOUT[:outLength])
         Generating copyin(lpd[:lpd_len])
         Generating create(cA[:len_X],cD[:len_X])
         Accelerator kernel generated
         Generating Tesla code
        280, #pragma acc loop gang /* blockIdx.x */
        290, #pragma acc loop vector(128) /* threadIdx.x */
    290, Loop is parallelizable
hsw8:/local/home/colgrove/wavelib/src% ./testACC /scratch/colgrove/audiocheck.net_hdsweep_1Hz_48000Hz_-3dBFS_30s.wav                                  
<<<Opening file: /scratch/colgrove/audiocheck.net_hdsweep_1Hz_48000Hz_-3dBFS_30s.wav>>>


Executing parallel wavelet...


Process constants -----------------
Signal length: 2880001
wszNS: 288 samples
-----------------------------------


BEFORE ACC REGION ----------
lpd length: 16
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000

AFTER ACC REGION ----------
Parallel execution time: 0.811515 seconds
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
w_vecACC max value is: 0.937597
w_vecACC min value is: -0.937109
w_vecACC mean value is: 0.000041
MAE: -nan
Original SigSum: -nan
Obtained SigSum: -nan

I’m using the original version of the code too, using -O2 and this is my output:

pgc++ -lsndfile -fast -O2 -ta=tesla  -Minfo=accel pgimexa.cpp -o pgimexa libwavelib.a
pgimexa.cpp:
getSubvACC(double *, int, int, int, double *, int):
     64, Generating implicit acc routine seq
         Generating acc routine seq
         Generating Tesla code
modwtACC2(int, int, int, double *, int, double *, double *, double *, double *,  double *, double *):
     76, Generating acc routine seq
         Generating Tesla code
main:
    274, Generating copyin(hpd[:lpd_len])
         Generating create(filt[:lpd_len*2])
         Generating copyin(inter_vecACC[:ivSz])
         Generating copyout(w_vecACC[:wvSz])
         Generating create(sub_vecACC[:svSz],paramsOUT[:outLength])
         Generating copyin(lpd[:lpd_len])
         Generating create(cA[:len_X],cD[:len_X])
         Accelerator kernel generated
         Generating Tesla code
        277, #pragma acc loop gang /* blockIdx.x */
        287, #pragma acc loop vector(128) /* threadIdx.x */
    287, Loop is parallelizable
[jcastro@tule-01 Parallel_Manatee]$ ./pgimexa sweep.wav

<<<Opening file: sweep.wav>>>


Executing parallel wavelet...


Process constants -----------------
Signal length: 2880001
wszNS: 288 samples
-----------------------------------


BEFORE ACC REGION ----------
lpd length: 16
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000

AFTER ACC REGION ----------
Parallel execution time: 2.832630 seconds
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
w_vecACC max value is: 0.915642
w_vecACC min value is: -0.908779
w_vecACC mean value is: 0.000015
MAE: 0.079692
Original SigSum: 0.110644
Obtained SigSum: 0.030952

The error still there, I don’t know if it’s related to the k40c’s architecture I’m using or something…

It looks like I was just getting lucky on the P100. The code failed on a K80 and when I set the environment variable “PGI_ACC_FILL=1” which initialized device data to zero, the code failed consistently on the P100 as well.

I did more digging and determined that the problem is a missing syncthreads call in the generated CUDA kernel. The code correctly has all but thread 0 jump around the function calls, but since there’s not syncthread call, the remain threads in the other 3 warps start executing the vector loop before thread 0 has updated the arrays. I added a problem report and sent it to engineering (TPR#24991).

Now my original suggestion of using vector routines instead of sequential is still your best option since it gives a significant performance boost.

Thanks,
Mat

Ok thank you Mat! I was starting to get crazy with that problem I’ve been facing for months.

Now, I followed your initial advice of vectorizing the functions: getSubvACC() and modwtACC2(), and everything worked but now I’m trying to run the code asynchronously to overlap data transfer and computation time, so I grouped the temporal windows into blocks to process those blocks asynchronously. The problem is that just for splitting the windows into blocks and parallelize it with the same directives I have this runtime error:

call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
call to cuMemFreeHost returned error 700: Illegal address during kernel execution

This is the code with the modifications to process the signal using blocks of 32 windows (I’m still not using any async clause):


/*
 * Test file for modwt
*/

#include <omp.h>
#include <sndfile.h>
#include <iostream>
#include "wavelib/wavelib.h" //wavelib RAFAT - C wavelet library
#include <fstream>

using namespace std;

// return the value of the maximum value of the double * in
double maxvvACC(double* in, int sz) {
    int i;
    int index = 0;
    double val = in[0];
    for (i = 1; i < sz; i++) {
        if (in[i] > val) {
            val = in[i];
            index = i;
        }
    }
    return in[index];
}

// return the value of the manimum value of the double * in
double minvvACC(double* in, int sz) {
    int i;
    int index = 0;
    double val = in[0];
    for (i = 1; i < sz; i++) {
        if (in[i] < val) {
            val = in[i];
            index = i;
        }
    }
    return in[index];
}

// Return the mean of double array "in"
double meanvACC(double* in, int sz) {
    
    double sum = 0;
    for(int i = 0;i < sz; i++) {
	sum = sum + in[i];
    }

    return sum / sz;
}

// Return the mean of int array "in"
double meanvACCint(int* in, int sz) {
    
    double sum = 0;
    for(int i = 0;i < sz; i++) {
	sum = sum + in[i];
    }

    return sum / sz;
}

//Get a sub-vector of array "x" from index "i1" to "i2"
//and store it in array "y"
#pragma acc routine vector
void getSubvACC(double* x, int szX, int i1, int i2, double *y, int szY){

    if ((i2 >= i1) && (i2 < szX) && (i1 >= 0) && ((i2-i1+1) <= szY)) { //Parameters check
        #pragma acc loop vector
        for(int i = i1; i <= i2; i++){
            y[i-i1] = x[i];
        }
    }
}


#pragma acc routine vector//seq
void modwtACC2(int len_X, int level_N, int len_filt, double * lpd, int lpd_len, double * hpd, double *inp, double * cA, double * cD, double * filt,double * paramsOUT) {
	
  int i;
  int J;
  int temp_len;
  int iter;
  int M;
  int lenacc;

	temp_len = len_X;
	J = level_N;
	M = 1;

#pragma acc loop vector 
	for (i = 0; i < temp_len; i++) {
		paramsOUT[i] = inp[i];
	}

  lenacc = (J + 1) * temp_len; //levLens[J + 1];
	for (iter = 0; iter < J; iter++) {
		lenacc = lenacc - temp_len;
		if (iter > 0) {
			M = 2 * M;
		}


    int l;
    int i2;
    int t;
    double s;

    s = sqrt(2.0);
    #pragma acc loop vector
	  for (i2 = 0; i2 < lpd_len; i2++) {
		  filt[i2] = double(lpd[i2] / s);
		  filt[lpd_len + i2] = double(hpd[i2]  / s);
    }

#pragma acc loop vector //check if this is really parallelizable
	  for (i2 = 0; i2 < temp_len; i2++) {
		  t = i2;
		  cA[i2] = filt[0] * paramsOUT[t]; //TODO  RESTORE!
		  cD[i2] = filt[lpd_len] * paramsOUT[t]; // TODO RESTORE!
      for (l = 1; l < lpd_len; l++) {
			  t = t - M;
			  while (t >= temp_len) {
				  t = t - temp_len;
			  }
			  while (t < 0) {
				  t = t + temp_len;
			  }

			  cA[i2] =  cA[i2] + filt[l] * paramsOUT[t];
			  cD[i2] =  cD[i2] + filt[lpd_len + l] * paramsOUT[t];

      }
    }

#pragma acc loop vector
    for (i = 0; i < temp_len; i++) {
		    paramsOUT[i] = cA[i];
		    paramsOUT[lenacc + i] = cD[i];
		}
	}

}


int main (int argc, char *argv[]) {

   //Variables***************************************************
    double * x;                   	//noisy signal
    int xSz;                  //size of variable x
    double fs;                 	//sample frequency (Hz)
    double fc = 2000;          	//cut-off frequency (Hz)
    double wsz = 3;             //window size (milliseconds)
    int ln = 4;               	//number of levels        
    double startP, endP, durationP; //time 
    //***********************************************************
    

    //Open a WAV---------------------------------------------------------------------
    FILE* wavf;
    SF_INFO info;
    char* fileName;
    
    if (argc == 2) {
        fileName = argv[1];
	printf("\n<<<Opening file: %s>>>\n\n",argv[1]);
    }
    else {
        fileName = "aud/9manatees.wav";
        printf("\n<<<Using default file: aud/9manatees.wav>>>\n\n");
    }

    wavf = fopen(fileName, "r");

    //-------------------------------------------------------------------------------
    if(wavf != NULL){   //Succesful wav read

        SNDFILE* source = sf_open_fd(fileno(wavf), SFM_READ, &info, 1);	

        fs = info.samplerate;
        xSz = info.frames;
        x = new double[xSz];
        sf_read_double(source, x, xSz);
        sf_close(source);

	      fclose(wavf);

        //wavelet call----------------------------------------------------------------
        //----------------------------------------------------------------------------
        
        printf("\nExecuting parallel wavelet... \n\n");		
	      //Process constants-------------------
        wave_object family = wave_init("db8");  //wavelet family
	      double wszNS = llround(wsz*0.001*fs);  	//window size in # samples
        int wszINT = wszNS;
        //--------------------------------------
        
       	//Variables-----------------------------------------------------------
     		double * inter_vec = new double[xSz];	//intermediate output vector
        double dt = 1. / fs;		//time step in seconds
      	double lastIdx =0;		//last index of processed portion of the signal
      	wt_object wt = wt_init(family,"modwt", wszNS, ln); //undecimated wavelet transform object
      
      	//DEBUG - Print Process' constants ---------------------------------
      	printf("\nProcess constants ----------------- \n");
        printf("Signal length: %d\n", xSz);
      	printf("wszNS: %g samples\n", wszNS);
      	printf("----------------------------------- \n\n");
      	//--------------------------------------------------------------------	
      	

      	//Ask for all memory first
      
      	//Use double pointers instead of structs of type vec
      	double * restrict sub_vecACC = new double[wszINT]; //a temporal sub-vector of the entire signal      
      	double * restrict inter_vecACC = x; //inter_vec; //a pointer to the original input //TODO RESTORE!!
        double * restrict w_vecACC = new double[xSz*(ln+1)]; //UDWT of the entire signal
        for (int yy = 0; yy < xSz*(ln+1); yy++)
         w_vecACC[yy] = 0;
         	
      	int svSz = wszNS;
      	int wvSz = xSz*(ln+1);
       	int ivSz = xSz;

        //Input parameters of modwtACC2:---------------------------------
        int len_X = wt->siglength;          //Length of input signal X
        int level_N = wt->J;                //Number of levels
        int len_filt = wt->wave->filtlength;//Wave-filter length     
        int * levLens = wt->length;         //Length of each level
        double * lpd = wt->wave->lpd;       //Low Pass Details   
        int lpd_len = wt->wave->lpd_len;    //lpd length     
        double * hpd = wt->wave->hpd;       //High Pass Details   
        //Dynamic memory inputs: allocate and free memory outside ACC-loop 
        double * cA = new double[len_X]; //free(cA);
        double * cD = new double[len_X]; //free(cD);
        double * filt = new double[2 * lpd_len]; //free(filt)
        //Output parameters:
        int outLength = wszINT*(ln+1);
        double * paramsOUT = new double[outLength];        


        //----------------------------------------------------------------
        printf("\nBEFORE ACC REGION ----------\n");
        printf("lpd length: %d\n", lpd_len);
        printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
        printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
        printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
        printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
        
        //Set levLens before cycle-------------------------
        levLens[level_N +1]= (level_N +1)*len_X;
        for (int i = 0; i < level_N+1; i++) {
            levLens[i] = len_X;
        }   
        //--------------------------------------------------       
        printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));        
        //----------------------------------------------------------------------------

        int bSz = 32; //block size in number of windows
        int nBlocks = ceil( floor(ivSz/wszINT) / bSz );//number of blocks 
        printf("Number of blocks: %d\n",nBlocks);
        printf("w_vecACC size: %d\n", xSz*(ln+1));
        printf("w_vecACC update operation's size %d\n", (ivSz - wszINT)*(ln+1));
        startP = omp_get_wtime();

	//TODO: Make an asynchronous execution of the OpenACC region
  //1 - Divide computation in blocks
  //2 - Divide signal into blocks (Use update directive)
  //3 - Use Asynchronous directive
  //4 - Make initial copyof inter_vecACC asynchronous too
          
#pragma acc data copyin(inter_vecACC[:ivSz],hpd[:lpd_len],lpd[:lpd_len]), create(sub_vecACC[:svSz],paramsOUT[:outLength],cA[:len_X],cD[:len_X],filt[:2*lpd_len],w_vecACC[:wvSz])
{
          for (int q = 0; q < nBlocks-1; q++) { //For each block of windows
            int begI = q * bSz * wszINT; //block beggining
            int endI = min(begI + (bSz * wszINT),ivSz - wszINT); //block end

#pragma acc parallel loop private(sub_vecACC[:svSz],paramsOUT[:outLength],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) //async() num_gangs(512),num_workers(1),vector_length(32) 
            for (int i = begI ;i < endI; i = i + wszINT) { //For each window compute the ACF-rms value in the wavelet domain
      		
              //Get subvector	(window)	
        		  getSubvACC(inter_vecACC, ivSz,i,i + wszINT -1,sub_vecACC, svSz);
  
        		  //Apply UDWT to window
              modwtACC2(len_X,level_N,len_filt,lpd,lpd_len,hpd,sub_vecACC,cA,cD,filt,paramsOUT);
                            
        		  //Store UDWT coefficients
              #pragma acc loop
              for (int j=0; j < outLength; j++) {
  	    	      w_vecACC[i*(ln+1) + j] = paramsOUT[j];
   	          }    				
     	      }              
                     
          //Update block of data            
          #pragma acc update self(w_vecACC[begI*(ln+1):endI*(ln+1)]) //async()
          }
}
          //#pragma acc wait   
       	endP = omp_get_wtime();
 	      durationP = endP - startP;
        
        printf("\nAFTER ACC REGION ----------\n");
        printf("Parallel execution time: %f seconds\n", durationP);                            
        printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
        printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
        printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
        printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
        printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));            

        printf("w_vecACC max value is: %f\n", maxvvACC(w_vecACC, wvSz));
        printf("w_vecACC min value is: %f\n", minvvACC(w_vecACC, wvSz));	    
        printf("w_vecACC mean value is: %f\n", meanvACC(w_vecACC, wvSz));
                
        delete [] inter_vec;
        delete [] sub_vecACC;
        delete [] paramsOUT; 
        delete [] cA;
        delete [] cD;
        delete [] filt;
        delete [] w_vecACC;
      
        wt_free(wt);
               
        //---------------------------------------------------------------------------
        //---------------------------------------------------------------------------

        delete[] x;

    }

    return 0;

}

And the compilation line is the same as always:

pgc++ -lsndfile -fast -ta=tesla -Minfo=accel pgimexa.2.cpp -o pgimexa libwavelib2.a

[/code]

Looks like another compiler error when passing in the “I+wszINT-1” value to the subroutine. I was able to work around it by creating a temp variable, computing the value, and then passing in the temp instead. Note that this error does not occur in our development compiler so it looks like we’ve already fix the problem, though I don’t see a specific report about it. One of our compiler engineers could have found it on their own.

I also noticed that your “update” directive is incorrect. In C/C++ the data shape is “arr[start:number of elements]”. Not the range, i.e. “arr[start:end]”. I updated it with what I think is the correct number of elements to copy, but please double check me.

Finally, there’s no need to put the private arrays in the outer data region so I took them out.

Here’s the full source with the changes:

% cat mexa.3.cpp
/*
  * Test file for modwt
 */

 #include <omp.h>
 #include <sndfile.h>
 #include <iostream>
 #include "wavelib.h" //wavelib RAFAT - C wavelet library
 #include <fstream>

 using namespace std;

 // return the value of the maximum value of the double * in
 double maxvvACC(double* in, int sz) {
     int i;
     int index = 0;
     double val = in[0];
     for (i = 1; i < sz; i++) {
         if (in[i] > val) {
             val = in[i];
             index = i;
         }
     }
     return in[index];
 }

 // return the value of the manimum value of the double * in
 double minvvACC(double* in, int sz) {
     int i;
     int index = 0;
     double val = in[0];
     for (i = 1; i < sz; i++) {
         if (in[i] < val) {
             val = in[i];
             index = i;
         }
     }
     return in[index];
 }

 // Return the mean of double array "in"
 double meanvACC(double* in, int sz) {

     double sum = 0;
     for(int i = 0;i < sz; i++) {
    sum = sum + in[i];
     }

     return sum / sz;
 }

 // Return the mean of int array "in"
 double meanvACCint(int* in, int sz) {

     double sum = 0;
     for(int i = 0;i < sz; i++) {
    sum = sum + in[i];
     }

     return sum / sz;
 }

 //Get a sub-vector of array "x" from index "i1" to "i2"
 //and store it in array "y"
 #pragma acc routine vector
 void getSubvACC(double* x, int szX, int i1, int i2, double *y, int szY){

     if ((i2 >= i1) && (i2 < szX) && (i1 >= 0) && ((i2-i1+1) <= szY)) { //Parameters check
         #pragma acc loop vector
         for(int i = i1; i <= i2; i++){
             y[i-i1] = x[i];
         }
     }
 }


 #pragma acc routine vector//seq
 void modwtACC2(int len_X, int level_N, int len_filt, double * lpd, int lpd_len, double * hpd, double *inp, double * cA, double * cD, double * filt,double * paramsOUT) {

   int i;
   int J;
   int temp_len;
   int iter;
   int M;
   int lenacc;

    temp_len = len_X;
    J = level_N;
    M = 1;

 #pragma acc loop vector
    for (i = 0; i < temp_len; i++) {
       paramsOUT[i] = inp[i];
    }

   lenacc = (J + 1) * temp_len; //levLens[J + 1];
    for (iter = 0; iter < J; iter++) {
       lenacc = lenacc - temp_len;
       if (iter > 0) {
          M = 2 * M;
       }


     int l;
     int i2;
     int t;
     double s;

     s = sqrt(2.0);
     #pragma acc loop vector
      for (i2 = 0; i2 < lpd_len; i2++) {
         filt[i2] = double(lpd[i2] / s);
         filt[lpd_len + i2] = double(hpd[i2]  / s);
     }

 #pragma acc loop vector //check if this is really parallelizable
      for (i2 = 0; i2 < temp_len; i2++) {
         t = i2;
         cA[i2] = filt[0] * paramsOUT[t]; //TODO  RESTORE!
         cD[i2] = filt[lpd_len] * paramsOUT[t]; // TODO RESTORE!
       for (l = 1; l < lpd_len; l++) {
            t = t - M;
            while (t >= temp_len) {
               t = t - temp_len;
            }
            while (t < 0) {
               t = t + temp_len;
            }

            cA[i2] =  cA[i2] + filt[l] * paramsOUT[t];
            cD[i2] =  cD[i2] + filt[lpd_len + l] * paramsOUT[t];

       }
     }

 #pragma acc loop vector
     for (i = 0; i < temp_len; i++) {
           paramsOUT[i] = cA[i];
           paramsOUT[lenacc + i] = cD[i];
       }
    }

 }


 int main (int argc, char *argv[]) {

    //Variables***************************************************
     double * x;                      //noisy signal
     int xSz;                  //size of variable x
     double fs;                    //sample frequency (Hz)
     double fc = 2000;             //cut-off frequency (Hz)
     double wsz = 3;             //window size (milliseconds)
     int ln = 4;                  //number of levels
     double startP, endP, durationP; //time
     //***********************************************************


     //Open a WAV---------------------------------------------------------------------
     FILE* wavf;
     SF_INFO info;
     char* fileName;

     if (argc == 2) {
         fileName = argv[1];
    printf("\n<<<Opening file: %s>>>\n\n",argv[1]);
     }
     else {
         fileName = "aud/9manatees.wav";
         printf("\n<<<Using default file: aud/9manatees.wav>>>\n\n");
     }

     wavf = fopen(fileName, "r");

     //-------------------------------------------------------------------------------
     if(wavf != NULL){   //Succesful wav read

         SNDFILE* source = sf_open_fd(fileno(wavf), SFM_READ, &info, 1);

         fs = info.samplerate;
         xSz = info.frames;
         x = new double[xSz];
         sf_read_double(source, x, xSz);
         sf_close(source);

          fclose(wavf);

         //wavelet call----------------------------------------------------------------
         //----------------------------------------------------------------------------

         printf("\nExecuting parallel wavelet... \n\n");
          //Process constants-------------------
         wave_object family = wave_init("db8");  //wavelet family
          double wszNS = llround(wsz*0.001*fs);     //window size in # samples
         int wszINT = wszNS;
         //--------------------------------------

           //Variables-----------------------------------------------------------
           double * inter_vec = new double[xSz];   //intermediate output vector
         double dt = 1. / fs;      //time step in seconds
          double lastIdx =0;      //last index of processed portion of the signal
          wt_object wt = wt_init(family,"modwt", wszNS, ln); //undecimated wavelet transform object

          //DEBUG - Print Process' constants ---------------------------------
         printf("\nProcess constants ----------------- \n");
         printf("Signal length: %d\n", xSz);
          printf("wszNS: %g samples\n", wszNS);
          printf("----------------------------------- \n\n");
          //--------------------------------------------------------------------


          //Ask for all memory first

          //Use double pointers instead of structs of type vec
          double * restrict sub_vecACC = new double[wszINT]; //a temporal sub-vector of the entire signal
          double * restrict inter_vecACC = x; //inter_vec; //a pointer to the original input //TODO RESTORE!!
         double * restrict w_vecACC = new double[xSz*(ln+1)]; //UDWT of the entire signal
         for (int yy = 0; yy < xSz*(ln+1); yy++)
          w_vecACC[yy] = 0;

          int svSz = wszNS;
          int wvSz = xSz*(ln+1);
           int ivSz = xSz;

         //Input parameters of modwtACC2:---------------------------------
        int len_X = wt->siglength;          //Length of input signal X
         int level_N = wt->J;                //Number of levels
         int len_filt = wt->wave->filtlength;//Wave-filter length
         int * levLens = wt->length;         //Length of each level
         double * lpd = wt->wave->lpd;       //Low Pass Details
         int lpd_len = wt->wave->lpd_len;    //lpd length
         double * hpd = wt->wave->hpd;       //High Pass Details
         //Dynamic memory inputs: allocate and free memory outside ACC-loop
         double * cA = new double[len_X]; //free(cA);
         double * cD = new double[len_X]; //free(cD);
         double * filt = new double[2 * lpd_len]; //free(filt)
         //Output parameters:
         int outLength = wszINT*(ln+1);
         double * paramsOUT = new double[outLength];


         //----------------------------------------------------------------
         printf("\nBEFORE ACC REGION ----------\n");
         printf("lpd length: %d\n", lpd_len);
         printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
         printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
         printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
         printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));

         //Set levLens before cycle-------------------------
         levLens[level_N +1]= (level_N +1)*len_X;
         for (int i = 0; i < level_N+1; i++) {
             levLens[i] = len_X;
         }
         //--------------------------------------------------
         printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));
         //----------------------------------------------------------------------------

         int bSz = 32; //block size in number of windows
         int nBlocks = ceil( floor(ivSz/wszINT) / bSz );//number of blocks
         printf("Number of blocks: %d\n",nBlocks);
         printf("w_vecACC size: %d\n", xSz*(ln+1));
         printf("w_vecACC update operation's size %d\n", (ivSz - wszINT)*(ln+1));
         startP = omp_get_wtime();

    //TODO: Make an asynchronous execution of the OpenACC region
   //1 - Divide computation in blocks
   //2 - Divide signal into blocks (Use update directive)
   //3 - Use Asynchronous directive
   //4 - Make initial copyof inter_vecACC asynchronous too
#pragma acc data copyin(inter_vecACC[:ivSz],hpd[:lpd_len],lpd[:lpd_len]), create(w_vecACC[:wvSz])
 {
           for (int q = 0; q < nBlocks-1; q++) { //For each block of windows
             int begI = q * bSz * wszINT; //block beggining
             int endI = min(begI + (bSz * wszINT),ivSz - wszINT); //block end

#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:outLength],cA[:len_X],cD[:len_X],filt[:2*lpd_len])  //async() num_gangs(512),num_workers(1),vector_length(32)
             for (int i = begI ;i < endI; i = i + wszINT) { //For each window compute the ACF-rms value in the wavelet domain

               //Get subvector   (window)
#ifdef ORG
               getSubvACC(inter_vecACC, ivSz,i,i + wszINT -1,sub_vecACC, svSz);
#else
               int val = i + wszINT -1;
               getSubvACC(inter_vecACC, ivSz,i,val,sub_vecACC, svSz);
#endif
                 //Apply UDWT to window
               modwtACC2(len_X,level_N,len_filt,lpd,lpd_len,hpd,sub_vecACC,cA,cD,filt,paramsOUT);

                 //Store UDWT coefficients
               #pragma acc loop
               for (int j=0; j < outLength; j++) {
                   w_vecACC[i*(ln+1) + j] = paramsOUT[j];
                 }
               }

           //Update block of data
           #pragma acc update self(w_vecACC[begI*(ln+1):(endI-begI)*(ln+1)]) //async()
           //#pragma acc update self(w_vecACC[begI*(ln+1):endI*(ln+1)]) //async()
           }
 }
           //#pragma acc wait
           endP = omp_get_wtime();
           durationP = endP - startP;

         printf("\nAFTER ACC REGION ----------\n");
         printf("Parallel execution time: %f seconds\n", durationP);
         printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
         printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
         printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
         printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
         printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));

         printf("w_vecACC max value is: %f\n", maxvvACC(w_vecACC, wvSz));
         printf("w_vecACC min value is: %f\n", minvvACC(w_vecACC, wvSz));
         printf("w_vecACC mean value is: %f\n", meanvACC(w_vecACC, wvSz));

         delete [] inter_vec;
         delete [] sub_vecACC;
         delete [] paramsOUT;
         delete [] cA;
         delete [] cD;
         delete [] filt;
         delete [] w_vecACC;

         wt_free(wt);

         //---------------------------------------------------------------------------
         //---------------------------------------------------------------------------

         delete[] x;

     }

     return 0;

 }

% pgc++ mexa.3.cpp -o testACC -I /local/home/colgrove/wavelib/libsndfile/include/ -L/local/home/colgrove/wavelib/libsndfile/lib -I../../src -L/local/home/colgrove/wavelib/fftw/lib -w -lsndfile -lfftw3 libwavelib.a -Minfo=accel -ta=tesla:cc60 -fast -V17.4
mexa.3.cpp:
getSubvACC(double *, int, int, int, double *, int):
     66, Generating Tesla code
         70, #pragma acc loop vector /* threadIdx.x */
     70, Loop is parallelizable
modwtACC2(int, int, int, double *, int, double *, double *, double *, double *, double *, double *):
     78, Generating Tesla code
         92, #pragma acc loop vector /* threadIdx.x */
         97, #pragma acc loop seq
        111, #pragma acc loop vector /* threadIdx.x */
        117, #pragma acc loop vector /* threadIdx.x */
        121, #pragma acc loop seq
        123, #pragma acc loop seq
        126, #pragma acc loop seq
        137, #pragma acc loop vector /* threadIdx.x */
     92, Loop is parallelizable
     97, Loop carried scalar dependence for M at line 100,122,78
    111, Loop is parallelizable
    117, Loop is parallelizable
    121, Loop carried scalar dependence for t at line 122,78
         Complex loop carried dependence of paramsOUT->,filt->,cA->,cD-> prevents parallelization
         Loop carried reuse of cD->,cA-> prevents parallelization
    124, Accelerator restriction: induction variable live-out from loop: t
    125, Accelerator restriction: induction variable live-out from loop: t
    128, Accelerator restriction: induction variable live-out from loop: t
    137, Loop is parallelizable
main:
    272, Generating copyin(hpd[:lpd_len],inter_vecACC[:ivSz])
         Generating create(w_vecACC[:wvSz])
         Generating copyin(lpd[:lpd_len])
    275, Accelerator kernel generated
         Generating Tesla code
        278, #pragma acc loop gang /* blockIdx.x */
        292, #pragma acc loop vector(32) /* threadIdx.x */
    292, Loop is parallelizable
    300, Generating update self(w_vecACC[(ln+1)*begI:(ln+1)*(endI-begI)])
% testACC /scratch/colgrove/audiocheck.net_hdsweep_1Hz_48000Hz_-3dBFS_30s.wav                                                            
<<<Opening file: /scratch/colgrove/audiocheck.net_hdsweep_1Hz_48000Hz_-3dBFS_30s.wav>>>


Executing parallel wavelet...


Process constants -----------------
Signal length: 2880001
wszNS: 288 samples
-----------------------------------


BEFORE ACC REGION ----------
lpd length: 16
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
Number of blocks: 313
w_vecACC size: 14400005
w_vecACC update operation's size 14398565

AFTER ACC REGION ----------
Parallel execution time: 0.790170 seconds
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
w_vecACC max value is: 0.937597
w_vecACC min value is: -0.937109
w_vecACC mean value is: 0.000042

-Mat

Thanks Mat! That solved the problem.

Hi Mat,

Thank you, now the program is running asynchronously but I’m having problems with the execution time of the program. Sometimes it consistently runs very fast about 0.2 seconds but other days the same executable in the same computer runs consistently slower (about 2 seconds). I don’t understand why I’m experiencing this variation in the execution time. The code I’m using is this:

/*
 * Test file for modwt
*/

#include <omp.h>
#include <sndfile.h>
#include <iostream>
#include "wavelib/wavelib.h" //wavelib RAFAT - C wavelet library
#include <fstream>

using namespace std;

// return the value of the maximum value of the double * in
double maxvvACC(double* in, int sz) {
    int i;
    int index = 0;
    double val = in[0];
    for (i = 1; i < sz; i++) {
        if (in[i] > val) {
            val = in[i];
            index = i;
        }
    }
    return in[index];
}

// return the value of the manimum value of the double * in
double minvvACC(double* in, int sz) {
    int i;
    int index = 0;
    double val = in[0];
    for (i = 1; i < sz; i++) {
        if (in[i] < val) {
            val = in[i];
            index = i;
        }
    }
    return in[index];
}

// Return the mean of double array "in"
double meanvACC(double* in, int sz) {
    
    double sum = 0;
    for(int i = 0;i < sz; i++) {
	sum = sum + in[i];
    }

    return sum / sz;
}

// Return the mean of int array "in"
double meanvACCint(int* in, int sz) {
    
    double sum = 0;
    for(int i = 0;i < sz; i++) {
	sum = sum + in[i];
    }

    return sum / sz;
}

//Get a sub-vector of array "x" from index "i1" to "i2"
//and store it in array "y"
#pragma acc routine vector
void getSubvACC(double* x, int szX, int i1, int i2, double *y, int szY){

    if ((i2 >= i1) && (i2 < szX) && (i1 >= 0) && ((i2-i1+1) <= szY)) { //Parameters check
        #pragma acc loop vector
        for(int i = i1; i <= i2; i++){
            y[i-i1] = x[i];
        }
    }
}

#pragma acc routine vector//seq
void modwtACC2(int len_X, int level_N, int len_filt, double * lpd, int lpd_len, double * hpd, double *inp, double * cA, double * cD, double * filt,double * paramsOUT) {
	
  int i;
  int J;
  int temp_len;
  int iter;
  int M;
  int lenacc;

	temp_len = len_X;
	J = level_N;
	M = 1;

#pragma acc loop vector 
	for (i = 0; i < temp_len; i++) {
		paramsOUT[i] = inp[i];
	}

  lenacc = (J + 1) * temp_len; //levLens[J + 1];
	for (iter = 0; iter < J; iter++) {
		lenacc = lenacc - temp_len;
		if (iter > 0) {
			M = 2 * M;
		}


    int l;
    int i2;
    int t;
    double s;

    s = sqrt(2.0);
    #pragma acc loop vector
	  for (i2 = 0; i2 < lpd_len; i2++) {
		  filt[i2] = double(lpd[i2] / s);
		  filt[lpd_len + i2] = double(hpd[i2]  / s);
    }

#pragma acc loop vector //check if this is really parallelizable
	  for (i2 = 0; i2 < temp_len; i2++) {
		  t = i2;
		  cA[i2] = filt[0] * paramsOUT[t]; //TODO  RESTORE!
		  cD[i2] = filt[lpd_len] * paramsOUT[t]; // TODO RESTORE!
      for (l = 1; l < lpd_len; l++) {
			  t = t - M;
			  while (t >= temp_len) {
				  t = t - temp_len;
			  }
			  while (t < 0) {
				  t = t + temp_len;
			  }

			  cA[i2] =  cA[i2] + filt[l] * paramsOUT[t];
			  cD[i2] =  cD[i2] + filt[lpd_len + l] * paramsOUT[t];

      }
    }

#pragma acc loop vector
    for (i = 0; i < temp_len; i++) {
		    paramsOUT[i] = cA[i];
		    paramsOUT[lenacc + i] = cD[i];
		}
	}

}


int main (int argc, char *argv[]) {
    
    double gtBeg = omp_get_wtime(); //Global time
    
   //Variables***************************************************
    double * x;                   	//noisy signal
    int xSz;                  //size of variable x
    double fs;                 	//sample frequency (Hz)
    double fc = 2000;          	//cut-off frequency (Hz)
    double wsz = 3;             //window size (milliseconds)
    int ln = 4;               	//number of levels        
    double startP, endP, durationP; //time 
    //***********************************************************
    

    //Open a WAV---------------------------------------------------------------------
    FILE* wavf;
    SF_INFO info;
    char* fileName;
    
    if (argc == 2) {
        fileName = argv[1];
	printf("\n<<<Opening file: %s>>>\n\n",argv[1]);
    }
    else {
        fileName = "aud/9manatees.wav";
        printf("\n<<<Using default file: aud/9manatees.wav>>>\n\n");
    }

    wavf = fopen(fileName, "r");

    //-------------------------------------------------------------------------------
    if(wavf != NULL){   //Succesful wav read

        SNDFILE* source = sf_open_fd(fileno(wavf), SFM_READ, &info, 1);	

        fs = info.samplerate;
        xSz = info.frames;
        x = new double[xSz];
        sf_read_double(source, x, xSz);
        sf_close(source);

	      fclose(wavf);

        //wavelet call----------------------------------------------------------------
        //----------------------------------------------------------------------------
        
        printf("\nExecuting parallel wavelet... \n\n");		
	      //Process constants-------------------
        wave_object family = wave_init("db8");  //wavelet family
	      double wszNS = llround(wsz*0.001*fs);  	//window size in # samples
        int wszINT = wszNS;
        //--------------------------------------
        
       	//Variables-----------------------------------------------------------
     		double * inter_vec = new double[xSz];	//intermediate output vector
        double dt = 1. / fs;		//time step in seconds
      	double lastIdx =0;		//last index of processed portion of the signal
      	wt_object wt = wt_init(family,"modwt", wszNS, ln); //undecimated wavelet transform object
      
      	//DEBUG - Print Process' constants ---------------------------------
      	printf("\nProcess constants ----------------- \n");
        printf("Signal length: %d\n", xSz);
      	printf("wszNS: %g samples\n", wszNS);
      	printf("----------------------------------- \n\n");
      	//--------------------------------------------------------------------	
      	

      	//Ask for all memory first
      
      	//Use double pointers instead of structs of type vec
      	double * restrict sub_vecACC = new double[wszINT]; //a temporal sub-vector of the entire signal      
      	double * restrict inter_vecACC = x; //inter_vec; //a pointer to the original input //TODO RESTORE!!
        double * restrict w_vecACC = new double[xSz*(ln+1)]; //UDWT of the entire signal
        for (int yy = 0; yy < xSz*(ln+1); yy++)
         w_vecACC[yy] = 0;
         	
      	int svSz = wszNS;
      	int wvSz = xSz*(ln+1);
       	int ivSz = xSz;

        //Input parameters of modwtACC2:---------------------------------
        int len_X = wt->siglength;          //Length of input signal X
        int level_N = wt->J;                //Number of levels
        int len_filt = wt->wave->filtlength;//Wave-filter length     
        int * levLens = wt->length;         //Length of each level
        double * lpd = wt->wave->lpd;       //Low Pass Details   
        int lpd_len = wt->wave->lpd_len;    //lpd length     
        double * hpd = wt->wave->hpd;       //High Pass Details   
        //Dynamic memory inputs: allocate and free memory outside ACC-loop 
        double * cA = new double[len_X]; //free(cA);
        double * cD = new double[len_X]; //free(cD);
        double * filt = new double[2 * lpd_len]; //free(filt)
        //Output parameters:
        int outLength = wszINT*(ln+1);
        double * paramsOUT = new double[outLength];        


        //----------------------------------------------------------------
        printf("\nBEFORE ACC REGION ----------\n");
        printf("lpd length: %d\n", lpd_len);
        printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
        printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
        printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
        printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
        
        //Set levLens before cycle-------------------------
        levLens[level_N +1]= (level_N +1)*len_X;
        for (int i = 0; i < level_N+1; i++) {
            levLens[i] = len_X;
        }   
        //--------------------------------------------------       
        printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));        
        //----------------------------------------------------------------------------

        int bSz = 128; //block size in number of windows
        int nWs = floor(ivSz/wszINT);  //number of windows
        int nBlocks = ceil( floor(ivSz/wszINT) / bSz );//number of blocks 
        int resWsz = nWs%bSz; //residual of the # of windows divided by the block size
        printf("Number of windows: %d\n",nWs);
        printf("Number of blocks: %d\n",nBlocks);
        printf("Residual of windows: %d\n",resWsz);
        printf("w_vecACC size: %d\n", xSz*(ln+1));
        printf("w_vecACC update operation's size %d\n", (ivSz - wszINT)*(ln+1));
        //DEBUG - READ ORIGINAL WAVELET COEFFICIENTS----------------------------------------
        /*ifstream myFile; //DEBUG CODE
	      myFile.open ("sweep.txt"); //DEBUG CODE
        int lineCnt = 0;
        string line;
        while (getline(myFile,line))
            ++lineCnt;
        myFile.clear();
        myFile.seekg(0,ios::beg);

        double * fileCoef = new double[lineCnt];
        for (int i=0; i <  lineCnt; i++) {
             myFile >>  fileCoef[i]; //DEBUG CODE
        }
	      myFile.close(); //DEBUG CODE
        
        //----------------------------------------------------------------------------------
        // DEBUG - WRITE WAVELET COEFFICIENTS-----------------------------------------------
        	ofstream myfile2;
          myfile2.open ("sweep2.txt"); //DEBUG CODE*/
          startP = omp_get_wtime();
        //----------------------------------------------------------------------------------
	//TODO: Make an asynchronous execution of the OpenACC region
  //1 - Divide computation in blocks
  //2 - Divide signal into blocks (Use update directive)
  //3 - Use Asynchronous directive
  //4 - Make initial copyof inter_vecACC asynchronous too
          
#pragma acc data copyin(inter_vecACC[:ivSz],hpd[:lpd_len],lpd[:lpd_len]), create(w_vecACC[:wvSz])
{
          for (int q = 0; q < nBlocks; q++) { //For each block of windows
            int begI = q * bSz * wszINT; //block beggining
            int endI = min(begI + (bSz * wszINT),((nBlocks-1) * bSz * wszINT) + (resWsz * wszINT)); //block end
            /*printf("wszINT: %d\n", wszINT);
            printf("wszNS: %f\n", wszNS);
            printf("begI: %d\n", begI);
            printf("endI: %d\n", endI);
            printf("outLength: %d\n", outLength);*/
#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:outLength],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) async(q%4) //num_gangs(2056),num_workers(1),vector_length(32) 
            for (int i = begI ;i < endI; i = i + wszINT) { //For each window compute the ACF-rms value in the wavelet domain
      		    
              int val = i + wszINT -1; //added to avoid a compiler's bug
              //Get subvector	(window)	
        		  getSubvACC(inter_vecACC, ivSz,i,val,sub_vecACC, svSz);
  
        		  //Apply UDWT to window
              modwtACC2(len_X,level_N,len_filt,lpd,lpd_len,hpd,sub_vecACC,cA,cD,filt,paramsOUT);
                            
        		  //Store UDWT coefficients
              #pragma acc loop
              for (int j=0; j < outLength; j++) {
  	    	      w_vecACC[i*(ln+1) + j] = paramsOUT[j];
   	          }    				
     	      }              
                     
          //Update block of data
          #pragma acc update self(w_vecACC[begI*(ln+1):(endI-begI)*(ln+1)]) async(q%4)      
          }
}
          #pragma acc wait   
       	endP = omp_get_wtime();
 	      durationP = endP - startP;
        
        printf("\nAFTER ACC REGION ----------\n");
        printf("Parallel execution time of OpenACC region: %f seconds\n", durationP);                            
        printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
        printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
        printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
        printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
        printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));            

        printf("w_vecACC max value is: %f\n", maxvvACC(w_vecACC, wvSz));
        printf("w_vecACC min value is: %f\n", minvvACC(w_vecACC, wvSz));	    
        printf("w_vecACC mean value is: %f\n", meanvACC(w_vecACC, wvSz));
        
        //Compare wavelet coefficients----------------
        /*double mae = 0;
        double sigSum = 0;
        double sigSum2 = 0;
        for (int i = 0; i < wvSz; i++) {
           //myfile2 << w_vecACC[i] << "\n"; 
           mae += fabs(fileCoef[i] - w_vecACC[i]);
           sigSum += fabs(fileCoef[i]);
           sigSum2 += fabs(w_vecACC[i]);
        }
        sigSum = sigSum / lineCnt;
        sigSum2 = sigSum2 / lineCnt;
        mae = mae / lineCnt;
        printf("MAE: %f\n", mae);
        printf("Original SigSum: %f\n", sigSum);
        printf("Obtained SigSum: %f\n", sigSum2);
        //--------------------------------------------
        myfile2.close();
        
        //Free used memory in ACC cycle
        delete [] fileCoef;*/
        delete [] inter_vec;
        delete [] sub_vecACC;
        delete [] paramsOUT; 
        delete [] cA;
        delete [] cD;
        delete [] filt;
        delete [] w_vecACC;
      
        wt_free(wt);
               
        //---------------------------------------------------------------------------
        //---------------------------------------------------------------------------

        delete[] x;

    }
    double gtEnd = omp_get_wtime(); //Global time
    double gtDur = gtEnd - gtBeg;
    printf("Global execution time: %f seconds\n", gtDur);                            
       
    return 0;

}

As I said previously I am running this program on a tesla k40c.

It sounds like it’s the device warm-up time. The OS may power down the device when not in use and it can take ~1-2 seconds per device to power it back up. For most programs, this is just noise, but here it can make a huge difference.

Try running the utility “pgcudainit” in the background. This will hold the device open and not let it power down.

-Mat