Hi,
I modified a function from the wavelib library GitHub - rafat/wavelib: C Implementation of 1D and 2D Wavelet Transforms (DWT,SWT and MODWT) along with 1D Wavelet packet Transform and 1D Continuous Wavelet Transform. to parallelize the 1D Maximal Overlap Discrete Wavelet Transform (modwt) using OpenACC directives (I named it modwtACC2()).
The parallelized method works fine when I compile the code for a multicore (I’m using a Quadcore Intel(R) Xeon(R) CPU E3-1225 v5). Also it works fine if I compile it for a tesla k40c but only if I use a vector_length or num_workers of 32 or less, otherwise it produces wrong results (incorrect wavelet coefficients). However, there is no problem with the num_gangs value, it can be increased without producing wrong values, but as long as the vector_length and num_workers product is 32 or less, otherwise it produces wrong values again. The code region I’m parallelizing is embarrassingly parallel and the error is also consistent between multiple executions with the same parameters.
I’m using:
compiler: pgc++ 17.4-0 64-bit target on x86-64 Linux -tp haswell
CUDA Version 9.0.176
Accelerator: Tesla k40c
For simplicity and to reproduce the minimal error I pasted almost all code in a single source file called mexa.cpp:
/*
* Test file for modwt
*/
#include <omp.h>
#include <sndfile.h>
#include <iostream>
#include "wavelib/wavelib.h" //wavelib RAFAT - C wavelet library
#include <fstream>
using namespace std;
// return the value of the maximum value of the double * in
double maxvvACC(double* in, int sz) {
int i;
int index = 0;
double val = in[0];
for (i = 1; i < sz; i++) {
if (in[i] > val) {
val = in[i];
index = i;
}
}
return in[index];
}
// return the value of the minimum value of the double * in
double minvvACC(double* in, int sz) {
int i;
int index = 0;
double val = in[0];
for (i = 1; i < sz; i++) {
if (in[i] < val) {
val = in[i];
index = i;
}
}
return in[index];
}
// Return the mean of double array "in"
double meanvACC(double* in, int sz) {
double sum = 0;
for(int i = 0;i < sz; i++) {
sum = sum + in[i];
}
return sum / sz;
}
// Return the mean of int array "in"
double meanvACCint(int* in, int sz) {
double sum = 0;
for(int i = 0;i < sz; i++) {
sum = sum + in[i];
}
return sum / sz;
}
//Get a sub-vector of array "x" from index "i1" to "i2"
//and store it in array "y"
void getSubvACC(double* x, int szX, int i1, int i2, double *y, int szY){
if ((i2 >= i1) && (i2 < szX) && (i1 >= 0) && ((i2-i1+1) <= szY)) { //Parameters check
for(int i = i1; i <= i2; i++){
y[i-i1] = x[i];
}
}
}
/*OpenACC version of modwt() from wavelib library */
#pragma acc routine seq
void modwtACC2(int len_X, int level_N, int len_filt, double * lpd, int lpd_len, double * hpd, double *inp, double * cA, double * cD, double * filt,double * paramsOUT) {
int i;
int J;
int temp_len;
int iter;
int M;
int lenacc;
temp_len = len_X;
J = level_N;
M = 1;
for (i = 0; i < temp_len; i++) {
paramsOUT[i] = inp[i];
}
lenacc = (J + 1) * temp_len; //levLens[J + 1];
for (iter = 0; iter < J; iter++) {
lenacc = lenacc - temp_len;
if (iter > 0) {
M = 2 * M;
}
int l;
int i2;
int t;
double s;
s = sqrt(2.0);
for (i2 = 0; i2 < lpd_len; i2++) {
filt[i2] = double(lpd[i2] / s);
filt[lpd_len + i2] = double(hpd[i2] / s);
}
for (i2 = 0; i2 < temp_len; i2++) {
t = i2;
cA[i2] = filt[0] * paramsOUT[t]; //TODO RESTORE!
cD[i2] = filt[lpd_len] * paramsOUT[t]; // TODO RESTORE!
for (l = 1; l < lpd_len; l++) {
t = t - M;
while (t >= temp_len) {
t = t - temp_len;
}
while (t < 0) {
t = t + temp_len;
}
cA[i2] = cA[i2] + filt[l] * paramsOUT[t];
cD[i2] = cD[i2] + filt[lpd_len + l] * paramsOUT[t];
}
}
for (i = 0; i < temp_len; i++) {
paramsOUT[i] = cA[i];
paramsOUT[lenacc + i] = cD[i];
}
}
}
int main (int argc, char *argv[]) {
//Variables***************************************************
double * x; //noisy signal
int xSz; //size of variable x
double fs; //sample frequency (Hz)
double fc = 2000; //cut-off frequency (Hz)
double wsz = 3; //window size (milliseconds)
int ln = 4; //number of levels
double startP, endP, durationP; //time
//***********************************************************
//Open a WAV---------------------------------------------------------------------
FILE* wavf;
SF_INFO info;
char* fileName;
if (argc == 2) {
fileName = argv[1];
printf("\n<<<Opening file: %s>>>\n\n",argv[1]);
}
else {
fileName = "aud/9manatees.wav";
printf("\n<<<Using default file: aud/9manatees.wav>>>\n\n");
}
wavf = fopen(fileName, "r");
//-------------------------------------------------------------------------------
if(wavf != NULL){ //Succesful wav read
SNDFILE* source = sf_open_fd(fileno(wavf), SFM_READ, &info, 1);
fs = info.samplerate;
xSz = info.frames;
x = new double[xSz];
sf_read_double(source, x, xSz);
sf_close(source);
fclose(wavf);
//wavelet call----------------------------------------------------------------
//----------------------------------------------------------------------------
printf("\nExecuting parallel wavelet... \n\n");
//Process constants-------------------
wave_object family = wave_init("db8"); //wavelet family
double wszNS = llround(wsz*0.001*fs); //window size in # samples
int wszINT = wszNS;
//--------------------------------------
//Variables-----------------------------------------------------------
double * inter_vec = new double[xSz]; //intermediate output vector
double dt = 1. / fs; //time step in seconds
double lastIdx =0; //last index of processed portion of the signal
wt_object wt = wt_init(family,"modwt", wszNS, ln); //undecimated wavelet transform object
//DEBUG - Print Process' constants ---------------------------------
printf("\nProcess constants ----------------- \n");
printf("Signal length: %d\n", xSz);
printf("wszNS: %g samples\n", wszNS);
printf("----------------------------------- \n\n");
//--------------------------------------------------------------------
//Ask for all memory first
//Use double pointers instead of structs of type vec
double * restrict sub_vecACC = new double[wszINT]; //a temporal sub-vector of the entire signal
double * restrict inter_vecACC = x; //a pointer to the original input
double * restrict w_vecACC = new double[xSz*(ln+1)]; //UDWT of the entire signal
for (int yy = 0; yy < xSz*(ln+1); yy++)
w_vecACC[yy] = 0;
int svSz = wszNS;
int wvSz = xSz*(ln+1);
int ivSz = xSz;
//Input parameters of modwtACC2:---------------------------------
int len_X = wt->siglength; //Length of input signal X
int level_N = wt->J; //Number of levels
int len_filt = wt->wave->filtlength;//Wave-filter length
int * levLens = wt->length; //Length of each level
double * lpd = wt->wave->lpd; //Low Pass Details
int lpd_len = wt->wave->lpd_len; //lpd length
double * hpd = wt->wave->hpd; //High Pass Details
//Dynamic memory inputs: allocate and free memory outside ACC-loop
double * cA = new double[len_X]; //free(cA);
double * cD = new double[len_X]; //free(cD);
double * filt = new double[2 * lpd_len]; //free(filt)
//Output parameters:
int outLength = wszINT*(ln+1);
double * paramsOUT = new double[outLength];
//----------------------------------------------------------------
printf("\nBEFORE ACC REGION ----------\n");
printf("lpd length: %d\n", lpd_len);
printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
//Set levLens before cycle-------------------------
levLens[level_N +1]= (level_N +1)*len_X;
for (int i = 0; i < level_N+1; i++) {
levLens[i] = len_X;
}
//--------------------------------------------------
printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));
//----------------------------------------------------------------------------
//DEBUG - READ ORIGINAL WAVELET COEFFICIENTS----------------------------------------
ifstream myFile; //DEBUG CODE
myFile.open ("xc.txt"); //DEBUG CODE
int lineCnt = 0;
string line;
while (getline(myFile,line))
++lineCnt;
myFile.clear();
myFile.seekg(0,ios::beg);
//Read expected coefficients
double * fileCoef = new double[lineCnt];
for (int i=0; i < lineCnt; i++) {
myFile >> fileCoef[i]; //DEBUG CODE
}
myFile.close(); //DEBUG CODE
//----------------------------------------------------------------------------------
// DEBUG - WRITE WAVELET COEFFICIENTS-----------------------------------------------
ofstream myfile2;
myfile2.open ("mexa.txt"); //DEBUG CODE
startP = omp_get_wtime();
//----------------------------------------------------------------------------------
//OpenACC region
#pragma acc data copyin(inter_vecACC[:ivSz],hpd[:lpd_len],lpd[:lpd_len]), copyout(w_vecACC[:wvSz]), create(sub_vecACC[:svSz],paramsOUT[:outLength],cA[:len_X],cD[:len_X],filt[:2*lpd_len])
{
#pragma acc parallel loop private(sub_vecACC[:svSz],paramsOUT[:outLength],cA[:len_X],cD[:len_X],filt[:2*lpd_len]), num_gangs(512), num_workers(1), vector_length(32)
//2.1 Loop for computing the ACF-rms value in the wavelet domain
for (int i = 0 ;i < int(ivSz - wszNS); i = i + int(wszNS)) {
//Get subvector (window)
getSubvACC(inter_vecACC, ivSz,i,i + wszNS -1,sub_vecACC, svSz);
//Apply UDWT to window
modwtACC2(len_X,level_N,len_filt,lpd,lpd_len,hpd,sub_vecACC,cA,cD,filt,paramsOUT);
//Store UDWT coefficients
#pragma acc loop
for (int j=0; j < outLength; j++) {
w_vecACC[i*(ln+1) + j] = paramsOUT[j];
}
}
}
endP = omp_get_wtime();
durationP = endP - startP;
printf("\nAFTER ACC REGION ----------\n");
printf("Parallel execution time: %f seconds\n", durationP);
printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));
printf("w_vecACC max value is: %f\n", maxvvACC(w_vecACC, wvSz));
printf("w_vecACC min value is: %f\n", minvvACC(w_vecACC, wvSz));
printf("w_vecACC mean value is: %f\n", meanvACC(w_vecACC, wvSz));
//Compare wavelet coefficients----------------
double mae = 0;
double sigSum = 0;
double sigSum2 = 0;
for (int i = 0; i < lineCnt; i++) {
myfile2 << w_vecACC[i] << "\n";
mae += fabs(fileCoef[i] - w_vecACC[i]);
sigSum += fabs(fileCoef[i]);
sigSum2 += fabs(w_vecACC[i]);
}
sigSum = sigSum / lineCnt;
sigSum2 = sigSum2 / lineCnt;
mae = mae / lineCnt;
printf("MAE: %f\n", mae);
printf("Original SigSum: %f\n", sigSum);
printf("Obtained SigSum: %f\n", sigSum2);
//--------------------------------------------
myfile2.close();
//Free used memory in ACC cycle
delete [] fileCoef;
delete [] sub_vecACC;
delete [] paramsOUT;
delete [] cA;
delete [] cD;
delete [] filt;
delete [] w_vecACC;
wt_free(wt);
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
delete[] x;
}
return 0;
}
In this code I’m also reading the original wavelet coefficients from a file and writing the obtained coefficients in another file. The reference coefficients can be obtained executing the program sequentially or in parallel but following the restrictions in the number of workers and size of the vector I said before. So, I really want to know why this error is happening, because even if I change the pragma to a “kernel loop independent” to let the compiler decides the size, it chooses 128 vector-length which also produces erroneous wavelet coefficients. By the way, I’m compiling the program with this line:
pgc++ -lsndfile -lfftw3 -fast -ta=tesla -Minfo=accel -g mexa.cpp -o testACC libwavelib.a
where libwavelib2.a correspond to the built static library from Rafat’s code.