Thank you Mat, that solved the problem.
Hi Mat,
Now that the wavelet transform is working asynchronously, I added the next step to the parallelized program, which consists of a simple energy detector at each level of the wavelet transform. The sequential version of the program works fine and also the parallelized version works fine as long as I do not use async clause. If I use the async clause with only one queue the program returns consistent wrong results. If I use the async clause with multiple queues the program returns different wrong results each time. This is my code:
/*
* Test file for modwt
*/
#include <omp.h>
#include <sndfile.h>
#include <iostream>
#include "wavelib/wavelib.h" //wavelib RAFAT - C wavelet library
#include <fstream>
using namespace std;
// return the value of the maximum value of the double * in
double maxvvACC(double* in, int sz) {
int i;
int index = 0;
double val = in[0];
for (i = 1; i < sz; i++) {
if (in[i] > val) {
val = in[i];
index = i;
}
}
return in[index];
}
// return the value of the manimum value of the double * in
double minvvACC(double* in, int sz) {
int i;
int index = 0;
double val = in[0];
for (i = 1; i < sz; i++) {
if (in[i] < val) {
val = in[i];
index = i;
}
}
return in[index];
}
// Return the mean of double array "in"
#pragma acc routine vector
double meanvACC(double* in, int sz) {
double sum = 0;
#pragma acc loop reduction(+:sum)
for(int i = 0;i < sz; i++) {
sum = sum + in[i];
}
return sum / sz;
}
// Return the mean of int array "in"
//TODO parallelize
double meanvACCint(int* in, int sz) {
double sum = 0;
for(int i = 0;i < sz; i++) {
sum = sum + in[i];
}
return sum / sz;
}
//Get a sub-vector of array "x" from index "i1" to "i2"
//and store it in array "y"
#pragma acc routine vector
void getSubvACC(double* x, int szX, int i1, int i2, double *y, int szY){
if ((i2 >= i1) && (i2 < szX) && (i1 >= 0) && ((i2-i1+1) <= szY)) { //Parameters check
#pragma acc loop vector
for(int i = i1; i <= i2; i++){
y[i-i1] = x[i];
}
}
}
#pragma acc routine vector
void modwtACC2(int len_X, int level_N, int len_filt, double * lpd, int lpd_len, double * hpd, double *inp, double * cA, double * cD, double * filt,double * paramsOUT) {
int i;
int J;
int temp_len;
int iter;
int M;
int lenacc;
temp_len = len_X;
J = level_N;
M = 1;
#pragma acc loop vector
for (i = 0; i < temp_len; i++) {
paramsOUT[i] = inp[i];
}
lenacc = (J + 1) * temp_len; //levLens[J + 1];
for (iter = 0; iter < J; iter++) {
lenacc = lenacc - temp_len;
if (iter > 0) {
M = 2 * M;
}
//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
int l;
int i2;
int t;
double s;
s = sqrt(2.0);
#pragma acc loop vector
for (i2 = 0; i2 < lpd_len; i2++) {
filt[i2] = double(lpd[i2] / s);
filt[lpd_len + i2] = double(hpd[i2] / s);
}
#pragma acc loop vector //check if this is really parallelizable
for (i2 = 0; i2 < temp_len; i2++) {
t = i2;
cA[i2] = filt[0] * paramsOUT[t]; //TODO RESTORE!
cD[i2] = filt[lpd_len] * paramsOUT[t]; // TODO RESTORE!
for (l = 1; l < lpd_len; l++) {
t = t - M;
while (t >= temp_len) {
t = t - temp_len;
}
while (t < 0) {
t = t + temp_len;
}
cA[i2] = cA[i2] + filt[l] * paramsOUT[t];
cD[i2] = cD[i2] + filt[lpd_len + l] * paramsOUT[t];
}
}
//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
#pragma acc loop vector
for (i = 0; i < temp_len; i++) {
paramsOUT[i] = cA[i];
paramsOUT[lenacc + i] = cD[i];
}
}
}
//Compute the sample autocorrelation of vector "in" from
//first lag "fl" to last lag "ll"
#pragma acc routine vector
void autocorrvACC(double* in, int szIn, int fl, int ll, double * out, int szOut){
if ((ll-fl+1) <= szOut) {
double mean = meanvACC(in,szIn); //Sample mean
double var = 0; //Sample variance
//Compute sample variance
#pragma acc loop reduction(+:var)
for (int i = 0; i < szIn; i++) {
var += (in[i]-mean)*(in[i]-mean);
}
var = var / szIn;
//Compute sample-ACF
#pragma acc loop vector
for(int i = fl;i <= ll; i++) {// Lag loop
out[i-fl] = 0;
for(int j = i;j < szIn; j++) { //Dot product
out[i-fl] += (in[j] - mean) * (in[j-i] - mean);
}
out[i-fl] = out[i-fl] / (var * szIn);
}
}
}
#pragma acc routine vector
void powvACC(double * in, int szIn, int n, double * out, int szOut) {
if (szIn <= szOut) {
#pragma acc loop vector
for(int i = 0;i < szIn; i++) {
out[i] = pow(in[i], n);
}
}
}
int main (int argc, char *argv[]) {
double gtBeg = omp_get_wtime(); //Global time
//Variables***************************************************
double * x; //noisy signal
int xSz; //size of variable x
double fs; //sample frequency (Hz)
double fc = 2000; //cut-off frequency (Hz)
double wsz = 3; //window size (milliseconds)
int ln = 4; //number of levels
double energy; //signal energy computed at each level/window
double startP, endP, durationP; //time
//***********************************************************
//Open a WAV*************************************************
FILE* wavf;
SF_INFO info;
char* fileName;
if (argc == 2) {
fileName = argv[1];
printf("\n<<<Opening file: %s>>>\n\n",argv[1]);
}
else {
fileName = "aud/9manatees.wav";
printf("\n<<<Using default file: aud/9manatees.wav>>>\n\n");
}
wavf = fopen(fileName, "r");
if(wavf != NULL){ //Succesful wav read
SNDFILE* source = sf_open_fd(fileno(wavf), SFM_READ, &info, 1);
fs = info.samplerate;
xSz = info.frames;
x = new double[xSz];
sf_read_double(source, x, xSz);
sf_close(source);
fclose(wavf);
//Denoising method begins*****************************************************
printf("\nExecuting parallel wavelet... \n\n");
//Process constants-------------------
wave_object family = wave_init("db8"); //wavelet family
double ZERO_CONST = pow(10,-14); //represents minimum signal energy
double wszNS = llround(wsz*0.001*fs); //window size in # samples
int wszINT = wszNS;
//--------------------------------------
//Initialize Variables-------------------------------------------------
double alagBegNS = round(0.21 *0.001*fs); //autocorrelation lag beggining
double alagEndNS = round(1.25 *0.001*fs); //autoorrelation lag end
double * inter_vec = new double[xSz]; //intermediate output vector
double dt = 1. / fs; //time step in seconds
double lastIdx =0; //last index of processed portion of the signal
wt_object wt = wt_init(family,"modwt", wszNS, ln); //undecimated wavelet transform object
//DEBUG - Print basic info -----------------------------------
printf("\nProcess constants ----------------- \n");
printf("Signal length: %d\n", xSz);
printf("wszNS: %g samples\n", wszNS);
printf("Alag in number of smaples: [%f,%f]\n",alagBegNS,alagEndNS);
printf("----------------------------------- \n\n");
//--------------------------------------------------------------------
//Allocate all memory before OpenACC region
//Size of vectors in number of samples
int svSz = wszINT; //size of temporal subvector
int wsvSz = wszINT*(ln+1); //size of wavelet coefficients of the temporal subvector
int wvSz = xSz*(ln+1); //size of wavelet coefficients of the entire signal
int avSz = alagEndNS - alagBegNS + 1; //size of autocorrelation vector
int tfbSz = wszINT;
//Size of blocks and windows
int bSz = 128; //block size in number of windows
int nWs = floor(xSz/wszINT); //number of windows
int nBlocks = ceil( floor(xSz/wszINT) / bSz );//number of blocks
int resWsz = nWs%bSz; //residual of the # of windows divided by the block size
//OpenACC variables-------------------------------------------------------------------------------------
double * restrict sub_vecACC = new double[svSz]; //a temporal subvector of the input signal
double * restrict inter_vecACC = x; //inter_vec; //a pointer to the original input //TODO RESTORE!!
double * restrict w_vecACC = new double[wvSz]; //UDWT of the entire signal
double * restrict tfbACC = new double[tfbSz]; //a temporal vector to store one frequency channel of sub-vector sub_vecACC
printf("\nDebug checkpoint #-1\n");
double * restrict ACF_vecACC = new double[avSz] ; //vector that store the autocorrelation function result
printf("\nDebug checkpoint #0\n");
double ** restrict rms_vecACC = new double*[ln+1]; //rms value matrix of ACF at each level of each time window
//------------------------------------------------------------------------------------------------------
printf("\nDebug checkpoint #1\n");
//Initialization of matrix
for(int i = 0; i < ln+1; i++){
rms_vecACC[i] = new double[nWs];
/*for (int j = 0; j < nWs; j++) { Not necessary
rms_vecACC[i][j] = 0;
}*/
}
//Initialization of wavelet coefficient's vector
for (int yy = 0; yy < xSz*(ln+1); yy++){
w_vecACC[yy] = 0;
}
//Input parameters of modwtACC2-------------------------------------
int len_X = wt->siglength; //Length of input signal X
int level_N = wt->J; //Number of levels
int len_filt = wt->wave->filtlength;//Wave-filter length
int * levLens = wt->length; //Length of each level
double * lpd = wt->wave->lpd; //Low Pass Details
int lpd_len = wt->wave->lpd_len; //lpd length
double * hpd = wt->wave->hpd; //High Pass Details
//Dynamic memory inputs: allocate and free memory outside ACC-loop
double * cA = new double[len_X]; //free(cA);
double * cD = new double[len_X]; //free(cD);
double * filt = new double[2 * lpd_len]; //free(filt)
//Output parameters:
double * paramsOUT = new double[wsvSz];
//------------------------------------------------------------------
//----------------------------------------------------------------
printf("\nBEFORE ACC REGION ----------\n");
printf("Number of windows: %d\n",nWs);
printf("Number of blocks: %d\n",nBlocks);
printf("Residual of windows: %d\n",resWsz);
printf("w_vecACC size: %d\n", xSz*(ln+1));
printf("w_vecACC update operation's size %d\n", (xSz - wszINT)*(ln+1));
printf("lpd length: %d\n", lpd_len);
printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
//Set levLens before cycle-------------------------
levLens[level_N +1]= (level_N +1)*len_X;
for (int i = 0; i < level_N+1; i++) {
levLens[i] = len_X;
}
//--------------------------------------------------
printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));
//----------------------------------------------------------------------------
//DEBUG - READ ORIGINAL WAVELET COEFFICIENTS----------------------------------------
ifstream myFile; //DEBUG CODE
myFile.open ("sweepEner.txt"); //DEBUG CODE
int lineCnt = 0;
string line;
while (getline(myFile,line))
++lineCnt;
myFile.clear();
myFile.seekg(0,ios::beg);
double * fileCoef = new double[lineCnt];
for (int i=0; i < lineCnt; i++) {
myFile >> fileCoef[i]; //DEBUG CODE
}
myFile.close(); //DEBUG CODE
//----------------------------------------------------------------------------------
// DEBUG - WRITE WAVELET COEFFICIENTS-----------------------------------------------
ofstream myfile2;
myfile2.open ("sweep2Ener.txt"); //DEBUG CODE*/
startP = omp_get_wtime();
//----------------------------------------------------------------------------------
#pragma acc data copyin(hpd[:lpd_len],lpd[:lpd_len]), copyout(rms_vecACC[:ln+1][:nWs]), create(w_vecACC[:wvSz],inter_vecACC[:xSz]) //**ASYNC COPYIN**
//#pragma acc data copyin(inter_vecACC[:xSz],hpd[:lpd_len],lpd[:lpd_len]), create(w_vecACC[:wvSz])
{
for (int q = 0; q < nBlocks; q++) { //For each block of windows
int begI = q * bSz * wszINT; //block beggining
int endI = min(begI + (bSz * wszINT),((nBlocks-1) * bSz * wszINT) + (resWsz * wszINT)); //block end
/*printf("wszINT: %d\n", wszINT);
printf("wszNS: %f\n", wszNS);
printf("begI: %d\n", begI);
printf("endI: %d\n", endI);
printf("wsvSz: %d\n", wsvSz);*/
#pragma acc update device(inter_vecACC[begI:endI-begI]) async//(q%4) //**ASYNC COPYIN**
#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:wsvSz],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) async//(q%4) //**ASYNC COPYIN**
//#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:wsvSz],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) async(q%4) //num_gangs(2056),num_workers(1),vector_length(32)
for (int i = begI ;i < endI; i = i + wszINT) { //For each window compute the ACF-rms value in the wavelet domain
int val = i + wszINT -1; //added to avoid a pgi compiler's bug
//Get subvector (window)
getSubvACC(inter_vecACC, xSz,i,val,sub_vecACC, svSz);
//Apply UDWT to window
modwtACC2(len_X,level_N,len_filt,lpd,lpd_len,hpd,sub_vecACC,cA,cD,filt,paramsOUT);
//Store UDWT coefficients
#pragma acc loop
for (int j=0; j < wsvSz; j++) {
w_vecACC[i*(ln+1) + j] = paramsOUT[j];
}
//ACF and rms value computation for each level
#pragma acc loop private(tfbACC[:tfbSz],ACF_vecACC[:avSz],energy,i)
for (int j=0;j < ln+1; j++) {
energy = 0;
#pragma acc loop reduction(+:energy)
for (int k=0; k < wszINT; k++) { //Obtain level j's coefs and estimate signal energy
tfbACC[k] = paramsOUT[k + (j*wszINT)];
energy += pow(tfbACC[k],2);
}
energy = sqrt(energy / wszNS); //signal energy in window "i" and level "j"
rms_vecACC[j][i/wszINT] = energy;
/*if (energy < ZERO_CONST) { //if minimun energy is not reached, asign zero score
rms_vecACC[j][i/wszINT] = 0; //zero score to the actual window and level
}
else {
autocorrvACC(tfbACC,tfbSz,alagBegNS,alagEndNS, ACF_vecACC,avSz); //TODO change parameters
powvACC(ACF_vecACC,avSz,2,ACF_vecACC,avSz);
rms_vecACC[j][i/wszINT] = sqrt(meanvACC(ACF_vecACC,avSz)); //store rms value of ACF on [alag[0] alag[1]] range
}*/
}
}
//Update block of data
#pragma acc update self(w_vecACC[begI*(ln+1):(endI-begI)*(ln+1)]) async//(q%4) //,rms_vecACC[:ln+1][begI/wszINT:(endI-begI)/wszINT]
}
}
#pragma acc wait //ASYNC wait
endP = omp_get_wtime();
durationP = endP - startP;
printf("\nAFTER ACC REGION ----------\n");
printf("Parallel execution time of OpenACC region: %f seconds\n", durationP);
printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));
printf("w_vecACC max value is: %f\n", maxvvACC(w_vecACC, wvSz));
printf("w_vecACC min value is: %f\n", minvvACC(w_vecACC, wvSz));
printf("w_vecACC mean value is: %f\n", meanvACC(w_vecACC, wvSz));
printf("line_Cnt is: %d and rms_vecACC size is: %d\n",lineCnt, nWs*(ln+1));
//Compare wavelet coefficients----------------
double mae = 0;
double sigSum = 0;
double sigSum2 = 0;
/*for (int i = 0; i < wvSz; i++) { //Comparison for 1 dimensional vectors
myfile2 << w_vecACC[i] << "\n";
mae += fabs(fileCoef[i] - w_vecACC[i]);
sigSum += fabs(fileCoef[i]);
sigSum2 += fabs(w_vecACC[i]);
}*/
for (int j = 0;j<ln+1; j++) {
for (int i = 0;i<nWs; i++) { //Comparison for 2 dimensional vectors
myfile2 << rms_vecACC[j][i] << "\n";
mae += fabs(fileCoef[j*nWs + i] - rms_vecACC[j][i]);
sigSum += fabs(fileCoef[j*nWs + i]);
sigSum2 += fabs(rms_vecACC[j][i]);
}
}
sigSum = sigSum / lineCnt;
sigSum2 = sigSum2 / lineCnt;
mae = mae / lineCnt;
printf("MAE: %f\n", mae);
printf("Original SigSum: %f\n", sigSum);
printf("Obtained SigSum: %f\n", sigSum2);
//--------------------------------------------
myfile2.close();
//Free used memory in ACC cycle
delete [] fileCoef;
//TODO delete rms structure
for (int i = 0;i < ln+1; i++){
delete [] rms_vecACC[i];
}
delete [] rms_vecACC;
delete [] tfbACC;
delete [] inter_vec;
delete [] sub_vecACC;
delete [] paramsOUT;
delete [] cA;
delete [] cD;
delete [] filt;
delete [] w_vecACC;
wt_free(wt);
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
delete[] x;
}
double gtEnd = omp_get_wtime(); //Global time
double gtDur = gtEnd - gtBeg;
printf("Global execution time: %f seconds\n", gtDur);
return 0;
}
I’m executing the program with the same previously used audio file “audiocheck.net_hdsweep_1Hz_48000Hz_-3dBFS_30s.wav”, so the execution line is something like:
./pgimexa2 sweep.wav
And the compilation line also remains the same:
pgc++ -lsndfile -fast -ta=tesla -Minfo=accel pgimexa.2.cpp -o pgimexa2 libwavelib2.a
To compare if the obtained results are fine just download the text file sweepEner.txt at my GitHub: AMCM/sweepEner.txt at master · j-castro/AMCM · GitHub. So, if everything is fine the printed Mean Absolute Error (MAE) should be zero always. So finally the question is: Am I doing something wrong when executing asynchronously this code or is this some kind of compiler bug or something?
For good or bad, I’m not able to recreate the error. Granted I’m running on a P100, but I doubt it would be different than a K40.
Here’s my output below. I do see a MAE of 0.0 and a diff of sweep2Ener.txt to one that I created with a CPU only run match.
Note that I did download the sweepEner.txt file.
Did I miss something?
% grep async pgimexa.2.cpp #pragma acc update device(inter_vecACC[begI:endI-begI]) async(q%4) //**ASYNC COPYIN**
#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:wsvSz],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) async(q%4) //**ASYNC COPYIN**
//#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:wsvSz],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) asyn (q%4) //num_gangs(2056),num_workers(1),vector_length(32)
#pragma acc update self(w_vecACC[begI*(ln+1):(endI-begI)*(ln+1)]) async(q%4) //,rms_vecACC[:ln+1][begI/wszINT:(endI-begI)/wszINT]
% pgc++ -L../libsndfile/lib/ -I../libsndfile/include -I. -lsndfile -fast -ta=tesla:cc60 pgimexa.2.cpp -o pgimexa2 libwavelib.a -w -V17.4
pgimexa.2.cpp:
% pgc++ -L../libsndfile/lib/ -I../libsndfile/include -I. -lsndfile -fast -ta=tesla:cc60 pgimexa.2.cpp -o pgimexa2 libwavelib.a -w -V17.4
pgimexa.2.cpp:
% pgimexa2 audiocheck.net_hdsweep_1Hz_48000Hz_-3dBFS_30s.wav
<<<Opening file: audiocheck.net_hdsweep_1Hz_48000Hz_-3dBFS_30s.wav>>>
Executing parallel wavelet...
Process constants -----------------
Signal length: 2880001
wszNS: 288 samples
Alag in number of smaples: [20.000000,120.000000]
-----------------------------------
Debug checkpoint #-1
Debug checkpoint #0
Debug checkpoint #1
BEFORE ACC REGION ----------
Number of windows: 10000
Number of blocks: 79
Residual of windows: 16
w_vecACC size: 14400005
w_vecACC update operation's size 14398565
lpd length: 16
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
AFTER ACC REGION ----------
Parallel execution time of OpenACC region: 0.804450 seconds
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
w_vecACC max value is: 0.937597
w_vecACC min value is: -0.937109
w_vecACC mean value is: 0.000041
line_Cnt is: 50000 and rms_vecACC size is: 50000
MAE: 0.000000
Original SigSum: 0.127271
Obtained SigSum: 0.127271
Global execution time: 0.968645 seconds
Accelerator Kernel Timing data
Timing may be affected by asynchronous behavior
set PGI_ACC_SYNCHRONOUS to 1 to disable async() clauses
/local/home/colgrove/wavelib/src/pgimexa.2.cpp
main NVIDIA devicenum=0
time(us): 44,176
367: data region reached 2 times
34: kernel launched 5 times
grid: [1] block: [128]
device time(us): total=11 max=3 min=2 avg=2
elapsed time(us): total=690 max=581 min=22 avg=138
367: data copyin transfers: 2
device time(us): total=31 max=24 min=7 avg=15
421: data copyout transfers: 5
device time(us): total=63 max=15 min=12 avg=12
370: update directive reached 79 times
370: data copyin transfers: 79
device time(us): total=2,522 max=42 min=13 avg=31
370: compute region reached 79 times
370: kernel launched 79 times
grid: [16-128] block: [32]
device time(us): total=32,036 max=412 min=392 avg=405
elapsed time(us): total=34,059 max=459 min=421 avg=431
420: update directive reached 79 times
420: data copyout transfers: 79
device time(us): total=9,513 max=134 min=22 avg=120
-Mat
It looks it is ok, what is your compilation’s output? my compilation’s output is:
pgc++ -lsndfile -fast -ta=tesla -Minfo=accel pgimexa.2.cpp -o pgimexa2 libwavelib2.a
pgimexa.2.cpp:
"wavelib/wavefilt.h", line 31: warning: last line of file ends without a
newline
#endif /* WAVEFILT_H_ */
^
"wavelib/cwtmath.h", line 13: warning: allowing all exceptions is incompatible
with previous function "gamma" (declared at line 265 of
"/usr/include/bits/mathcalls.h")
double gamma(double x);
^
"pgimexa.2.cpp", line 217: warning: conversion from a string literal to "char
*" is deprecated
fileName = "aud/9manatees.wav";
^
"pgimexa.2.cpp", line 238: warning: conversion from a string literal to "char
*" is deprecated
wave_object family = wave_init("db8"); //wavelet family
^
"pgimexa.2.cpp", line 250: warning: conversion from a string literal to "char
*" is deprecated
wt_object wt = wt_init(family,"modwt", wszNS, ln); //undecimated wavelet transform object
^
"pgimexa.2.cpp", line 248: warning: variable "dt" was declared but never
referenced
double dt = 1. / fs; //time step in seconds
^
"pgimexa.2.cpp", line 249: warning: variable "lastIdx" was declared but never
referenced
double lastIdx =0; //last index of processed portion of the signal
^
"pgimexa.2.cpp", line 200: warning: variable "fc" was declared but never
referenced
double fc = 2000; //cut-off frequency (Hz)
^
meanvACC(double *, int):
43, Generating Tesla code
47, #pragma acc loop vector /* threadIdx.x */
47, Loop is parallelizable
getSubvACC(double *, int, int, int, double *, int):
69, Generating Tesla code
73, #pragma acc loop vector /* threadIdx.x */
73, Loop is parallelizable
modwtACC2(int, int, int, double *, int, double *, double *, double *, double *, double *, double *):
80, Generating Tesla code
94, #pragma acc loop vector /* threadIdx.x */
99, #pragma acc loop seq
114, #pragma acc loop vector /* threadIdx.x */
120, #pragma acc loop vector /* threadIdx.x */
124, #pragma acc loop seq
126, #pragma acc loop seq
129, #pragma acc loop seq
141, #pragma acc loop vector /* threadIdx.x */
94, Loop is parallelizable
99, Loop carried scalar dependence for M at line 102,125,80
114, Loop is parallelizable
120, Loop is parallelizable
124, Loop carried scalar dependence for t at line 125,80
Complex loop carried dependence of paramsOUT->,filt->,cA->,cD-> prevents parallelization
Loop carried reuse of cD->,cA-> prevents parallelization
127, Accelerator restriction: induction variable live-out from loop: t
128, Accelerator restriction: induction variable live-out from loop: t
131, Accelerator restriction: induction variable live-out from loop: t
141, Loop is parallelizable
autocorrvACC(double *, int, int, int, double *, int):
152, Generating Tesla code
161, #pragma acc loop vector /* threadIdx.x */
168, #pragma acc loop vector /* threadIdx.x */
171, #pragma acc loop seq
161, Loop is parallelizable
168, Loop is parallelizable
171, Complex loop carried dependence of in->,out-> prevents parallelization
Loop carried reuse of out-> prevents parallelization
Loop carried dependence of out-> prevents parallelization
Loop carried backward dependence of out-> prevents vectorization
powvACC(double *, int, int, double *, int):
180, Generating Tesla code
184, #pragma acc loop vector /* threadIdx.x */
184, Loop is parallelizable
main:
366, Generating create(inter_vecACC[:xSz])
Generating copyin(lpd[:lpd_len])
Generating create(w_vecACC[:wvSz])
Generating copyout(rms_vecACC[:ln+1][:nWs])
Generating copyin(hpd[:lpd_len])
369, Generating update device(inter_vecACC[begI:endI-begI])
Accelerator kernel generated
Generating Tesla code
379, #pragma acc loop gang /* blockIdx.x */
390, #pragma acc loop vector(32) /* threadIdx.x */
396, #pragma acc loop seq
401, #pragma acc loop vector(32) /* threadIdx.x */
Generating reduction(+:energy)
390, Loop is parallelizable
396, Accelerator restriction: size of the GPU copy of paramsOUT is unknown
Loop is parallelizable
401, Loop is parallelizable
419, Generating update self(w_vecACC[(ln+1)*begI:(ln+1)*(endI-begI)])
Looks exactly the same.
Can you test it with a k40c? Because I’m running the same code and my output is:
./pgimexa2 sweep.wav
<<<Opening file: sweep.wav>>>
Executing parallel wavelet...
Process constants -----------------
Signal length: 2880001
wszNS: 288 samples
Alag in number of smaples: [20.000000,120.000000]
-----------------------------------
Debug checkpoint #-1
Debug checkpoint #0
Debug checkpoint #1
BEFORE ACC REGION ----------
Number of windows: 10000
Number of blocks: 79
Residual of windows: 16
w_vecACC size: 14400005
w_vecACC update operation's size 14398565
lpd length: 16
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
AFTER ACC REGION ----------
Parallel execution time of OpenACC region: 2.070275 seconds
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
w_vecACC max value is: 0.937597
w_vecACC min value is: -0.937109
w_vecACC mean value is: 0.000041
line_Cnt is: 50000 and rms_vecACC size is: 50000
MAE: 0.000138
Original SigSum: 0.127271
Obtained SigSum: 0.127134
Global execution time: 2.325482 seconds
Also, If I comment out the “async” clause I get correct answers:
./pgimexa2 sweep.wav
<<<Opening file: sweep.wav>>>
Executing parallel wavelet...
Process constants -----------------
Signal length: 2880001
wszNS: 288 samples
Alag in number of smaples: [20.000000,120.000000]
-----------------------------------
Debug checkpoint #-1
Debug checkpoint #0
Debug checkpoint #1
BEFORE ACC REGION ----------
Number of windows: 10000
Number of blocks: 79
Residual of windows: 16
w_vecACC size: 14400005
w_vecACC update operation's size 14398565
lpd length: 16
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
AFTER ACC REGION ----------
Parallel execution time of OpenACC region: 2.053668 seconds
LPD and HPD mean values: (0.088388,0.000000)
LPD and HPD max: (0.675631,0.585355)
inter_vecACC max value is: 0.705475
inter_vecACC mean value is: 0.000207
levLens mean: 480.000000
w_vecACC max value is: 0.937597
w_vecACC min value is: -0.937109
w_vecACC mean value is: 0.000041
line_Cnt is: 50000 and rms_vecACC size is: 50000
MAE: 0.000000
Original SigSum: 0.127271
Obtained SigSum: 0.127271
Global execution time: 2.303171 seconds
So, the question is: Is there any problem in the way I’m using the async clause? Otherwise, it seems like the problem is related to the use of k40c GPU.
Did you try running it on a K80 and with the PGI community compiler???
Yes, and did get incorrect answers but I’m still try to determine what’s wrong, at least for some runs. Running the code repeatedly seems to sometimes get correct answers.
Note that I did find if I switched to using CUDA Unified Memory (-ta=tesla:managed), I get correct answers.
Also, if I set the environment varialbe “PGI_ACC_FILL=1” which initialize the GPU memory to zero, I’ll get consistent albeit wrong results (MAE = 0.001981).
Everything seems to point to some type of data race but still not sure where or why.
I’ll continue to review as a background task but I have a major project that I’m working on so it may be a day or so before I can get back to it.
-Mat
Thank you Mat, finally I solved it! I was reading the book: “Parallel Programming with OpenACC” and find out that the wait clause had to be placed inside the data region, so at the end of the OpenACC region I changed this:
.
.
.
//Update block of data
#pragma acc update self(w_vecACC[begI*(ln+1):(endI-begI)*(ln+1)]) async//(q%4) //,rms_vecACC[:ln+1][begI/wszINT:(endI-begI)/wszINT]
}
}
#pragma acc wait //ASYNC wait
.
.
.
for this:
.
.
.
//Update block of data
#pragma acc update self(w_vecACC[begI*(ln+1):(endI-begI)*(ln+1)]) async//(q%4) //,rms_vecACC[:ln+1][begI/wszINT:(endI-begI)/wszINT]
}
#pragma acc wait //ASYNC wait
}
.
.
.
and everything worked perfectly and asynchronously!
Ugh, I don’t know why I missed that! Glad you were able to solve it.
-Mat
Hi Mat,
Again I have a strange bug that arises when using openACC. I just added 8 new lines of code in the OpenACC region, where I calculate the autocorrelation function and the RMS value if an energy threshold condition is satisfied:
if (energy < ZERO_CONST) { //if minimun energy is not reached, asign zero score
rms_vecACC[j][i/wszINT] = 0; //zero score to the actual window and level
}
else {
autocorrvACC(tfbACC,tfbSz,alagBegNS,alagEndNS,ACF_vecACC,avSz); //TODO change parameters
powvACC(ACF_vecACC,avSz,2,ACF_vecACC,avSz);
rms_vecACC[j][i/wszINT] = sqrt(meanvACC(ACF_vecACC,avSz)); //store rms value of ACF on [alag[0] alag[1]] range
}
This code works if I compile it for multicore or sequential-GPU (using num_gangs(1), vector_length(1)), otherwise, it produces consistent wrong results. To quickly test the correctness of the code I use the files “9manatees.wav” and “manateeRMS.txt” that you can download at my GitHub GitHub - j-castro/AMCM: Automatic manatee count method. So again, the Mean Absolute Error (MAE) should be equal to zero.
This is the complete code:
/*
* Test file for modwt
*/
#include <omp.h>
#include <sndfile.h>
#include <iostream>
#include "wavelib/wavelib.h" //wavelib RAFAT - C wavelet library
#include <fstream>
using namespace std;
// return the value of the maximum value of the double * in
double maxvvACC(double* in, int sz) {
int i;
int index = 0;
double val = in[0];
for (i = 1; i < sz; i++) {
if (in[i] > val) {
val = in[i];
index = i;
}
}
return in[index];
}
// return the value of the manimum value of the double * in
double minvvACC(double* in, int sz) {
int i;
int index = 0;
double val = in[0];
for (i = 1; i < sz; i++) {
if (in[i] < val) {
val = in[i];
index = i;
}
}
return in[index];
}
// Return the mean of double array "in"
#pragma acc routine vector
double meanvACC(double* in, int sz) {
double sum = 0;
#pragma acc loop reduction(+:sum)
for(int i = 0;i < sz; i++) {
sum = sum + in[i];
}
return sum / sz;
}
// Return the mean of int array "in"
//TODO parallelize
double meanvACCint(int* in, int sz) {
double sum = 0;
for(int i = 0;i < sz; i++) {
sum = sum + in[i];
}
return sum / sz;
}
//Get a sub-vector of array "x" from index "i1" to "i2"
//and store it in array "y"
#pragma acc routine vector
void getSubvACC(double* x, int szX, int i1, int i2, double *y, int szY){
if ((i2 >= i1) && (i2 < szX) && (i1 >= 0) && ((i2-i1+1) <= szY)) { //Parameters check
#pragma acc loop vector
for(int i = i1; i <= i2; i++){
y[i-i1] = x[i];
}
}
}
#pragma acc routine vector
void modwtACC2(int len_X, int level_N, int len_filt, double * lpd, int lpd_len, double * hpd, double *inp, double * cA, double * cD, double * filt,double * paramsOUT) {
int i;
int J;
int temp_len;
int iter;
int M;
int lenacc;
temp_len = len_X;
J = level_N;
M = 1;
#pragma acc loop vector
for (i = 0; i < temp_len; i++) {
paramsOUT[i] = inp[i];
}
lenacc = (J + 1) * temp_len; //levLens[J + 1];
for (iter = 0; iter < J; iter++) {
lenacc = lenacc - temp_len;
if (iter > 0) {
M = 2 * M;
}
//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
int l;
int i2;
int t;
double s;
s = sqrt(2.0);
#pragma acc loop vector
for (i2 = 0; i2 < lpd_len; i2++) {
filt[i2] = double(lpd[i2] / s);
filt[lpd_len + i2] = double(hpd[i2] / s);
}
#pragma acc loop vector //check if this is really parallelizable
for (i2 = 0; i2 < temp_len; i2++) {
t = i2;
cA[i2] = filt[0] * paramsOUT[t]; //TODO RESTORE!
cD[i2] = filt[lpd_len] * paramsOUT[t]; // TODO RESTORE!
for (l = 1; l < lpd_len; l++) {
t = t - M;
while (t >= temp_len) {
t = t - temp_len;
}
while (t < 0) {
t = t + temp_len;
}
cA[i2] = cA[i2] + filt[l] * paramsOUT[t];
cD[i2] = cD[i2] + filt[lpd_len + l] * paramsOUT[t];
}
}
//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
//PASTED CODE - PASTED CODE - PASTED CODE - PASTED CODE--------------
#pragma acc loop vector
for (i = 0; i < temp_len; i++) {
paramsOUT[i] = cA[i];
paramsOUT[lenacc + i] = cD[i];
}
}
}
//Compute the sample autocorrelation of vector "in" from
//first lag "fl" to last lag "ll"
#pragma acc routine vector
void autocorrvACC(double* in, int szIn, int fl, int ll, double * out, int szOut){
if ((ll-fl+1) <= szOut) {
double mean = meanvACC(in,szIn); //Sample mean
double var = 0; //Sample variance
//Compute sample variance
#pragma acc loop reduction(+:var)
for (int i = 0; i < szIn; i++) {
var += (in[i]-mean)*(in[i]-mean);
}
var = var / szIn;
//Compute sample-ACF
#pragma acc loop vector
for(int i = fl;i <= ll; i++) {// Lag loop
out[i-fl] = 0;
for(int j = i;j < szIn; j++) { //Dot product
out[i-fl] += (in[j] - mean) * (in[j-i] - mean);
}
out[i-fl] = out[i-fl] / (var * szIn);
}
}
}
#pragma acc routine vector
void powvACC(double * in, int szIn, int n, double * out, int szOut) {
if (szIn <= szOut) {
#pragma acc loop vector
for(int i = 0;i < szIn; i++) {
out[i] = pow(in[i], n);
}
}
}
int main (int argc, char *argv[]) {
double gtBeg = omp_get_wtime(); //Global time
//Variables***************************************************
double * x; //noisy signal
int xSz; //size of variable x
double fs; //sample frequency (Hz)
double fc = 2000; //cut-off frequency (Hz)
double wsz = 3; //window size (milliseconds)
int ln = 4; //number of levels
double energy; //signal energy computed at each level/window
double startP, endP, durationP; //time
//***********************************************************
//Open a WAV*************************************************
FILE* wavf;
SF_INFO info;
char* fileName;
if (argc == 2) {
fileName = argv[1];
printf("\n<<<Opening file: %s>>>\n\n",argv[1]);
}
else {
fileName = "aud/9manatees.wav";
printf("\n<<<Using default file: aud/9manatees.wav>>>\n\n");
}
wavf = fopen(fileName, "r");
if(wavf != NULL){ //Succesful wav read
SNDFILE* source = sf_open_fd(fileno(wavf), SFM_READ, &info, 1);
fs = info.samplerate;
xSz = info.frames;
x = new double[xSz];
sf_read_double(source, x, xSz);
sf_close(source);
fclose(wavf);
//Denoising method begins*****************************************************
printf("\nExecuting parallel wavelet... \n\n");
//Process constants-------------------
wave_object family = wave_init("db8"); //wavelet family
double ZERO_CONST = pow(10,-14); //represents minimum signal energy
double wszNS = llround(wsz*0.001*fs); //window size in # samples
int wszINT = wszNS;
//--------------------------------------
//Initialize Variables-------------------------------------------------
double alagBegNS = round(0.21 *0.001*fs); //autocorrelation lag beggining
double alagEndNS = round(1.25 *0.001*fs); //autoorrelation lag end
double * inter_vec = new double[xSz]; //intermediate output vector
double dt = 1. / fs; //time step in seconds
double lastIdx =0; //last index of processed portion of the signal
wt_object wt = wt_init(family,"modwt", wszNS, ln); //undecimated wavelet transform object
//DEBUG - Print basic info -----------------------------------
printf("\nProcess constants ----------------- \n");
printf("Signal length: %d\n", xSz);
printf("wszNS: %g samples\n", wszNS);
printf("Alag in number of smaples: [%f,%f]\n",alagBegNS,alagEndNS);
printf("----------------------------------- \n\n");
//--------------------------------------------------------------------
//Allocate all memory before OpenACC region
//Size of vectors in number of samples
int svSz = wszINT; //size of temporal subvector
int wsvSz = wszINT*(ln+1); //size of wavelet coefficients of the temporal subvector
int wvSz = xSz*(ln+1); //size of wavelet coefficients of the entire signal
int avSz = alagEndNS - alagBegNS + 1; //size of autocorrelation vector
int tfbSz = wszINT;
//Size of blocks and windows
int bSz = 128; //block size in number of windows
int nWs = floor(xSz/wszINT); //number of windows
int nBlocks = ceil( floor(xSz/wszINT) / bSz );//number of blocks
int resWsz = nWs%bSz; //residual of the # of windows divided by the block size
//OpenACC variables-------------------------------------------------------------------------------------
double * restrict sub_vecACC = new double[svSz]; //a temporal subvector of the input signal
double * restrict inter_vecACC = x; //inter_vec; //a pointer to the original input //TODO RESTORE!!
double * restrict w_vecACC = new double[wvSz]; //UDWT of the entire signal
double * restrict tfbACC = new double[tfbSz]; //a temporal vector to store one frequency channel of sub-vector sub_vecACC
printf("\nDebug checkpoint #-1\n");
double * restrict ACF_vecACC = new double[avSz] ; //vector that store the autocorrelation function result
printf("\nDebug checkpoint #0\n");
double ** restrict rms_vecACC = new double*[ln+1]; //rms value matrix of ACF at each level of each time window
//------------------------------------------------------------------------------------------------------
printf("\nDebug checkpoint #1\n");
//Initialization of matrix
for(int i = 0; i < ln+1; i++){
rms_vecACC[i] = new double[nWs];
/*for (int j = 0; j < nWs; j++) { Not necessary
rms_vecACC[i][j] = 0;
}*/
}
//Initialization of wavelet coefficient's vector
for (int yy = 0; yy < xSz*(ln+1); yy++){
w_vecACC[yy] = 0;
}
//Input parameters of modwtACC2-------------------------------------
int len_X = wt->siglength; //Length of input signal X
int level_N = wt->J; //Number of levels
int len_filt = wt->wave->filtlength;//Wave-filter length
int * levLens = wt->length; //Length of each level
double * lpd = wt->wave->lpd; //Low Pass Details
int lpd_len = wt->wave->lpd_len; //lpd length
double * hpd = wt->wave->hpd; //High Pass Details
//Dynamic memory inputs: allocate and free memory outside ACC-loop
double * cA = new double[len_X]; //free(cA);
double * cD = new double[len_X]; //free(cD);
double * filt = new double[2 * lpd_len]; //free(filt)
//Output parameters:
double * paramsOUT = new double[wsvSz];
//------------------------------------------------------------------
//----------------------------------------------------------------
printf("\nBEFORE ACC REGION ----------\n");
printf("Number of windows: %d\n",nWs);
printf("Number of blocks: %d\n",nBlocks);
printf("Residual of windows: %d\n",resWsz);
printf("w_vecACC size: %d\n", xSz*(ln+1));
printf("w_vecACC update operation's size %d\n", (xSz - wszINT)*(ln+1));
printf("lpd length: %d\n", lpd_len);
printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
//Set levLens before cycle-------------------------
levLens[level_N +1]= (level_N +1)*len_X;
for (int i = 0; i < level_N+1; i++) {
levLens[i] = len_X;
}
//--------------------------------------------------
printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));
//----------------------------------------------------------------------------
//DEBUG - READ ORIGINAL WAVELET COEFFICIENTS----------------------------------------
ifstream myFile; //DEBUG CODE
myFile.open ("manateeRMS.txt"); //DEBUG CODE
int lineCnt = 0;
string line;
while (getline(myFile,line))
++lineCnt;
myFile.clear();
myFile.seekg(0,ios::beg);
double * fileCoef = new double[lineCnt];
for (int i=0; i < lineCnt; i++) {
myFile >> fileCoef[i]; //DEBUG CODE
}
myFile.close(); //DEBUG CODE
//----------------------------------------------------------------------------------
// DEBUG - WRITE WAVELET COEFFICIENTS-----------------------------------------------
ofstream myfile2;
myfile2.open ("manatee2RMS.txt"); //DEBUG CODE*/
startP = omp_get_wtime();
//----------------------------------------------------------------------------------
#pragma acc data copyin(hpd[:lpd_len],lpd[:lpd_len]), copyout(rms_vecACC[:ln+1][:nWs]), create(w_vecACC[:wvSz],inter_vecACC[:xSz]) //**ASYNC COPYIN**
//#pragma acc data copyin(inter_vecACC[:xSz],hpd[:lpd_len],lpd[:lpd_len]), create(w_vecACC[:wvSz])
{
for (int q = 0; q < nBlocks; q++) { //For each block of windows
int begI = q * bSz * wszINT; //block beggining
int endI = min(begI + (bSz * wszINT),((nBlocks-1) * bSz * wszINT) + (resWsz * wszINT)); //block end
/*printf("wszINT: %d\n", wszINT);
printf("wszNS: %f\n", wszNS);
printf("begI: %d\n", begI);
printf("endI: %d\n", endI);
printf("wsvSz: %d\n", wsvSz);*/
#pragma acc update device(inter_vecACC[begI:endI-begI]) async (q%4) //**ASYNC COPYIN**
#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:wsvSz],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) async (q%4) //**ASYNC COPYIN** //num_gangs(1), vector_length(1)
//#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:wsvSz],cA[:len_X],cD[:len_X],filt[:2*lpd_len]) async(q%4) //num_gangs(2056),num_workers(1),vector_length(32)
for (int i = begI ;i < endI; i = i + wszINT) { //For each window compute the ACF-rms value in the wavelet domain
int val = i + wszINT -1; //added to avoid a pgi compiler's bug
//Get subvector (window)
getSubvACC(inter_vecACC, xSz,i,val,sub_vecACC, svSz);
//Apply UDWT to window
modwtACC2(len_X,level_N,len_filt,lpd,lpd_len,hpd,sub_vecACC,cA,cD,filt,paramsOUT);
//Store UDWT coefficients
#pragma acc loop
for (int j=0; j < wsvSz; j++) {
w_vecACC[i*(ln+1) + j] = paramsOUT[j];
}
//ACF and rms value computation for each level
#pragma acc loop private(tfbACC[:tfbSz],ACF_vecACC[:avSz],energy,i)
for (int j=0;j < ln+1; j++) {
energy = 0;
#pragma acc loop reduction(+:energy)
for (int k=0; k < wszINT; k++) { //Obtain level j's coefs and estimate signal energy
tfbACC[k] = paramsOUT[k + (j*wszINT)];
energy += pow(tfbACC[k],2);
}
energy = sqrt(energy / wszNS); //signal energy in window "i" and level "j"
if (energy < ZERO_CONST) { //if minimun energy is not reached, asign zero score
rms_vecACC[j][i/wszINT] = 0; //zero score to the actual window and level
}
else {
autocorrvACC(tfbACC,tfbSz,alagBegNS,alagEndNS, ACF_vecACC,avSz); //TODO change parameters
powvACC(ACF_vecACC,avSz,2,ACF_vecACC,avSz);
rms_vecACC[j][i/wszINT] = sqrt(meanvACC(ACF_vecACC,avSz)); //store rms value of ACF on [alag[0] alag[1]] range
}
}
}
//Update block of data
#pragma acc update self(w_vecACC[begI*(ln+1):(endI-begI)*(ln+1)]) async (q%4) //,rms_vecACC[:ln+1][begI/wszINT:(endI-begI)/wszINT]
}
#pragma acc wait //ASYNC wait
}
endP = omp_get_wtime();
durationP = endP - startP;
printf("\nAFTER ACC REGION ----------\n");
printf("Parallel execution time of OpenACC region: %f seconds\n", durationP);
printf("LPD and HPD mean values: (%f,%f)\n", meanvACC(lpd,lpd_len),meanvACC(hpd,lpd_len));
printf("LPD and HPD max: (%f,%f)\n",maxvvACC(lpd,lpd_len),maxvvACC(hpd,lpd_len));
printf("inter_vecACC max value is: %f\n", maxvvACC(inter_vecACC,xSz));
printf("inter_vecACC mean value is: %f\n", meanvACC(inter_vecACC,xSz));
printf("levLens mean: %f\n", meanvACCint(levLens,level_N + 2));
printf("w_vecACC max value is: %f\n", maxvvACC(w_vecACC, wvSz));
printf("w_vecACC min value is: %f\n", minvvACC(w_vecACC, wvSz));
printf("w_vecACC mean value is: %f\n", meanvACC(w_vecACC, wvSz));
//printf("line_Cnt is: %d and rms_vecACC size is: %d\n",lineCnt, nWs*(ln+1));
//Compare wavelet coefficients----------------
double mae = 0;
double sigSum = 0;
double sigSum2 = 0;
/*for (int i = 0; i < wvSz; i++) { //Comparison for 1 dimensional vectors
myfile2 << w_vecACC[i] << "\n";
mae += fabs(fileCoef[i] - w_vecACC[i]);
sigSum += fabs(fileCoef[i]);
sigSum2 += fabs(w_vecACC[i]);
}*/
for (int j = 0;j<ln+1; j++) {
for (int i = 0;i<nWs; i++) { //Comparison for 2 dimensional vectors
myfile2 << rms_vecACC[j][i] << "\n";
mae += fabs(fileCoef[j*nWs + i] - rms_vecACC[j][i]);
sigSum += fabs(fileCoef[j*nWs + i]);
sigSum2 += fabs(rms_vecACC[j][i]);
}
}
sigSum = sigSum / lineCnt;
sigSum2 = sigSum2 / lineCnt;
mae = mae / lineCnt;
printf("MAE: %f\n", mae);
printf("Original SigSum: %f\n", sigSum);
printf("Obtained SigSum: %f\n", sigSum2);
//--------------------------------------------
myfile2.close();
//Free used memory in ACC cycle
delete [] fileCoef;// */
//TODO delete rms structure
for (int i = 0;i < ln+1; i++){
delete [] rms_vecACC[i];
}
delete [] rms_vecACC;
delete [] tfbACC;
delete [] inter_vec;
delete [] sub_vecACC;
delete [] paramsOUT;
delete [] cA;
delete [] cD;
delete [] filt;
delete [] w_vecACC;
wt_free(wt);
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
delete[] x;
}
double gtEnd = omp_get_wtime(); //Global time
double gtDur = gtEnd - gtBeg;
printf("Global execution time: %f seconds\n", gtDur);
return 0;
}
Looks to be a problem with autocorrvACC calling meanvACC. If I change meanvACC to be a “seq” routine, then the MAE is zero.
It’s legal OpenACC to have meanvACC be vector, but we may have a compiler issue due to the reduction. I’ll get it reported to engineering.
-Mat
Hi Mat,
Thank you for all your help, I’m almost finished with this parallelization task, but now I have another bug, which I suspect can be a compiler issue. I wrap all my code in my Github: AMCM/pgi at master · j-castro/AMCM · GitHub to be easier to compile for you. So just download all the sources and files and compile
make pgimexa2
and then run the executable
./pgimexa2
(You may need to recompile RAFAT’s library (libwavelib2.a)).
So, if I disable OpenACC directives it works perfectly and the MAE is equal to zero. If I enable OpenACC directives I have a execution problem:
./pgimexa2
<<<Using default file: aud/9manatees.wav>>>
Parallel execution time of OpenACC region #1: 0.229746 seconds
Parallel execution time of OpenACC region #2: 0.069345 seconds
call to cuMemcpyDtoHAsync returned error 700: Illegal address during kernel execution
call to cuMemFreeHost returned error 700: Illegal address during kernel execution
In the third and last stage of the program:
//3- Wavelet thresholding ----------------------------------------------------------------------------------------------------------
#pragma acc data copyin(hpd[:lpd_len],lpd[:lpd_len],rms_vecACC[:ln+1][:nWs]), copyout(y[:xSz]), present(w_vecACC) //**ASYNC COPYIN**
{
for (int q = 0; q < nBlocks; q++) { //For each block of windows
int begI = q * bSz * wszINT; //block beggining
int endI = min(begI + (bSz * wszINT),((nBlocks-1) * bSz * wszINT) + (resWsz * wszINT)); //block end
#pragma acc parallel loop gang private(sub_vecACC[:svSz],paramsOUT[:wsvSz],temp_vecACC3[:svSz],filt[:2*lpd_len],tempX[len_X])
for (int i = begI ;i < endI; i = i + wszINT) { //For each window compute the ACF-rms value in the wavelet domain
val1 = i*(ln+1); //added to avoid a pgi compiler's bug
val2 = (i+wszINT)*(ln+1)-1; //added to avoid a pgi compiler's bug
//3.1 Get UDWT coefficients
getSubvACCpgi(w_vecACC, wvSz,val1,val2,paramsOUT, wsvSz);
val1 = ln*wszINT;
val2 = ln*wszINT + wszINT-1;
//3.2 Compute Universal threshold
getSubvACCpgi(paramsOUT, wsvSz,val1,val2,sub_vecACC, svSz); //Obtain highest level coefficients
univ = madvACC(sub_vecACC, svSz, temp_vecACC3) * sqrt(2*log(wszNS)); // 0.6745 decide if using this factor
//3.3 Apply soft thresholding on wavelet coefficients
for (int j=0; j < ln+1; j++) { //for each level
//get all wavelet coef. from level j
#pragma acc loop
for (int k=0; k < wszINT; k++)
sub_vecACC[k] = paramsOUT[k + (j*wszINT)];
//Determine threshold to apply
if (rms_vecACC[j][i/wszINT] == 0) {//if belongs to class 0
multv2ACC(sub_vecACC,svSz,0); //set all coefficients to zero (infinite threshold)
}
else {
softThresvACC(sub_vecACC,svSz,univ); //apply soft thresholding rule using universal threshold
}
//Copyback all thresholded wavelet coef. from level j to sub_vector
#pragma acc loop
for (int k=0; k < wszINT; k++)
paramsOUT[k + (j*wszINT)] = sub_vecACC[k];
}
//3.4 Apply Inverse UDWT
#pragma acc loop
for (int j=0; j < svSz; j++) { //Copy first level coefficients to sub_vecACC (Initialization for imodwtACC)
sub_vecACC[j] = paramsOUT[j];
}
imodwtACC(len_X,level_N, lpd,lpd_len, hpd, paramsOUT, filt, tempX, sub_vecACC); //Inverse UDWT
//concatenate result to output vector
#pragma acc loop
for (int j=0; j< svSz;j++) {
y[i+j] = sub_vecACC[j];
}
lastIdx = i + svSz;
}
}
}
//-----------------------------------------------------------------------------------------------------------------------------------------------
#pragma acc exit data delete(w_vecACC)
So is there a compiler bug or I am missing something???
By the way, if I compile it for multicore it works correctly:
./pgimexa2
<<<Using default file: aud/9manatees.wav>>>
Parallel execution time of OpenACC region #1: 0.370766 seconds
Parallel execution time of OpenACC region #2: 0.078671 seconds
Parallel execution time of OpenACC region #3: 0.296613 seconds
MAE: 0.000000
Original SigSum: 0.004336
Obtained SigSum: 0.004336
Global execution time: 2.152441 seconds
I was able to recreate the illegal address error in 17.10 but it seems to go away in 18.1. 18.1’s MAE is a bit off but this might be an issue with the program since if I use a vector_length==1, then the MAE is zero. I’m a bit swamped right now, but I’ll try to find some time to track down why using vector parallelism the code gets wrong answers.
On a side note, I ran your code through valgrind and noticed that you have an out-of-bounds read access in vec.cpp at lines 1225 and 1127. The value of “i” can be zero here so accessing “f.v[i-1]” is off the beginning of the array. I highly doubt it’s causing the MAE wrong answers, but I thought I would point it out.
x0 = f.v[i-1]; // line 1225
x1 = f.v[i];
y0 = g.v[i-1];
y1 = g.v[i];
y.v[a] = y0 + (((x.v[a]- x0)*y1 -(x.v[a] -x0)*y0)/(x1-x0));//Formula de interpolacion lineal
-Mat
Thank you Mat, I solved the issue with the MAE a bit off by adjusting the value of variable “lastIdx”, so I already update pgimexa.2.cpp on the Github with this change. Also thank you for your note on vec.cpp! I hope you can help me ass soon as you can by tracking the illegal address error but of course I understand you’re busy.
Hey Mat, did you have a chance to look at the execution error?
I did but wasn’t able to determine the root cause. Though since the error no longer occurs in 18.1, I assume it was some compiler issue that has since been fixed.
Were you able to try using 18.1?
-Mat
I don’t have access to compiler 18.1 since the latest community edition version is 17.1. Is there any way to have at least a trial version of 18.1 to test it?