Hi! I have this code:
#pragma acc data copyin(image[0:lines*samples*bands]) \
                 copyout(count[0:FPS]) \
                 create(meanSpect[0:bands], Cov[0:bands*bands], Corr[0:bands*bands], CovEigVal[0:bands], CorrEigVal[0:bands], U[0:bands*bands], VT[0:bands*bands]) // using copyin for everything gave better performance
{
    #pragma acc parallel loop vector
    for(i=0; i<bands; i++)
    {
        mean=0;
        #pragma acc loop reduction(+:mean)
        for(j=0; j<N; j++)
            mean+=(image[(i*N)+j]);
        mean/=N;
        meanSpect[i]=mean;
        #pragma acc loop
        for(j=0; j<N; j++)
            image[(i*N)+j]=image[(i*N)+j]-mean;
    }

    double alpha = (double)1/N, beta = 0;
    #pragma acc host_data use_device(image, Cov)
    {
        // dgemm_("T", "N", &bands, &bands, &N, &alpha, image, &N, image, &N, &beta, Cov, &bands);
        cublasDgemm(handle_gemm, CUBLAS_OP_T, CUBLAS_OP_N, bands, bands, N, &alpha, image, N, image, N, &beta, Cov, bands);
    }

    //CORRELATION
    #pragma acc parallel loop collapse(2) // collapse(2) fuses the two for loops, which works nicely here
    for(j=0; j<bands; j++)
        for(i=0; i<bands; i++)
            Corr[(i*bands)+j] = Cov[(i*bands)+j]+(meanSpect[i] * meanSpect[j]);

    //SVD
    // Declarations moved up earlier
    #pragma acc host_data use_device(Cov, Corr, CovEigVal, CorrEigVal, U, VT)
    {
        //dgesvd_("N", "N", &bands, &bands, Cov, &bands, CovEigVal, U, &bands, VT, &bands, work, &lwork, &info);
        cusolverDnDgesvd(cusolverHandle, 'N', 'N', bands, bands, Cov, bands, CovEigVal, U, bands, VT, bands, work, lwork, rwork, info);
        //dgesvd_("N", "N", &bands, &bands, Corr, &bands, CorrEigVal, U, &bands, VT, &bands, work, &lwork, &info);
        cusolverDnDgesvd(cusolverHandle, 'N', 'N', bands, bands, Corr, bands, CorrEigVal, U, bands, VT, bands, work, lwork, rwork, info);
    }

    //ESTIMATION
    // The count declaration was moved up earlier
    double e;
    #pragma acc loop seq
    for(i=0; i<FPS; i++) count[i] = 0; // Probably not worth parallelizing this
    #pragma acc loop seq // It would be nice to parallelize this, but I haven't found a way
    for(i=0; i<bands; i++)
    {
        // If each thread had its own copy of sigmaSquareTest, this could be parallelized
        sigmaSquareTest = (CovEigVal[i]*CovEigVal[i]+CorrEigVal[i]*CorrEigVal[i])*2/samples/lines;
        sigmaTest = sqrt(sigmaSquareTest);
        // Not worth parallelizing: FPS is a #define and a small number (5), and the count array shouldn't be parallelized over anyway
        #pragma acc loop seq
        for(j=1;j<=FPS;j++)
        {
            switch(j)
            {
                case 1: e = 0.906193802436823;
                        break;
                case 2: e = 1.644976357133188;
                        break;
                case 3: e = 2.185124219133003;
                        break;
                case 4: e = 2.629741776210312;
                        break;
                case 5: e = 3.015733201402701;
                        break;
            }
            TaoTest = sqrt(2) * sigmaTest * e;
            if((CorrEigVal[i]-CovEigVal[i]) > TaoTest)
                count[j-1]++;
        }
    }

    #pragma acc data present(count[0:FPS])
    {
        for(i = 0; i < FPS; i++) printf("\n%d\n",count[i]);
    }
}
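For reference, the relevant declarations outside the snippet look roughly like this (reconstructed from the code above; the exact types and the cuBLAS/cuSOLVER setup are my best guesses, not necessarily what I have verbatim):

#include <cublas_v2.h>
#include <cusolverDn.h>

#define FPS 5

int lines, samples, bands;           // image dimensions
int N;                               // N = samples * lines
double *image;                       // lines * samples * bands elements
double *meanSpect, *Cov, *Corr;      // bands, bands*bands, bands*bands elements
double *CovEigVal, *CorrEigVal;      // bands elements each
double *U, *VT;                      // bands*bands elements each
int count[FPS];
double mean, sigmaSquareTest, sigmaTest, TaoTest;
int i, j;

cublasHandle_t handle_gemm;          // from cublasCreate()
cusolverDnHandle_t cusolverHandle;   // from cusolverDnCreate()
double *work, *rwork;                // cuSOLVER device workspace buffers
int lwork;                           // workspace size from cusolverDnDgesvd_bufferSize()
int *info;                           // device-side status pointer for cusolverDnDgesvd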
I know that the three sequential loops after //ESTIMATION are not executed on the GPU. Here is the thing: I want the count array to be updated on the GPU (the device) and then copied back to the host (via the copyout clause at the top of the data region).
Right now, after the whole region shown above, if I print the count array every element is 0. However, if I remove the copyout clause, or change it to copyin, and do the same print after the region, then count is no longer all zeros and holds the expected values.
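To show what I think might be happening (this is just a minimal sketch I put together, not my actual code): a loop that only runs on the host inside a copyout data region seems to get its results overwritten when the region ends:

#include <stdio.h>

int main(void)
{
    int out[5] = {0};

    #pragma acc data copyout(out[0:5])
    {
        // No parallel/kernels construct here, so this loop runs on the host
        // and only writes the host copy of out.
        for (int i = 0; i < 5; i++)
            out[i] = i + 1;
    }
    // At the closing brace, copyout copies the device version of out -- which
    // was never written -- back over the host values.

    for (int i = 0; i < 5; i++)
        printf("%d\n", out[i]);   // if my understanding is right, this prints
                                  // the untouched device data (zeros for me),
                                  // not 1..5

    return 0;
}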
This is the output of the make process (I'm using PGCC):
404, Generating copyout(count[:5]) [if not already present]
Generating create(VT[:bands*bands],meanSpect[:bands]) [if not already present]
Generating copyin(image[:bands*(samples*lines)]) [if not already present]
Generating create(CovEigVal[:bands],CorrEigVal[:bands],Corr[:bands*bands],Cov[:bands*bands],U[:bands*bands]) [if not already present]
406, Loop is parallelizable
Generating Tesla code
406, #pragma acc loop vector(128) /* threadIdx.x */
411, #pragma acc loop gang /* blockIdx.x */
Generating reduction(+:mean)
418, #pragma acc loop gang /* blockIdx.x */
433, Generating Tesla code
433, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
434, /* blockIdx.x threadIdx.x collapsed */
454, Generated vector simd code for the loop
Residual loop unrolled 1 times (completely unrolled)
457, Loop not vectorized/parallelized: not countable
488, Generating present(count[:5])
489, Loop not vectorized/parallelized: contains call
493, Loop not vectorized/parallelized: contains call
Which count variable is actually being updated by the code, and why? How do I fix it?
Thank you!