[CUDA 4.0, Windows Vista SP2, 430 GT, nvcc -arch compute_20 -code compute_20]
Hi,
I wrote one of my first programs. And there is surely something I missed.
I’ve written a kernel in charge of returning the max-element of a matrix (in order to normalize it later).
All the threads are in a same block.
The algorithm begins by doing a shared working copy of the matrix. Then, progressively, the max-element migrates to the first cell of the working copy.
The algorithm seems to be valid from all the traces I inserted. A minority of the threads get a good result : the first ones of the block.
But the others seems to report the intermediate results.
He is my code :
__device__ float maxKernel(float* argMatrice, unsigned argColonnes, unsigned argLignes, unsigned argX, unsigned argY, unsigned argPitchCellules)
{
const unsigned offsetCelluleCourante = argY * argPitchCellules + argX;
const unsigned taille_de_la_Matrice = argPitchCellules * argLignes;
unsigned offsetColonneSuivanteParRapportCourante = +1;
unsigned offsetLigneSuivanteParRapportCourante = +argPitchCellules;
const unsigned offsetLimitesLignes = argPitchCellules * argLignes;
__shared__ float* matrice;
if (!(threadIdx.x | threadIdx.y | threadIdx.z))
{
matrice = (float*) malloc(taille_de_la_Matrice * sizeof(float));
if (matrice == NULL)
{
printf("malloc ECHEC\n");
}
else
{
printf("malloc SUCCES\n");
}
memcpy(matrice, argMatrice, taille_de_la_Matrice * sizeof(float));
*matrice = -1.0;
}
__syncthreads();
float* ptrCelluleCourante = matrice + offsetCelluleCourante;
bool offsetColonneSuivanteEstDansMatrice = offsetColonneSuivanteParRapportCourante < argColonnes;
bool offsetLigneSuivanteEstDansMatrice = ptrCelluleCourante
+
offsetLigneSuivanteParRapportCourante
<
matrice + offsetLimitesLignes;
unsigned parite = argX | argY;
while (!(parite & 1) && (offsetColonneSuivanteEstDansMatrice || offsetLigneSuivanteEstDansMatrice))
{
*ptrCelluleCourante = abs(*ptrCelluleCourante);
if (offsetColonneSuivanteEstDansMatrice)
{
*ptrCelluleCourante = max(*ptrCelluleCourante, abs(*(ptrCelluleCourante
+
offsetColonneSuivanteParRapportCourante)));
printf("max(%f, %f) (0,+%d)\n", *ptrCelluleCourante, *(ptrCelluleCourante + offsetColonneSuivanteParRapportCourante),
offsetColonneSuivanteParRapportCourante);
}
if (offsetLigneSuivanteEstDansMatrice)
{
*ptrCelluleCourante = max(*ptrCelluleCourante, abs(*(ptrCelluleCourante
+
offsetLigneSuivanteParRapportCourante)));
printf("max(%f, %f) (+%d,0)\n", *ptrCelluleCourante, *(ptrCelluleCourante + offsetLigneSuivanteParRapportCourante),
offsetLigneSuivanteParRapportCourante / argPitchCellules);
}
if (offsetLigneSuivanteEstDansMatrice && offsetColonneSuivanteEstDansMatrice)
{
*ptrCelluleCourante = max(*ptrCelluleCourante, abs(*(ptrCelluleCourante
+
offsetColonneSuivanteParRapportCourante
+
offsetLigneSuivanteParRapportCourante)));
printf("max(%f, %f) (+%d,+%d)\n", *ptrCelluleCourante, *(ptrCelluleCourante
+
offsetColonneSuivanteParRapportCourante
+
offsetLigneSuivanteParRapportCourante),
offsetLigneSuivanteParRapportCourante / argPitchCellules,
offsetColonneSuivanteParRapportCourante
);
}
offsetColonneSuivanteParRapportCourante <<= 1;
offsetColonneSuivanteEstDansMatrice = offsetColonneSuivanteParRapportCourante < argColonnes;
offsetLigneSuivanteParRapportCourante <<= 1;
offsetLigneSuivanteEstDansMatrice = ptrCelluleCourante
+
offsetLigneSuivanteParRapportCourante
<
matrice + offsetLimitesLignes;
parite >>= 1;
__syncthreads();
}
__syncthreads();
float the_max = *matrice;
printf("block (%d, %d, %d), thread(%d, %d, %d) max = %f (%p)\n",
blockIdx.x, blockIdx.y, blockIdx.z,
threadIdx.x, threadIdx.y, threadIdx.z, *matrice, matrice);
__syncthreads();
if (!(threadIdx.x | threadIdx.y | threadIdx.z))
{
free(matrice);
}
return the_max;
}
And here is the result on screen (filted to see the result of the last printf instruction of the kernel)
block (0, 0, 0), thread(384, 0, 0) max = 1.269481 (05CFAB20)
.
.
block (0, 0, 0), thread(159, 0, 0) max = 1.269481 (05CFAB20)
block (0, 0, 0), thread(224, 0, 0) max = 1.460979 (05CFAB20)
.
.
block (0, 0, 0), thread(287, 0, 0) max = 1.460979 (05CFAB20)
block (0, 0, 0), thread(160, 0, 0) max = 1.715319 (05CFAB20)
.
.
block (0, 0, 0), thread(191, 0, 0) max = 1.715319 (05CFAB20)
block (0, 0, 0), thread(320, 0, 0) max = 1.941399 (05CFAB20)
.
.
block (0, 0, 0), thread(31, 0, 0) max = 1.941399 (05CFAB20)
What’s wrong in my implementation ?
Thanks a lot