Hello, I’m sorry for my bad english, I’m french.
I have a problem with cudaArray & textures.
The next function is launched several times on many different datas. These datas are transferred to GPU global memory to make bilinear interpolation on it.
The first time the function is launched, the result of bilinear interpolation is ok. But the next times, the first line of the first image transfered to GPU is
yet binded with texture and so the interpolation is not ok.
To be practical, the first line of the texture is the same over iterration.
I can take pictures of my problem if necessary.
I hope I was clear.
// Executed several times
void gpuRunPartialZNCC( tile *tile1, tile *tile2, measure *listMeasure, int nbParam, int numTile,
double *h_sumIm1, double *h_sumIm2, double *h_squareSumIm1, double *h_squareSumIm2, double *h_crossSum, interpolateMode interMode)
{
// Tuiles et imagette extraite GPU
float *d_extract;
cudaArray *d_tile2;
// Taille des zones mémoires
size_t sizeTemp;
// Descritption de Canal pour la texture
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
// Dimension du GPU
dim3 blockSizeRED( NBTREDUC );
dim3 gridSizeRED( ((tile1->size.x * tile1->size.y) / NBTREDUC) / 2 );
int smemSizeRED = NBTREDUC*sizeof(float);
dim3 blockSizeEXT(NBTINTER,NBTINTER);
dim3 gridSizeEXT(tile1->size.x/NBTINTER, tile1->size.y/NBTINTER);
// Allocation mémoire et copie des images sur le GPU
sizeTemp = tile1->size.x * tile1->size.y * sizeof(float);
cudaMemcpy( d_tile1, tile1->data, sizeTemp, cudaMemcpyHostToDevice );
cudaMalloc( (void **)&d_extract, sizeTemp );
sizeTemp = tile2->size.x * tile2->size.y * sizeof(float);
cudaMallocArray( &d_tile2, &channelDesc, tile2->size.x, tile2->size.y );
cudaMemcpyToArray( d_tile2, 0, 0, (void*)(tile2->data), sizeTemp, cudaMemcpyHostToDevice);
// Bind de la texture
texture<float, 2, cudaReadModeElementType> &myTexture = getTexture();
if (interMode == BILINEAR)
myTexture.filterMode = cudaFilterModeLinear; // Interpolation intégrée
else
myTexture.filterMode = cudaFilterModePoint;
myTexture.normalized = false;
cudaBindTextureToArray( myTexture, d_tile2, channelDesc);
for ( int measureCurent = 0; measureCurent < nbParam; measureCurent++)
{
// Extraction de l'imagette
extractImg(d_extract, tile1->size, tile1->offset, tile2->offset, listMeasure[measureCurent].param, gridSizeEXT, blockSizeEXT);
}
cudaUnbindTexture( myTexture );
// Libération mémoire
cudaFree( d_extract );
cudaFreeArray( d_tile2 );
}
Kernel :
__global__ void extraction(float *d_out, int dimOut, int im1offsetX, int im1offsetY, int im2offsetX, int im2offsetY, float ax, float ay, float bx, float by, float cx, float cy) {
int ix = IMUL(blockDim.x, blockIdx.x) + threadIdx.x + im1offsetX;
int iy = IMUL(blockDim.y, blockIdx.y) + threadIdx.y + im1offsetY;
float xp = ax*ix + bx*iy + cx + .5 - im2offsetX; // +.5 => Correction de l'emplacement du pixel sur GPU
float yp = ay*ix + by*iy + cy + .5 - im2offsetY;
d_out[iy*dimOut + ix] = tex2D(getDeviceTexture(), xp, yp);
}
void extractImg(float *d_extract, coord2D extractSize, coord2D tile1Offset, coord2D tile2Offset, polynomialParam parameters, dim3 gridSize, dim3 blockSize)
{
float ax = parameters.ax;
float ay = parameters.ay;
float bx = parameters.bx;
float by = parameters.by;
float cx = parameters.cx;
float cy = parameters.cy;
printf("\nOffset image1 : %d, %d\t Offset image2 : %d, %d\n\n", tile1Offset.x, tile1Offset.y, tile2Offset.x, tile2Offset.y);
extraction<<< gridSize, blockSize >>>(d_extract, extractSize.x, tile1Offset.x, tile1Offset.y, tile2Offset.x, tile2Offset.y, ax, ay, bx, by, cx, cy);
cudaThreadSynchronize();
}
Thanks for your help.