Problem with cudaArray 2D and texture Datas are not completly unbind?

Hello, I’m sorry for my bad english, I’m french.

I have a problem with cudaArray & textures.

The next function is launched several times on many different datas. These datas are transferred to GPU global memory to make bilinear interpolation on it.

The first time the function is launched, the result of bilinear interpolation is ok. But the next times, the first line of the first image transfered to GPU is

yet binded with texture and so the interpolation is not ok.

To be practical, the first line of the texture is the same over iterration.

I can take pictures of my problem if necessary.

I hope I was clear.

// Executed several times

void gpuRunPartialZNCC( tile *tile1, tile *tile2, measure *listMeasure, int nbParam, int numTile,

					   double *h_sumIm1, double *h_sumIm2, double *h_squareSumIm1, double *h_squareSumIm2, double *h_crossSum, interpolateMode interMode)

{

	// Tuiles et imagette extraite GPU

	float *d_extract;

	cudaArray *d_tile2;

	// Taille des zones mémoires

	size_t sizeTemp;

	// Descritption de Canal pour la texture

	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

	// Dimension du GPU

	dim3	blockSizeRED( NBTREDUC );

	dim3	gridSizeRED( ((tile1->size.x * tile1->size.y) / NBTREDUC) / 2 );

	int		smemSizeRED = NBTREDUC*sizeof(float);

	dim3	blockSizeEXT(NBTINTER,NBTINTER);

	dim3	gridSizeEXT(tile1->size.x/NBTINTER, tile1->size.y/NBTINTER);

	// Allocation mémoire et copie des images sur le GPU

	sizeTemp = tile1->size.x * tile1->size.y * sizeof(float);

	cudaMemcpy( d_tile1, tile1->data, sizeTemp, cudaMemcpyHostToDevice );

	

	cudaMalloc( (void **)&d_extract, sizeTemp );

	sizeTemp = tile2->size.x * tile2->size.y * sizeof(float);

	cudaMallocArray( &d_tile2, &channelDesc, tile2->size.x, tile2->size.y );

	cudaMemcpyToArray( d_tile2, 0, 0, (void*)(tile2->data), sizeTemp, cudaMemcpyHostToDevice);

	// Bind de la texture

	texture<float, 2, cudaReadModeElementType> &myTexture = getTexture();

	if (interMode == BILINEAR)

		myTexture.filterMode = cudaFilterModeLinear;	// Interpolation intégrée

	else

		myTexture.filterMode = cudaFilterModePoint;

	myTexture.normalized = false;

	cudaBindTextureToArray( myTexture, d_tile2, channelDesc);

	for ( int measureCurent = 0; measureCurent < nbParam; measureCurent++) 

	{

		// Extraction de l'imagette

		extractImg(d_extract, tile1->size, tile1->offset, tile2->offset, listMeasure[measureCurent].param, gridSizeEXT, blockSizeEXT);

	}

	cudaUnbindTexture( myTexture );

	// Libération mémoire

	cudaFree( d_extract );

	cudaFreeArray( d_tile2 );

}

Kernel :

__global__ void extraction(float *d_out, int dimOut, int im1offsetX, int im1offsetY, int im2offsetX, int im2offsetY, float ax, float ay, float bx, float by, float cx, float cy) {

	int ix		= IMUL(blockDim.x, blockIdx.x) + threadIdx.x + im1offsetX;

	int iy		= IMUL(blockDim.y, blockIdx.y) + threadIdx.y + im1offsetY;

	float xp	= ax*ix + bx*iy + cx  + .5 - im2offsetX;	// +.5 => Correction de l'emplacement du pixel sur GPU

	float yp	= ay*ix + by*iy + cy  + .5 - im2offsetY;

	d_out[iy*dimOut + ix] = tex2D(getDeviceTexture(), xp, yp);

}

void extractImg(float *d_extract, coord2D extractSize, coord2D tile1Offset, coord2D tile2Offset, polynomialParam parameters, dim3 gridSize, dim3 blockSize)

{

	float ax = parameters.ax;

	float ay = parameters.ay;

	float bx = parameters.bx;

	float by = parameters.by;

	float cx = parameters.cx;

	float cy = parameters.cy;

	printf("\nOffset image1 : %d, %d\t Offset image2 : %d, %d\n\n", tile1Offset.x, tile1Offset.y, tile2Offset.x, tile2Offset.y);

	extraction<<< gridSize, blockSize >>>(d_extract, extractSize.x, tile1Offset.x, tile1Offset.y, tile2Offset.x, tile2Offset.y, ax, ay, bx, by, cx, cy);

	cudaThreadSynchronize();

}

Thanks for your help.

I add 2 pictures :

On the left side, you can see the first picture binded with texture. On the right side the picture binded in the second iterration. You can see that the first line of the second picture is the same than the first line of the first picture.

Someone have any idea?

Hello,

No one have an idea?

I test on a mac pro with cuda 2.2 and it’s the same thing. Is it a driver bug?

Do you have a small compilable test case that illustrates the problem?

N.

I’m so noob…

global void extraction(float *d_out, int dimOut, int im1offsetX, int im1offsetY, int im2offsetX, int im2offsetY, float ax, float ay, float bx, float by, float cx, float cy) {

int ix		= IMUL(blockDim.x, blockIdx.x) + threadIdx.x + im1offsetX;
int iy		= IMUL(blockDim.y, blockIdx.y) + threadIdx.y + im1offsetY;

float xp	= ax*ix + bx*iy + cx  + .5 - im2offsetX;	// +.5 => Correction de l'emplacement du pixel sur GPU
float yp	= ay*ix + by*iy + cy  + .5 - im2offsetY;

d_out[[b]([/b]iy[b]-im1offsetY)[/b]*dimOut + [b]([/b]ix[b]-im1offsetX)[/b]] = tex2D(getDeviceTexture(), xp, yp);

}

Thanks for help, you open my eyes :D