hi all again,
I managed to make some modifications to my kernel in order to use textures, but I didn’t notice any improvement.
The original image is allocated as uchar1. I then declared a CUDA array (for texture use), copied the image data from image to cuArray, and finally bound the CUDA array to the texture reference texRef. image_out is the linearized image.
My question is whether I should expect an improvement over the kernel that uses regular global memory accesses. I am also not completely sure that I understood texture management correctly. Did I use it correctly?
If anyone could run this new kernel on a compute capability 1.3 device to see whether the bandwidth has increased, I would really appreciate it.
Thank you.
[codebox]#define BLOCK_SIZE 16
// BLOCK_SIZE is the edge length, in threads, of the square thread block used
// for the kernel launch (blockSize is BLOCK_SIZE x BLOCK_SIZE x 1).

// File-scope 2-D texture reference whose texels are uchar4 values returned
// as-is (cudaReadModeElementType, i.e. no conversion to normalized float).
// The host code binds it to a cudaArray and the kernel reads it with tex2D().
texture< uchar4, 2, cudaReadModeElementType > texRef;
/**
 * Binarizes an 8-bit single-channel image, four horizontally adjacent pixels
 * per thread.
 *
 * Each thread fetches one uchar4 texel from the texture reference texRef,
 * replaces every component by 0 when it is below `threshold` and by 255
 * otherwise, and stores the result in the pitched output buffer.
 *
 * Launch layout: 2-D grid of 2-D blocks; thread (idx, idy) handles texel idx
 * of row idy. The grid must cover ceil(width / 4) x height threads; surplus
 * threads exit without touching memory.
 *
 * @param image_out   Device buffer receiving the binarized rows.
 * @param devicePitch Distance in bytes between consecutive rows of image_out.
 *                    Must be at least 4 * ceil(width / 4), because the last
 *                    texel of a row is always stored as a whole uchar4.
 * @param width       Image width in pixels (one byte per pixel).
 * @param height      Image height in rows.
 * @param threshold   Components below this value become 0, all others 255.
 */
__global__ void binarizeImageGPUKernel( uchar4 *image_out, size_t devicePitch, unsigned width,
                                        unsigned height, unsigned char threshold )
{
    const unsigned idx = threadIdx.x + blockDim.x * blockIdx.x;
    const unsigned idy = threadIdx.y + blockDim.y * blockIdx.y;

    // Round up so the trailing 1-3 pixels of a row whose width is not a
    // multiple of 4 are processed too. Written without "width + 3" so the
    // expression cannot wrap around for very large widths.
    const unsigned texelsPerRow = width / 4 + ( width % 4 != 0 );

    if( idx >= texelsPerRow || idy >= height )
        return;

    uchar4 pixel4 = tex2D( texRef, idx, idy );

    pixel4.x = ( pixel4.x < threshold ) ? 0 : 255;
    pixel4.y = ( pixel4.y < threshold ) ? 0 : 255;
    pixel4.z = ( pixel4.z < threshold ) ? 0 : 255;
    pixel4.w = ( pixel4.w < threshold ) ? 0 : 255;

    // Rows are devicePitch bytes apart, so step to the row in bytes first and
    // only then index it in uchar4 units.
    uchar4 *row_out = (uchar4 *)( (char *)image_out + devicePitch * idy );
    row_out[idx] = pixel4;
}
// Prints a diagnostic for a failed CUDA runtime call; returns true only when
// `status` is cudaSuccess.
static bool binarizeCudaOk( cudaError_t status, const char *what )
{
    if( status == cudaSuccess )
        return true;
    fprintf( stderr, "binarizeImageGPU: %s failed: %s\n", what, cudaGetErrorString( status ) );
    return false;
}

/**
 * Binarizes an 8-bit single-channel host image in place on the GPU and prints
 * the kernel execution time.
 *
 * The image is uploaded into a cudaArray of uchar4 texels (four pixels per
 * texel), bound to texRef, processed by binarizeImageGPUKernel into a pitched
 * device buffer, and copied back over `image`. On any CUDA error a message is
 * written to stderr, the remaining steps are skipped and all resources
 * acquired so far are released; `image` is left as it was unless the final
 * copy-back itself fails part-way.
 *
 * @param image     Host image, one byte per pixel; overwritten with the result.
 * @param width     Image width in pixels.
 * @param height    Image height in rows.
 * @param hostPitch Distance in bytes between consecutive rows of `image`
 *                  (passed to the 2-D copies as the host pitch).
 * @param threshold Pixels below this value become 0, all others 255.
 */
void binarizeImageGPU( uchar1 *image, unsigned width,
                       unsigned height, unsigned hostPitch, unsigned char threshold )
{
    // Nothing to process; a zero-sized grid would also be an invalid launch.
    if( image == NULL || width == 0 || height == 0 )
        return;

    const unsigned widthInBytes = width * sizeof(uchar1);
    // Number of uchar4 texels per row, rounded up so a partial last texel is
    // still covered.
    const unsigned texelsPerRow = width / 4 + ( width % 4 != 0 );

    float elapsedTime = 0.0f;
    size_t devicePitch = 0;
    uchar4 *image_out = NULL;
    cudaArray *cuArray = NULL;
    cudaEvent_t start, stop;
    bool haveStart = false;
    bool haveStop = false;
    bool textureBound = false;

    const dim3 blockSize( BLOCK_SIZE, BLOCK_SIZE, 1 );
    // One thread per texel: ceil-divide the texel count (not the pixel count)
    // by the block edge. Dividing the block count by 4 afterwards truncates
    // and can even yield a zero-sized grid for narrow images.
    const dim3 gridSize( ( texelsPerRow + BLOCK_SIZE - 1 ) / BLOCK_SIZE,
                         ( height + BLOCK_SIZE - 1 ) / BLOCK_SIZE, 1 );

    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc( 8, 8, 8, 8, cudaChannelFormatKindUnsigned );

    haveStart = binarizeCudaOk( cudaEventCreate( &start ), "cudaEventCreate(start)" );
    bool ok = haveStart;
    if( ok )
    {
        haveStop = binarizeCudaOk( cudaEventCreate( &stop ), "cudaEventCreate(stop)" );
        ok = haveStop;
    }

    // cudaMallocArray takes the width in elements (uchar4 texels), not bytes.
    if( ok )
        ok = binarizeCudaOk( cudaMallocArray( &cuArray, &channelDesc, texelsPerRow, height ),
                             "cudaMallocArray" );
    // The copy width, in contrast, is in bytes.
    if( ok )
        ok = binarizeCudaOk( cudaMemcpy2DToArray( cuArray, 0, 0, image, hostPitch,
                                                  widthInBytes, height, cudaMemcpyHostToDevice ),
                             "cudaMemcpy2DToArray" );

    if( ok )
    {
        texRef.addressMode[0] = cudaAddressModeClamp;
        texRef.addressMode[1] = cudaAddressModeClamp;
        texRef.filterMode = cudaFilterModePoint;
        texRef.normalized = false;
        textureBound = binarizeCudaOk( cudaBindTextureToArray( texRef, cuArray, channelDesc ),
                                       "cudaBindTextureToArray" );
        ok = textureBound;
    }

    // Size the rows in whole texels so the uchar4 store to a partial last
    // texel stays inside the row.
    if( ok )
        ok = binarizeCudaOk( cudaMallocPitch( (void **)&image_out, &devicePitch,
                                              texelsPerRow * sizeof(uchar4), height ),
                             "cudaMallocPitch" );

    if( ok )
        ok = binarizeCudaOk( cudaEventRecord( start, 0 ), "cudaEventRecord(start)" );
    if( ok )
    {
        binarizeImageGPUKernel<<< gridSize, blockSize >>>( image_out, devicePitch, width, height, threshold );
        // Launch-configuration errors are only visible through cudaGetLastError().
        ok = binarizeCudaOk( cudaGetLastError(), "binarizeImageGPUKernel launch" );
    }
    // No host-side synchronization before recording `stop`: it would add the
    // host round-trip latency to the measured interval. Waiting on the event
    // below is sufficient.
    if( ok )
        ok = binarizeCudaOk( cudaEventRecord( stop, 0 ), "cudaEventRecord(stop)" );
    if( ok )
        ok = binarizeCudaOk( cudaEventSynchronize( stop ), "cudaEventSynchronize(stop)" );
    if( ok )
        ok = binarizeCudaOk( cudaEventElapsedTime( &elapsedTime, start, stop ),
                             "cudaEventElapsedTime" );

    if( ok )
        ok = binarizeCudaOk( cudaMemcpy2D( image, hostPitch, image_out, devicePitch,
                                           widthInBytes, height, cudaMemcpyDeviceToHost ),
                             "cudaMemcpy2D" );
    if( ok )
        printf( "Tempo de execucao na GPU sem otimizacao do compilador CUDA /Od: %f ms\n", elapsedTime );

    // Release everything that was acquired, whether or not a step failed.
    if( textureBound )
        cudaUnbindTexture( texRef );
    if( cuArray != NULL )
        cudaFreeArray( cuArray );
    if( image_out != NULL )
        cudaFree( image_out );
    if( haveStop )
        cudaEventDestroy( stop );
    if( haveStart )
        cudaEventDestroy( start );

    // NOTE(review): kept from the original. This tears down the calling host
    // thread's CUDA context, so confirm no caller keeps device state alive
    // across this function.
    cudaThreadExit();
}[/codebox]