trilinear interpolation w/ 2d texture mem: sugg? I need some feedback on this code

Hello everyone,

The following code performs trilinear interpolation using 2d texture memory. The memory initialized is random and the co-ordinates to be looked up are also randomly generated.
These co-ordinates are stored in a two dimensional array and every thread picks up the relevant (as per its idx) co-ordinates and performs two bilinear interpolations followed a linear interpolation in order to mimic a tri-linear interpolation.

The 3D lookup table of 50X50X50 float values is faked as two dimensional memory (50 X 2500) for storing in texture 2-d memory.

I have the following questions:

  1. Being a newbie in CUDA, i am worried about the efficiency of this code. Any suggestions would be most welcome.

  2. I am using 1D threads in blocks and 1D blocks in grid. I am not sure why I should use the 2d or 3d capability for indexing threads and blocks. Would it be to aid my accessing texture memory when I try to use the full texture memory of 2048X1024 float values?

  3. Since the lookups are independent, would a __syncthreads() be useful in the device code?

  4. Is there a smarter way to perform independent 3d lookups in CUDA?

Thanks,
kpg

#header file:##########################################
#ifndef LOOKUP_H
#define LOOKUP_H

// Thread block size
#define BLOCK_SIZE 16

// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA (1)
#define HA (24576)

#endif // MATRIXMUL_H
##########################################

#host code #############################################
void
runTest( int argc, char** argv)
{
CUT_DEVICE_INIT();

// set seed for rand()		
srand(2006);

// allocate host memory for matrices of xi-yj-zk A 
unsigned int size_A = WA * HA; // WA = 1, HA = num of lookups : 64
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);

// initialize lookup co-ordinates
randomInit(h_A, size_A);

// allocate device memory
float* d_A;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_A, mem_size_A));

// copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, mem_size_A,
                          cudaMemcpyHostToDevice) );

// allocate device memory for result
unsigned int d_size = 32*32;
float* d_data = NULL;
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_data, d_size));

unsigned int width, height;
width = 50;
height = 2500;
unsigned int size = width * height * sizeof(float);	
float* h_data = (float*) malloc(size);
 
// initialize host memory
randomInitnorm(h_data, width*height);


// load table from disk
unsigned int timer_copy = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer_copy));
CUT_SAFE_CALL( cutStartTimer( timer_copy));

 // Allocate array and copy image data
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray* cu_array;


CUDA_SAFE_CALL( cudaMallocArray( &cu_array, &channelDesc, width, height )); 
CUDA_SAFE_CALL( cudaMemcpyToArray( cu_array, 0, 0, h_data, size, cudaMemcpyHostToDevice));



// set texture parameters
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = true;    // access with normalized texture coordinates

// Bind the array to the texture
CUDA_SAFE_CALL( cudaBindTextureToArray( tex, cu_array, channelDesc));
CUT_SAFE_CALL( cutStopTimer( timer_copy ));
printf("Loading, Transfering and Binding time: %f (ms)\n", cutGetTimerValue( timer_copy ));
CUT_SAFE_CALL( cutDeleteTimer( timer_copy ));

dim3 dimBlock(512, 1, 1);

//dim3 dimGrid(width / dimBlock.x, 1, 1);
dim3 dimGrid(16, 1, 1);

// warmup
transformKernel<<< dimGrid, dimBlock, 0 >>>(d_A, d_data, width, height, angle);

CUDA_SAFE_CALL( cudaThreadSynchronize() );
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));

// execute the kernel
transformKernel<<< dimGrid, dimBlock, 0 >>>(d_A, d_data, width, height, angle);

// check if kernel execution generated an error
CUT_CHECK_ERROR("Kernel execution failed");

CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutStopTimer( timer));
printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutDeleteTimer( timer));

// allocate mem for the result on host side
float* h_odata = (float*) malloc( d_size);
// copy result from device to host
CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_data, d_size, cudaMemcpyDeviceToHost) );
//printMatrix(h_odata, size);

// cleanup memory
CUDA_SAFE_CALL(cudaFree(d_A));	
CUDA_SAFE_CALL(cudaFree(d_data));
CUDA_SAFE_CALL(cudaFreeArray(cu_array));
free(h_data);
free(h_odata);
free(h_A);

}

// Prints a matrix with float entries.
void printMatrix(float* data, int size)
{
printf(“A contains:\n”);
for (int i = 0; i <= size; ++i)
{
printf("%f\n", data[i]);
}
}

// Allocates a matrix with random float entries.
void randomInit(float* data, int size)
{
printf(“A contains:\n”);
for (int i = 0; i < size; ++i)
{

  data[i] = 50 * (rand() / (float)RAND_MAX);
      //printf("%f\n", data[i]);
}

}

// Allocates a matrix with random normalized float entries.
void randomInitnorm(float* data, int size)
{
printf(“A contains:\n”);
for (int i = 0; i<size; ++i)
{

  data[i] = (rand() / (float)RAND_MAX);
      //printf("%f\n", data[i]);
}

}
#################################################################

#device code ######################################################
global void
transformKernel(float* A, float* g_odata, int width, int height, float theta)
{
// calculate normalized texture coordinates
unsigned int x = blockIdx.xblockDim.x + threadIdx.x;
//unsigned int y = blockIdx.y
blockDim.y + threadIdx.y;

 __shared__ float X;
 __shared__ float Y;
 __shared__ float Z;


 int Z1;

 float VAL1;
 float VAL2;

 float a;
 float VAL;


 X = A[3*x];
 Y = A[3*x + 1];
 Z = A[3*x + 2];

 Z1 = floorf(Z);

 //printf("X %f ", X);
 //printf("Y %f ", Y);	
 //printf("Z %f\n", Z);


 //__syncthreads();


// read from texture and write to global memory
VAL1 = tex2D(tex, X, (Y + 50*Z1));
VAL2 = tex2D(tex, X, (Y + 50*(Z1+1)));

a = Z - Z1;
VAL = (1-a)*VAL1 + a*VAL2;

g_odata[x] = VAL;

//printf("Value %f\n", VAL);	
//g_odata[y*width + x] = tex2D(tex, X, Y);

}

#endif // #ifndef SIMPLETEXTURE_KERNEL_H
#####################################################################

  1. Why do you declare X, Y, Z shared? They seem local semantically. If you just want to save registers, you should declare X, Y and Z as volatile shared arrays, not just shared variables.
  2. You may need to pad each 50x50 block with an extra row to handle boundary case correctly.
  3. You can rearrange array A to get coalesced read. Currently your bottleneck may be the A read rather than the tex read.

Thanks for posting your code. What kind of performance are you getting now?