Hello everyone,
The following code performs trilinear interpolation using 2d texture memory. The memory initialized is random and the co-ordinates to be looked up are also randomly generated.
These co-ordinates are stored in a two-dimensional array, and every thread picks up the relevant (as per its idx) co-ordinates and performs two bilinear interpolations followed by a linear interpolation in order to mimic a trilinear interpolation.
The 3D lookup table of 50X50X50 float values is faked as two dimensional memory (50 X 2500) for storing in texture 2-d memory.
I have the following questions:
-
Being a newbie in CUDA, I am worried about the efficiency of this code. Any suggestions would be most welcome.
-
I am using 1D threads in blocks and 1D blocks in grid. I am not sure why I should use the 2d or 3d capability for indexing threads and blocks. Would it be to aid my accessing texture memory when I try to use the full texture memory of 2048X1024 float values?
-
Since the lookups are independent, would a __syncthreads() be useful in the device code?
-
Is there a smarter way to perform independent 3d lookups in CUDA?
Thanks,
kpg
#header file:##########################################
#ifndef LOOKUP_H
#define LOOKUP_H
// Thread block size
#define BLOCK_SIZE 16
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
// WA x HA = total floats in the coordinate array A; the kernel reads them
// as consecutive (x, y, z) triples (A[3*i], A[3*i+1], A[3*i+2]), so
// HA = 24576 floats holds 8192 lookup coordinates.
#define WA (1)
#define HA (24576)
#endif // LOOKUP_H
##########################################
#host code #############################################
// Runs one trilinear-interpolation pass: fills a coordinate array and a
// 50 x 2500 table (a 50x50x50 volume with z unrolled into rows) with random
// values, binds the table to the 2D texture `tex`, launches transformKernel
// (1D grid: 16 blocks x 512 threads, one lookup per thread), and reports
// copy/bind and kernel timings.
void
runTest( int argc, char** argv)
{
    CUT_DEVICE_INIT();
    // fixed seed so runs are reproducible
    srand(2006);

    // Host memory for the (x,y,z) lookup coordinates: HA floats = HA/3 triples.
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*) malloc(mem_size_A);
    randomInit(h_A, size_A);

    // Device copy of the coordinates.
    float* d_A;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_A, mem_size_A));
    CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, mem_size_A,
                              cudaMemcpyHostToDevice) );

    // Device memory for the result: one float per coordinate triple.
    // BUG FIX: the original passed d_size = 32*32 (an ELEMENT count) as the
    // BYTE count to cudaMalloc, and the 16 x 512 = 8192-thread launch wrote
    // one float per thread -- far past the 1024 bytes actually allocated.
    unsigned int num_lookups = size_A / 3;                    // 8192 results
    unsigned int mem_size_out = num_lookups * sizeof(float);
    float* d_data = NULL;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_data, mem_size_out));

    // Host copy of the 50 x 2500 lookup table.
    unsigned int width, height;
    width = 50;
    height = 2500;
    unsigned int size = width * height * sizeof(float);
    float* h_data = (float*) malloc(size);
    randomInitnorm(h_data, width*height);

    unsigned int timer_copy = 0;
    CUT_SAFE_CALL( cutCreateTimer( &timer_copy));
    CUT_SAFE_CALL( cutStartTimer( timer_copy));

    // Allocate the CUDA array and copy the table into it.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cu_array;
    CUDA_SAFE_CALL( cudaMallocArray( &cu_array, &channelDesc, width, height ));
    CUDA_SAFE_CALL( cudaMemcpyToArray( cu_array, 0, 0, h_data, size, cudaMemcpyHostToDevice));

    // Texture parameters.
    // BUG FIX: the original set normalized = true while the kernel passes
    // texel-range coordinates (X in [0,50), Y+50*Z in [0,2500)); with
    // normalized coordinates plus wrap addressing every fetch collapsed onto
    // a tiny corner of the table. Use unnormalized coordinates and clamp so
    // the z = 49 upper slice does not wrap around to slice 0.
    tex.addressMode[0] = cudaAddressModeClamp;
    tex.addressMode[1] = cudaAddressModeClamp;
    tex.filterMode = cudaFilterModeLinear;    // hardware bilinear per fetch
    tex.normalized = false;                   // address with texel indices
    CUDA_SAFE_CALL( cudaBindTextureToArray( tex, cu_array, channelDesc));

    CUT_SAFE_CALL( cutStopTimer( timer_copy ));
    printf("Loading, Transfering and Binding time: %f (ms)\n", cutGetTimerValue( timer_copy ));
    CUT_SAFE_CALL( cutDeleteTimer( timer_copy ));

    // 1D launch: 16 blocks x 512 threads = 8192 threads, one lookup each.
    dim3 dimBlock(512, 1, 1);
    dim3 dimGrid(16, 1, 1);
    // BUG FIX: `angle` was never declared in this file; the kernel ignores
    // its theta parameter, so any value works.
    float angle = 0.0f;

    // Warmup launch (the first launch carries one-time setup overhead that
    // would otherwise pollute the timing below).
    transformKernel<<< dimGrid, dimBlock, 0 >>>(d_A, d_data, width, height, angle);
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    unsigned int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));
    // Timed kernel execution.
    transformKernel<<< dimGrid, dimBlock, 0 >>>(d_A, d_data, width, height, angle);
    CUT_CHECK_ERROR("Kernel execution failed");
    // Kernel launches are asynchronous: synchronize before stopping the timer.
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutDeleteTimer( timer));

    // Copy the result back (byte count, matching the device allocation).
    float* h_odata = (float*) malloc( mem_size_out);
    CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_data, mem_size_out, cudaMemcpyDeviceToHost) );
    //printMatrix(h_odata, num_lookups);

    // cleanup memory
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_data));
    CUDA_SAFE_CALL(cudaFreeArray(cu_array));
    free(h_data);
    free(h_odata);
    free(h_A);
}
// Prints the first `size` float entries of `data` to stdout, one per line,
// preceded by a banner.
void printMatrix(float* data, int size)
{
    printf("A contains:\n");
    // BUG FIX: the original loop condition was `i <= size`, reading one
    // element past the end of the array; it also used smart quotes in the
    // printf format strings, which do not compile.
    for (int i = 0; i < size; ++i)
    {
        printf("%f\n", data[i]);
    }
}
// Fills `data` with `size` random floats uniformly drawn from [0, 50]
// (lookup coordinates into the 50-wide table). Deterministic for a fixed
// srand() seed.
void randomInit(float* data, int size)
{
    // BUG FIX: the original printed an "A contains:" banner with smart
    // quotes -- a compile error and a copy-paste leftover from the print
    // routine; an initializer should not write to stdout.
    for (int i = 0; i < size; ++i)
    {
        data[i] = 50 * (rand() / (float)RAND_MAX);
    }
}
// Fills `data` with `size` random floats uniformly drawn from [0, 1]
// (normalized table entries). Deterministic for a fixed srand() seed.
void randomInitnorm(float* data, int size)
{
    // BUG FIX: the original printed an "A contains:" banner with smart
    // quotes -- a compile error and a copy-paste leftover from the print
    // routine; an initializer should not write to stdout.
    for (int i = 0; i < size; ++i)
    {
        data[i] = (rand() / (float)RAND_MAX);
    }
}
#################################################################
#device code ######################################################
// Trilinear interpolation kernel: thread i reads its (X, Y, Z) coordinate
// triple from A, performs two hardware bilinear fetches from the 2D texture
// (one per bounding z-slice, slices stacked 50 rows apart), and linearly
// blends them in z. Expects a 1D launch with exactly one thread per triple
// (gridDim.x * blockDim.x triples stored in A); width/height/theta are
// unused. NOTE(review): coordinates are passed as texel indices -- confirm
// the texture is bound with tex.normalized == false.
__global__ void
transformKernel(float* A, float* g_odata, int width, int height, float theta)
{
    // BUG FIX: the posted code read "blockIdx.xblockDim.x" (the '*' was
    // eaten by forum markup) and declared the kernel "global" instead of
    // "__global__".
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;

    // BUG FIX: X, Y, Z were declared __shared__, so all 512 threads of a
    // block raced to write the same three variables and then read whichever
    // thread's values landed last. Per-thread values belong in registers.
    float X = A[3*x];
    float Y = A[3*x + 1];
    float Z = A[3*x + 2];

    int Z1 = floorf(Z);      // lower bounding z-slice
    float a = Z - Z1;        // fractional distance toward the upper slice

    // One bilinear fetch per bounding slice; slice k occupies rows
    // [50*k, 50*(k+1)) of the 50 x 2500 texture. NOTE(review): for Z at the
    // top slice, Z1+1 addresses past row 2499 -- relies on the texture's
    // address mode to keep the fetch in range.
    float VAL1 = tex2D(tex, X, (Y + 50*Z1));
    float VAL2 = tex2D(tex, X, (Y + 50*(Z1+1)));

    // Linear blend in z completes the trilinear interpolation.
    g_odata[x] = (1-a)*VAL1 + a*VAL2;
}
#endif // #ifndef SIMPLETEXTURE_KERNEL_H
#####################################################################