The Code:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <windows.h>
// includes, project
#include <C:\Program Files\NVIDIA Corporation\NVIDIA CUDA SDK\common\inc\cutil.h>
// Number of times the full texture sweep in main() is repeated while timed.
#define ITERATIONS 100
// ZZZZ - temp - need to figure out a way to render into a texture
// Device buffer the kernel receives as its output pointer; main() allocates it
// and copies it back into a CUDA array after every launch.
float4* d_data = NULL;
// declare texture reference for 2D float texture
// Three file-scope float4 2D texture references; main() re-binds each of them
// to a different CUDA array before every kernel launch.
texture<float4, 2, cudaReadModeElementType> Tex1;
texture<float4, 2, cudaReadModeElementType> Tex2;
texture<float4, 2, cudaReadModeElementType> Tex3;
// Intentionally empty benchmark kernel: it only derives this thread's 2D
// coordinate from the launch configuration and then returns, so that the host
// can measure pure bind/launch overhead.
//
// Parameters:
//   ThisTex  - device buffer passed by the host as the output target (unused here)
//   VolResX  - horizontal resolution passed by the host (unused here)
//   VolResY  - vertical resolution passed by the host (unused here)
//   Iters    - current outer iteration index passed by the host (unused here)
//
// Expects a 2D grid of 2D blocks; no shared memory is used.
__global__
void ProcessTexs( float4* ThisTex, int VolResX, int VolResY, int Iters )
{
    // Global 2D coordinate of this thread: block origin plus offset in the block.
    int MyX = blockIdx.x * blockDim.x + threadIdx.x;
    int MyY = blockIdx.y * blockDim.y + threadIdx.y;
    // no data movement
    // no computation
}
// Benchmark driver: measures the cost of binding three textures, launching the
// (empty) ProcessTexs kernel and copying the result buffer back into a CUDA
// array, repeated over a stack of textures for ITERATIONS batches.
//
// Prints the average time per batch in milliseconds.
// Returns 1 if the host-side pointer table cannot be allocated; otherwise the
// program leaves through CUT_EXIT.
int
main( int argc, char** argv) {
    // Sizes used throughout; values are the ones the benchmark was written for.
    const int NumTextures = 66;   // CUDA arrays allocated
    const int NumPasses   = 64;   // launches per batch; pass i touches i, i+1, i+2
    const int TexWidth    = 256;
    const int TexHeight   = 256;
    const size_t TexBytes = (size_t)TexWidth * TexHeight * sizeof(float4);

    int i, Iters;
    CUT_DEVICE_INIT();

    // the textures are all float4
    cudaChannelFormatDesc ChannelDesc = cudaCreateChannelDesc<float4>();

    // set texture parameters for all the textures:
    // clamped addressing, point sampling, unnormalized coordinates
    Tex1.addressMode[0] = cudaAddressModeClamp;
    Tex1.addressMode[1] = cudaAddressModeClamp;
    Tex1.filterMode = cudaFilterModePoint;
    Tex1.normalized = false; // access with unnormalized texture coordinates
    Tex2.addressMode[0] = cudaAddressModeClamp;
    Tex2.addressMode[1] = cudaAddressModeClamp;
    Tex2.filterMode = cudaFilterModePoint;
    Tex2.normalized = false; // access with unnormalized texture coordinates
    Tex3.addressMode[0] = cudaAddressModeClamp;
    Tex3.addressMode[1] = cudaAddressModeClamp;
    Tex3.filterMode = cudaFilterModePoint;
    Tex3.normalized = false; // access with unnormalized texture coordinates

    // allocate memory on the board for the textures to process
    // a bunch of pointers to CUDA arrays (host-side table)
    cudaArray **SrcTextures = (cudaArray **)malloc(NumTextures * sizeof(cudaArray *));
    if (SrcTextures == NULL) {
        fprintf( stderr, "Failed to allocate host table for %d CUDA arrays\n", NumTextures);
        return 1;
    }
    // allocate each of the CUDA arrays
    for (i = 0; i < NumTextures; i++) {
        CUDA_SAFE_CALL( cudaMallocArray( &SrcTextures[i], &ChannelDesc, TexWidth, TexHeight ));
    }

    // allocate device memory for result
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_data, TexBytes));

    // make sure Windows isn't time-multiplexing the GPU during this run
    Sleep(500);

    // 32x32 blocks of 8x8 threads
    dim3 DimGrid(32, 32, 1);
    dim3 DimBlock(8, 8, 1);

    // make sure there isn't anything else going on
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    // time the iterations
    unsigned int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));
    for (Iters = 0; Iters < ITERATIONS; Iters++) {
        for (i = 0; i < NumPasses; i++) {
            // three consecutive arrays are bound for each launch
            CUDA_SAFE_CALL( cudaBindTextureToArray( Tex1, SrcTextures[i], ChannelDesc));
            CUDA_SAFE_CALL( cudaBindTextureToArray( Tex2, SrcTextures[i+1], ChannelDesc));
            CUDA_SAFE_CALL( cudaBindTextureToArray( Tex3, SrcTextures[i+2], ChannelDesc));
            ProcessTexs<<< DimGrid, DimBlock, 0 >>>(d_data, TexWidth, TexHeight, Iters);
            // copy the results into the texture for the next iteration
            CUDA_SAFE_CALL( cudaMemcpyToArray( SrcTextures[i+2],
                                               0, 0,
                                               d_data,
                                               TexBytes,
                                               cudaMemcpyDeviceToDevice) );
        } // textures
    } // iterations
    // wait for all queued GPU work before stopping the clock
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutStopTimer( timer));

    float TimeInMs = cutGetTimerValue( timer);
    // average over the number of batches actually run
    printf( "Processing time per batch: %f (ms)\n", TimeInMs / (float)ITERATIONS);
    CUT_SAFE_CALL( cutDeleteTimer( timer));

    // release device resources, then the host-side pointer table
    CUDA_SAFE_CALL( cudaFree(d_data));
    d_data = NULL;
    for (i = 0; i < NumTextures; i++) {
        CUDA_SAFE_CALL( cudaFreeArray(SrcTextures[i]));
    }
    free(SrcTextures);

    CUT_EXIT(argc, argv);
}
This most closely mimics the DirectX implementation, but the CUDA GPU kernel does none of the data movement and none of the computation that the DirectX implementation performs to produce its results.
The DirectX implementation does all the data movement (complex reads, simple writes, about 40 lines of compute) for this data set in 8.84 ms on the same hardware.
The hardware:
3.4 GHz Pentium D, 4GB, 8800 GTX
The Performance:
Binds + Process = 7.02 ms
79% of the time DX takes to solve the entire problem is just CUDA overhead
But with CUDA I can’t render into a texture, so I have to copy the results back into the texture:
Binds + Process + Copy: 24.3 ms
That is 2.75 times longer than the DX implementation, and I still haven’t moved any data or performed any computation in the kernel.
Grid and block size have practically no effect on how long a call to Process takes.
Copying 64 MB of data in (24.3 - 7.02) = 17.28 ms works out to about 3.7 GB/sec, and this is a device-to-device copy.