I have a Tesla card in a dual dual-core processor workstation running Ubuntu Gutsy (7.10) and have just started CUDA programming.
I am implementing a signal enveloping algorithm for medical signal processing that requires transfer to and from the card of an integer array 6MB in size.
Based on results from the bandwidthTest.cu program, I do not seem to be getting the correct bandwidth on device-to-host or host-to-device memory transfers (I’m using page-locked memory). The numbers the bandwidthTest program returns for a 6 MB chunk of data are:
Device to Host:
Transfer Size (Bytes):6291456 Bandwidth(MB/s): 2509.5
Host to Device:
Transfer Size (Bytes): 6291456 Bandwidth(MB/s): 2913.8
The numbers I get from my program (I’ve included the code listing below) are:
Host to Device Copy Bandwidth=1304.915161(MB/s)
Device to Host Copy Bandwidth=194.603012(MB/s)
I’ve taken care to initialze all memory and cuFFT plans prior to any timing calls, so that memory allocation is not included in my timing values (I’ve included program output below also).
I’ve also looked over the source code in the file: bandwidthTest.cu. The only difference I can see between that program and mine is that the bandwidthTest passes data of type: unsigned char back and forth, while I am passing data of type: short int. I’ve searched the NVIDIA_CUDA_Programming_Guide_1.1.pdf file and haven’t found any clues there.
Am I neglecting to correctly set up card in some manner? Or are there differences between the two types that can slow the data transfer (e.g. variable alignment).
My application requires that I process 6MB chunks in less than 0.03s/chunk, so currently my program is just below its performance spec. Can any one see a way to improve the transfer rate so it approaches those returned by the bandwidthTest.cu program?
Thanks
Code listing…
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cufft.h>
#include <cutil.h>
#include <cuda_runtime_api.h>
// Complex data type
typedef float2 Complex;
static device host inline Complex ComplexAdd(Complex, Complex);
static device host inline Complex ComplexScale(Complex, float);
static device host inline Complex ComplexMul(Complex, Complex);
static global void ComplexPointwiseMulAndScale(Complex*, const Complex*, int, float);
static global void ComplexPointwiseMagAndScale(Complex*, const Complex*, int, float);
static global void Complexify(Complex* a, const short int* b, int size);
static global void DeComplexify(short int* a, const Complex* b, int size);
static Complex d_signal;
static Complex h_signal;
static Complex* h_convolved_signal;
static short int d_shortinteger_signal;
static short int h_shortinteger_signal;
static short int* d_shortinteger_convolved_signal;
static short int* h_shortinteger_convolved_signal;
static long cudaMemCopySize;
static Complex* d_filter_kernel;
static Complex* h_filter_kernel;
static cufftHandle plan;
int InitCUDAAnalyticSignal(long npts, long nlines)
{
long i;
cudaMemCopySize=sizeof(short int) * npts * nlines;
do {
int deviceCount;
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount));
if (deviceCount == 0)
{
fprintf(stderr, “There is no device.\n”);
exit(EXIT_FAILURE);
}
else
fprintf(stderr,“Found CUDA Devices %d\n”,deviceCount);
int dev;
for (dev = 0; dev < deviceCount; ++dev)
{
cudaDeviceProp deviceProp;
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, dev));
if (deviceProp.major >= 1)
break;
}
if (dev == deviceCount)
{
fprintf(stderr, “There is no device supporting CUDA.\n”);
exit(EXIT_FAILURE);
}
else
{
CUDA_SAFE_CALL(cudaSetDevice(dev));
fprintf(stderr,“Program Will Execute On CUDA Device %d\n”,dev);
}
} while (0);
CUFFT_SAFE_CALL(cufftPlan1d(&plan, npts, CUFFT_C2C, nlines););
h_signal = (Complex*)malloc(sizeof(Complex) * npts* nlines);
CUDA_SAFE_CALL(cudaMalloc((void**)&d_signal, sizeof(Complex) * npts * nlines));
h_convolved_signal = (Complex*)malloc(sizeof(Complex) * npts* nlines);
CUDA_SAFE_CALL(cudaMalloc((void**)&d_shortinteger_signal, sizeof(short int) * npts * nlines));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_shortinteger_convolved_signal, sizeof(short int) * npts * nlines));
//h_shortinteger_signal = (int*)malloc(sizeof(int) * npts* nlines);
//h_shortinteger_convolved_signal = (int*)malloc(sizeof(int) * npts* nlines);
cudaMallocHost( (void**)&h_shortinteger_signal,sizeof(short int) * npts* nlines);
cudaMallocHost((void**)& h_shortinteger_convolved_signal,sizeof(short int) * npts* nlines);
CUDA_SAFE_CALL(cudaMalloc((void**)&d_filter_kernel, sizeof(Complex) * npts * nlines));
h_filter_kernel = (Complex*)malloc(sizeof(Complex) * npts* nlines);
// Initalize the memory for the filter
h_filter_kernel[0].x = 1.0;
h_filter_kernel[0].y = 0.0;
for (i = 1; i < npts/2; i++)
{
h_filter_kernel[i].x = 1.0;
h_filter_kernel[i].y = 1.0;
}
for (i = npts/2; i < npts; i++)
{
h_filter_kernel[i].x = 0.0;
h_filter_kernel[i].y = 0.0;
}
for(i=npts;i<npts * nlines;i++)
{
h_filter_kernel[i].x = h_filter_kernel[i%npts].x;
h_filter_kernel[i].y =h_filter_kernel[i%npts].y ;
}
// Copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_filter_kernel, h_filter_kernel, sizeof(Complex) * npts * nlines, cudaMemcpyHostToDevice));
return 1;
}
int CloseCUDAAnalyticSignal(void)
{
//Destroy CUFFT context
CUFFT_SAFE_CALL(cufftDestroy(plan));
// cleanup memory
free(h_signal);
free(h_convolved_signal);
CUDA_SAFE_CALL(cudaFree(d_signal));
CUDA_SAFE_CALL(cudaFree(d_shortinteger_signal));
CUDA_SAFE_CALL(cudaFree(d_shortinteger_convolved_signal));
//free(h_shortinteger_signal);
//free(h_shortinteger_convolved_signal);
cudaFreeHost(h_shortinteger_signal);
cudaFreeHost(h_shortinteger_convolved_signal);
free(h_filter_kernel);
CUDA_SAFE_CALL(cudaFree(d_filter_kernel));
return 1;
}
int CUDAAnalyticSignal( short int *InputData,
short int *OutputData,
long npts,long nlines)
{
long i,j;
unsigned int timer = 0;
float elapsedTimeInMs = 0.0f;
float bandwidthInMBs = 0.0f;
CUT_SAFE_CALL( cutCreateTimer( &timer ) );
CUT_SAFE_CALL( cutStartTimer( timer));
// Copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_shortinteger_signal,InputData, cudaMemCopySize,cudaMemcpyHostToDevice));
Complexify<<<32, 256>>>(d_signal, d_shortinteger_signal, npts*nlines);
CUT_CHECK_ERROR("Kernel execution failed [ Complexify ]");// Check if kernel execution generated and error
//the the total elapsed time in ms
CUT_SAFE_CALL( cutStopTimer( timer));
elapsedTimeInMs = cutGetTimerValue( timer);
CUT_SAFE_CALL( cutResetTimer( timer));
//calculate bandwidth in MB/s
bandwidthInMBs = 1e3 * cudaMemCopySize / (elapsedTimeInMs * (float)(1024*1024));
fprintf(stderr,"Host to Card Copy Time for %ld MB =%f(ms)\n",cudaMemCopySize/(1024*1024), elapsedTimeInMs);
fprintf(stderr,"Host to Card Copy Bandwidth=%f(MB\/s)\n",bandwidthInMBs);
// Transform signal
CUFFT_SAFE_CALL(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
ComplexPointwiseMulAndScale<<<32, 256>>>(d_signal, d_filter_kernel, npts*nlines, 1.0f / npts);// Multiply the coefficients together and normalize the result
CUT_CHECK_ERROR("Kernel execution failed [ ComplexPointwiseMulAndScale ]");// Check if kernel execution generated and error
// Transform signal back
CUFFT_SAFE_CALL(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));
// Compute its magnitude
ComplexPointwiseMagAndScale<<<32, 256>>>(d_signal, d_filter_kernel, npts*nlines, 1.0f / npts);
// Check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed [ ComplexPointwiseMagAndScale ]");
DeComplexify<<<32, 256>>>(d_shortinteger_convolved_signal, d_signal, npts*nlines);
CUT_CHECK_ERROR("Kernel execution failed [ DeComplexify ]");
CUT_SAFE_CALL( cutStartTimer( timer));
// Copy device memory to host
CUDA_SAFE_CALL(cudaMemcpy(OutputData,d_shortinteger_convolved_signal, cudaMemCopySize,cudaMemcpyDeviceToHost));
//note: Since Device to Host memcopies are blocking, there is no need
// for a cudaThreadSynchronize() here.
//get the the total elapsed time in ms
CUT_SAFE_CALL( cutStopTimer( timer));
elapsedTimeInMs = cutGetTimerValue( timer);
bandwidthInMBs = 1e3 * cudaMemCopySize / (elapsedTimeInMs * (float)(1024*1024));
fprintf(stderr,"Card to Host Copy Time for %ld MB =%f(ms)\n",cudaMemCopySize/(1024*1024), elapsedTimeInMs);
fprintf(stderr,"Card to Host Copy Bandwidth=%f(MB\/s)\n",bandwidthInMBs);
return 1;
}
////////////////////////////////////////////////////////////////////////////////
// Complex operations
////////////////////////////////////////////////////////////////////////////////
// Complex addition
static device host inline Complex ComplexAdd(Complex a, Complex B)
{
Complex c;
c.x = a.x + b.x;
c.y = a.y + b.y;
return c;
}
// Complex scale
static device host inline Complex ComplexScale(Complex a, float s)
{
Complex c;
c.x = s * a.x;
c.y = s * a.y;
return c;
}
// Complex multiplication
static device host inline Complex ComplexMul(Complex a, Complex B)
{
Complex c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
// Complex pointwise multiplication
static global void ComplexPointwiseMulAndScale(Complex* a, const Complex* b, int size, float scale)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale);
}
// Complex pointwise magnitude
static global void ComplexPointwiseMagAndScale(Complex* a, const Complex* b, int size, float scale)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
{
a[i].x = sqrt((a[i].x)(a[i].x)+ (a[i].y)(a[i].y));
a[i].y = 0.0;
}
}
// Complexify
static global void Complexify(Complex* a, const short int* b, int size)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
{
a[i].x = (double)(b[i]);
a[i].y = 0.0;
}
}
// DeComplexify
static global void DeComplexify(short int* a, const Complex* b, int size)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
{
a[i] = (b[i].x);
}
}
My program output…
The times I’m getting with my CUDA program are:
[msh@hugherNaught] $ ./simpleCUFFT
About to call: InitCUDAAnalyticSignal()
Found CUDA Devices 1
Program Will Execute On CUDA Device 0
Completed call to: InitCUDAAnalyticSignal()
Host to Card Copy Time for 6 MB =4.598000(ms)
Host to Card Copy Bandwidth=1304.915161(MB/s)
Card to Host Copy Time for 6 MB =30.832001(ms)
Card to Host Copy Bandwidth=194.603012(MB/s)
CARD FFT execution time=0.040000