CUDA Newbie bandwidth question

msh · January 25, 2008, 5:46pm

I have a Tesla card in a dual dual-core processor workstation running Ubuntu Gutsy (7.10) and have just started CUDA programming.

I am implementing a signal enveloping algorithm for medical signal processing that requires transfer to and from the card of an integer array 6MB in size.

Based on results from the bandwidthTest.cu program, I do not seem to be getting the correct bandwidth on device-to-host or host-to-device memory transfers (I’m using page-locked memory). The numbers the bandwidthTest program returns for a 6 MB chunk of data are:

Device to Host:
Transfer Size (Bytes):6291456 Bandwidth(MB/s): 2509.5

Host to Device:
Transfer Size (Bytes): 6291456 Bandwidth(MB/s): 2913.8

The numbers I get from my program (I’ve included the code listing below) are:
Host to Device Copy Bandwidth=1304.915161(MB/s)

Device to Host Copy Bandwidth=194.603012(MB/s)

I’ve taken care to initialze all memory and cuFFT plans prior to any timing calls, so that memory allocation is not included in my timing values (I’ve included program output below also).

I’ve also looked over the source code in the file: bandwidthTest.cu. The only difference I can see between that program and mine is that the bandwidthTest passes data of type: unsigned char back and forth, while I am passing data of type: short int. I’ve searched the NVIDIA_CUDA_Programming_Guide_1.1.pdf file and haven’t found any clues there.

Am I neglecting to correctly set up card in some manner? Or are there differences between the two types that can slow the data transfer (e.g. variable alignment).

My application requires that I process 6MB chunks in less than 0.03s/chunk, so currently my program is just below its performance spec. Can any one see a way to improve the transfer rate so it approaches those returned by the bandwidthTest.cu program?

Thanks

Code listing…
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cufft.h>
#include <cutil.h>
#include <cuda_runtime_api.h>

// Complex data type
typedef float2 Complex;
static device host inline Complex ComplexAdd(Complex, Complex);
static device host inline Complex ComplexScale(Complex, float);
static device host inline Complex ComplexMul(Complex, Complex);
static global void ComplexPointwiseMulAndScale(Complex*, const Complex*, int, float);
static global void ComplexPointwiseMagAndScale(Complex*, const Complex*, int, float);

static global void Complexify(Complex* a, const short int* b, int size);
static global void DeComplexify(short int* a, const Complex* b, int size);

static Complex d_signal;
static Complex h_signal;
static Complex* h_convolved_signal;
static short int d_shortinteger_signal;
static short int h_shortinteger_signal;
static short int* d_shortinteger_convolved_signal;
static short int* h_shortinteger_convolved_signal;

static long cudaMemCopySize;
static Complex* d_filter_kernel;
static Complex* h_filter_kernel;
static cufftHandle plan;

int InitCUDAAnalyticSignal(long npts, long nlines)
{
long i;
cudaMemCopySize=sizeof(short int) * npts * nlines;
do {
int deviceCount;
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount));
if (deviceCount == 0)
{
fprintf(stderr, “There is no device.\n”);
exit(EXIT_FAILURE);
}
else
fprintf(stderr,“Found CUDA Devices %d\n”,deviceCount);
int dev;
for (dev = 0; dev < deviceCount; ++dev)
{
cudaDeviceProp deviceProp;
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, dev));
if (deviceProp.major >= 1)
break;
}
if (dev == deviceCount)
{
fprintf(stderr, “There is no device supporting CUDA.\n”);
exit(EXIT_FAILURE);
}
else
{
CUDA_SAFE_CALL(cudaSetDevice(dev));
fprintf(stderr,“Program Will Execute On CUDA Device %d\n”,dev);
}
} while (0);

CUFFT_SAFE_CALL(cufftPlan1d(&plan, npts, CUFFT_C2C, nlines););

h_signal = (Complex*)malloc(sizeof(Complex) * npts* nlines);
CUDA_SAFE_CALL(cudaMalloc((void**)&d_signal, sizeof(Complex) * npts * nlines));
h_convolved_signal = (Complex*)malloc(sizeof(Complex) * npts* nlines);
CUDA_SAFE_CALL(cudaMalloc((void**)&d_shortinteger_signal, sizeof(short int) * npts * nlines));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_shortinteger_convolved_signal, sizeof(short int) * npts * nlines));

//h_shortinteger_signal = (int*)malloc(sizeof(int) * npts* nlines);
//h_shortinteger_convolved_signal = (int*)malloc(sizeof(int) * npts* nlines);
cudaMallocHost( (void**)&h_shortinteger_signal,sizeof(short int) * npts* nlines);
cudaMallocHost((void**)& h_shortinteger_convolved_signal,sizeof(short int) * npts* nlines);

CUDA_SAFE_CALL(cudaMalloc((void**)&d_filter_kernel, sizeof(Complex) * npts * nlines));
h_filter_kernel = (Complex*)malloc(sizeof(Complex) * npts* nlines);

// Initalize the memory for the filter
h_filter_kernel[0].x = 1.0;
h_filter_kernel[0].y = 0.0;
for (i = 1; i < npts/2; i++) 
{
	h_filter_kernel[i].x = 1.0;
	h_filter_kernel[i].y = 1.0;
}
for (i = npts/2; i < npts; i++) 
{
	h_filter_kernel[i].x = 0.0;
	h_filter_kernel[i].y = 0.0;
}
for(i=npts;i<npts * nlines;i++)
{
	h_filter_kernel[i].x = h_filter_kernel[i%npts].x;
	h_filter_kernel[i].y =h_filter_kernel[i%npts].y ;
}
// Copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_filter_kernel, h_filter_kernel, sizeof(Complex) * npts * nlines, cudaMemcpyHostToDevice));

return 1;

}

int CloseCUDAAnalyticSignal(void)
{
//Destroy CUFFT context
CUFFT_SAFE_CALL(cufftDestroy(plan));
// cleanup memory
free(h_signal);
free(h_convolved_signal);
CUDA_SAFE_CALL(cudaFree(d_signal));
CUDA_SAFE_CALL(cudaFree(d_shortinteger_signal));
CUDA_SAFE_CALL(cudaFree(d_shortinteger_convolved_signal));

//free(h_shortinteger_signal);
//free(h_shortinteger_convolved_signal);
cudaFreeHost(h_shortinteger_signal);
cudaFreeHost(h_shortinteger_convolved_signal);

free(h_filter_kernel);
CUDA_SAFE_CALL(cudaFree(d_filter_kernel));

return 1;

}

int CUDAAnalyticSignal( short int *InputData,
short int *OutputData,
long npts,long nlines)
{
long i,j;

unsigned int timer = 0;
float elapsedTimeInMs = 0.0f;
float bandwidthInMBs = 0.0f;
CUT_SAFE_CALL( cutCreateTimer( &timer ) );
CUT_SAFE_CALL( cutStartTimer( timer));

// Copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_shortinteger_signal,InputData, cudaMemCopySize,cudaMemcpyHostToDevice));
Complexify<<<32, 256>>>(d_signal, d_shortinteger_signal, npts*nlines);
CUT_CHECK_ERROR("Kernel execution failed [ Complexify ]");// Check if kernel execution generated and error

//the the total elapsed time in ms
CUT_SAFE_CALL( cutStopTimer( timer));
elapsedTimeInMs = cutGetTimerValue( timer);
CUT_SAFE_CALL( cutResetTimer( timer));
//calculate bandwidth in MB/s
bandwidthInMBs = 1e3 * cudaMemCopySize / (elapsedTimeInMs * (float)(1024*1024));
fprintf(stderr,"Host to Card Copy Time for %ld MB =%f(ms)\n",cudaMemCopySize/(1024*1024), elapsedTimeInMs);
fprintf(stderr,"Host to Card Copy Bandwidth=%f(MB\/s)\n",bandwidthInMBs);

// Transform signal
CUFFT_SAFE_CALL(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));

ComplexPointwiseMulAndScale<<<32, 256>>>(d_signal, d_filter_kernel, npts*nlines, 1.0f / npts);// Multiply the coefficients together and normalize the result	
CUT_CHECK_ERROR("Kernel execution failed [ ComplexPointwiseMulAndScale ]");// Check if kernel execution generated and error

// Transform signal back
CUFFT_SAFE_CALL(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));

// Compute its magnitude
ComplexPointwiseMagAndScale<<<32, 256>>>(d_signal, d_filter_kernel, npts*nlines, 1.0f / npts);
// Check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed [ ComplexPointwiseMagAndScale ]");

DeComplexify<<<32, 256>>>(d_shortinteger_convolved_signal, d_signal, npts*nlines);
CUT_CHECK_ERROR("Kernel execution failed [ DeComplexify ]");

CUT_SAFE_CALL( cutStartTimer( timer));
// Copy device memory to host
CUDA_SAFE_CALL(cudaMemcpy(OutputData,d_shortinteger_convolved_signal, cudaMemCopySize,cudaMemcpyDeviceToHost));
//note:  Since Device to Host memcopies are blocking, there is no need
//       for a cudaThreadSynchronize() here.

//get the the total elapsed time in ms
CUT_SAFE_CALL( cutStopTimer( timer));
elapsedTimeInMs = cutGetTimerValue( timer);
bandwidthInMBs = 1e3 * cudaMemCopySize / (elapsedTimeInMs * (float)(1024*1024));
fprintf(stderr,"Card to Host Copy Time for %ld MB =%f(ms)\n",cudaMemCopySize/(1024*1024), elapsedTimeInMs);
fprintf(stderr,"Card to Host Copy Bandwidth=%f(MB\/s)\n",bandwidthInMBs);

return 1;

}

////////////////////////////////////////////////////////////////////////////////
// Complex operations
////////////////////////////////////////////////////////////////////////////////

// Complex addition
static device host inline Complex ComplexAdd(Complex a, Complex B)
{
Complex c;
c.x = a.x + b.x;
c.y = a.y + b.y;
return c;
}

// Complex scale
static device host inline Complex ComplexScale(Complex a, float s)
{
Complex c;
c.x = s * a.x;
c.y = s * a.y;
return c;
}

// Complex multiplication
static device host inline Complex ComplexMul(Complex a, Complex B)
{
Complex c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}

// Complex pointwise multiplication
static global void ComplexPointwiseMulAndScale(Complex* a, const Complex* b, int size, float scale)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale);
}

// Complex pointwise magnitude
static global void ComplexPointwiseMagAndScale(Complex* a, const Complex* b, int size, float scale)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
{
a[i].x = sqrt((a[i].x)(a[i].x)+ (a[i].y)(a[i].y));
a[i].y = 0.0;
}
}

// Complexify
static global void Complexify(Complex* a, const short int* b, int size)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
{
a[i].x = (double)(b[i]);
a[i].y = 0.0;
}
}

// DeComplexify
static global void DeComplexify(short int* a, const Complex* b, int size)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
{
a[i] = (b[i].x);
}
}

My program output…

The times I’m getting with my CUDA program are:
[msh@hugherNaught] $ ./simpleCUFFT
About to call: InitCUDAAnalyticSignal()
Found CUDA Devices 1
Program Will Execute On CUDA Device 0
Completed call to: InitCUDAAnalyticSignal()
Host to Card Copy Time for 6 MB =4.598000(ms)
Host to Card Copy Bandwidth=1304.915161(MB/s)
Card to Host Copy Time for 6 MB =30.832001(ms)
Card to Host Copy Bandwidth=194.603012(MB/s)
CARD FFT execution time=0.040000

Topic		Replies	Views
How to Implement Performance Metrics in CUDA C/C++ Technical Blog	20	851	March 11, 2020
Poor CUFFT Performance? Am I doing something wrong? CUDA Programming and Performance	15	15484	May 4, 2010
Inconsistent concurrent transfer speed CUDA Programming and Performance	21	1107	April 17, 2023
CUDA very slow performance CUDA Programming and Performance	21	16386	March 6, 2020
Why cannot run this program CUDA Programming and Performance	7	951	May 11, 2018
CUDA kernel for-loop performance CUDA Programming and Performance	16	5989	September 7, 2019
CUDA error, bandwithTest.exe CUDA Setup and Installation	12	2463	January 21, 2019
Question about bandwidth test CUDA Programming and Performance	8	295	April 2, 2024
cufftXt batch 1D GPU-Accelerated Libraries	12	2127	October 15, 2019
Multiple batches of 1D FFT using cuFFT GPU-Accelerated Libraries	10	4928	October 29, 2019

CUDA Newbie bandwidth question

Related topics