Hello,
Consider an example data processing sequence:
-
transfer of data from host to device
-
weighing and rearranging of these data by a kernel
-
60x8x8 batch of FFTs, 512 long each, on weighted data
I have implemented this sequence and found that executing of the sequence takes longer than the sum of the execution time of all individual parts.
To this end I run the 100 sequences in a loop, with parts commented out as needed, and measure the executing time and devide it by 100 (code attached). The results are:
-
7.5MB host to device transfer only
2.67436ms which corresponds roughly to 2.7GB/s, measured with the bandwidth test [this goes up to 5.4GB/s if the Tesla card is the only PCIe device in the PC]
-
kernel execution only, 512x60x8x8 times something like weighted[target].x = (float)(samples[source].x) * w[k]
0.002020ms = impressive 2.02us
-
batch of FFTs only, 60x8x8 FFTs, 512 long each
0.00311ms= impressive 3.11us
2+3: 0.0049ms= 4.9us, roughly the sum of both parts
1+2: 5.14ms, twice than the sum of the components
1+3: 4.05ms, more than the sum of the components
1+2+3: 6.26ms, more than twice the sum of the components
The data transfer takes only 2.6ms by itself; when the copy Command is followed by a kernel execution or by an FFT, then the latter have to wait for something, as the cudaMemcpy is asynchronous?
What can I do to make correct measurements?
What can I do to improve performance of the sequence?
Kind regards,
peter
[codebox]// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cufft.h>
#include <cutil_inline.h>
#define CYCLES 100
#define SAMPLES 512
#define RANGES 60
#define ROWS 8
#define COLS 8
typedef short2 ShortComplex;
typedef float2 FloatComplex;
global void
samplesKernel(ShortComplex* samples, FloatComplex* weighted, float* w)
{
int k = blockIdx.x;//sample time
int h = blockIdx.y;//range index
int i = threadIdx.x;//row
int j = threadIdx.y;//columns
unsigned int source = kRANGESCOLSROWS + iRANGESCOLS + jRANGES + h;
unsigned int target = h*SAMPLES*COLS*ROWS + i*SAMPLES*COLS + j*SAMPLES + k;
weighted[target].x = (float)(samples[source].x) * w[k];
weighted[target].y = (float)(samples[source].y) * w[k];
__syncthreads();
}
void runTest(int argc, char** argv)
{
unsigned int length = SAMPLES * RANGES * ROWS * COLS;
unsigned int shortSize = sizeof(ShortComplex) * length;
unsigned int floatSize = sizeof(FloatComplex) * length;
dim3 threads(ROWS, COLS);
dim3 grid(SAMPLES, RANGES);
//set device to the best and only
cudaSetDevice( cutGetMaxGflopsDeviceId() );
// Allocate host memory for the length/2 samples
ShortComplex* source = (ShortComplex*)malloc(shortSize);
// Allocate host memory for sample weighing function
float* wk = (float*)malloc(SAMPLES*sizeof(float));
// Initialise the memory for the length/2 samples
for (unsigned int i = 0; i < length; ++i)
{
source[i].x = (short)(rand() / (float)RAND_MAX * 32768.0f);
source[i].y = 0;
}
// Initialise the memory for sample weighing function
for (unsigned int i = 0; i < SAMPLES; ++i)
{
wk[i] = 1.f;
}
ShortComplex* samples;
cutilSafeCall(cudaMalloc((void**)&samples, shortSize));
FloatComplex* weighted;
cutilSafeCall(cudaMalloc((void**)&weighted, floatSize));
FloatComplex* spectrum;
cutilSafeCall(cudaMalloc((void**)&spectrum, floatSize));
// Copy host memory to device
cutilSafeCall(cudaMemcpy(samples, source, shortSize, cudaMemcpyHostToDevice));
// populate weighted data
samplesKernel<<< grid, threads >>>(samples, weighted, wk);
// CUFFT plan
cufftHandle planSamples;
cufftSafeCall(cufftPlan1d(&planSamples, SAMPLES, CUFFT_C2C, RANGES * ROWS * COLS));
// create and start timer
unsigned int timer = 0;
cutilCheckError(cutCreateTimer(&timer));
cutilCheckError(cutStartTimer(timer));
for (int i=0;i<CYCLES;++i)
{
//copy another half set of samples
cutilSafeCall(cudaMemcpy(samples, source, shortSize, cudaMemcpyHostToDevice));
// execute the input sample windowing kernel producing weighted
samplesKernel<<< grid, threads >>>(samples, weighted, wk);
//carry out ffts on weighed input samples producing spectrum
cufftSafeCall(cufftExecC2C(planSamples, (cufftComplex *)weighted, (cufftComplex *)spectrum, CUFFT_FORWARD));
}
float time = cutGetTimerValue(timer)/((float)CYCLES);
printf("Average Processing time: %f (ms) \n", time);
// stop and destroy timer
cutilCheckError(cutStopTimer(timer));
cutilCheckError(cutDeleteTimer(timer));
// check if kernel execution generated and error
cutilCheckMsg("Kernel execution failed");
// cleanup host memory
free(source);
free(wk);
// cleanup device memory
cutilSafeCall(cudaFree(samples));
cutilSafeCall(cudaFree(weighted));
cutilSafeCall(cudaFree(spectrum));
cudaThreadExit();
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
runTest(argc, argv);
cutilExit(argc, argv);
}
[/codebox]