Hi all,
I am currently working on concurrent kernel execution.
I wrote a program to test concurrent kernel execution on my desktop (Windows XP, VS2008, CUDA 3.2, GTX 570).
But I don’t see much speedup.
Can anyone give me some advice?
Thank you very much.
The code is as follows:
#include <stdio.h>
#include <cuda.h>
#include <cuPrintf.cu>
#include “cutil_inline.h”
#include “cuda_runtime.h”
#include
#include <cutil.h>
#include “cudaHelper.h”
using namespace std;
// Busy-work kernel used to keep the GPU occupied for timing purposes.
//
// Each thread runs an n-iteration loop of sinf/tanf evaluations and then
// stores its accumulated value to a[threadIdx.x].
//
// Parameters:
//   a - device pointer to at least blockDim.x ints of output storage.
//   n - number of loop iterations each thread performs.
//
// Launch layout: the output index is threadIdx.x only (blockIdx is not used),
// so every block of a launch writes the same blockDim.x elements of `a`.
// No shared memory is used.
__global__ void mykernel( int *a, int n )
{
    int idx = threadIdx.x;
    int value = 1;

    // The int accumulator is multiplied by a float expression and converted
    // back to int on every iteration.
    for (int i = 0; i < n; i++)
        value *= sinf( (float)i ) + tanf( (float)i );

    a[idx] = value;
}
// Times `nkernels` launches of mykernel issued into separate streams, then
// times one launch on the default stream, and prints both so the degree of
// concurrent execution can be judged.
//
// Command line: passing the "qatest" flag makes the program print a PASSED
// banner at the end.
//
// Returns 0 on completion, 1 if the host-side stream array cannot be allocated.
int main(int argc, const char **argv)
{
    int nblocks  = 4;
    int nthreads = 64;
    int n        = 500000;
    int nkernels = 8;
    int nbytes;
    int devID;
    cudaDeviceProp deviceProps;
    int *d_A = 0;
    int *d_B = 0;
    cudaStream_t *stream;
    cudaEvent_t start, stop;
    float elapsedTime;
    int qatest = 0;

    printf("[concurrentKernels] - Starting...\n\n");

    devID = 0;
    cutilSafeCall(cudaSetDevice(devID));

    // QA testing mode
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "qatest"))
    {
        qatest = 1;
    }

    cutilSafeCall(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA Device %s has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);
    printf("CUDA Device %s is%s capable of concurrent kernel execution\n", deviceProps.name, (deviceProps.concurrentKernels==0)?" NOT":"");

    // One stream per kernel so the launches are free to overlap.
    stream = (cudaStream_t *)malloc(nkernels * sizeof(cudaStream_t));
    if (stream == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory for %d streams\n", nkernels);
        return 1;
    }
    for (int i = 0; i < nkernels; i++)
    {
        cutilSafeCall(cudaStreamCreate(&stream[i]));
    }

    // Each kernel writes nthreads ints (the kernel indexes by threadIdx.x
    // only), so d_A holds one nthreads-sized slice per kernel. The contents
    // are never read back; only the timing matters here.
    nbytes = nkernels * nthreads * sizeof(int);
    cutilSafeCall(cudaMalloc((void **)&d_A, nbytes));
    cutilSafeCall(cudaMalloc((void **)&d_B, nbytes));

    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&stop));

    // start timer then launch all kernels in their streams
    cutilSafeCall(cudaEventRecord(start, 0));
    for (int i = 0; i < nkernels; i++)
    {
        // avoid synchronization points (events, error checks, etc.) inside
        // this loop in order to get concurrent execution on devices that support it
        mykernel<<<nblocks, nthreads, 0, stream[i]>>>(&d_A[i*nthreads], n);
    }
    cutilSafeCall(cudaEventRecord(stop, 0));

    // Launch errors are checked only after the stop event has been queued so
    // the check cannot disturb the timed region.
    cutilCheckMsg("concurrent kernel launch failure");

    // wait for all streams to finish
    cutilSafeCall(cudaEventSynchronize(stop));

    // get total time for all kernels
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("\nAll %d kernels together took %.3fs\n", nkernels, elapsedTime/1000.f);

    // Time a single launch on the default stream as the baseline.
    cudaEvent_t start1, stop1;
    float elapsedTime1 = 0.0f;
    cutilSafeCall(cudaEventCreate(&start1));
    cutilSafeCall(cudaEventCreate(&stop1));

    cutilSafeCall(cudaEventRecord(start1, 0));
    mykernel<<<nblocks, nthreads>>>(d_B, n);
    cutilCheckMsg("kernel launch failure");
    cutilSafeCall(cudaEventRecord(stop1, 0));
    cutilSafeCall(cudaEventSynchronize(stop1));
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime1, start1, stop1));

    // Without concurrency the nkernels launches would run back to back, so
    // the serial estimate is nkernels times the single-kernel time.
    printf("A single kernel took %.3fs\n", elapsedTime1/1000.f);
    printf("if no concurrent execution the time is %.3fs\n", nkernels * elapsedTime1/1000.f);

    // cleanup
    printf("\nCleaning up...\n");
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(stop));
    cutilSafeCall(cudaEventDestroy(start1));
    cutilSafeCall(cudaEventDestroy(stop1));

    for (int i = 0; i < nkernels; i++)
    {
        cutilSafeCall(cudaStreamDestroy(stream[i]));
    }
    free(stream);

    if (d_A)
        cutilSafeCall(cudaFree(d_A));
    if (d_B)
        cutilSafeCall(cudaFree(d_B));

    if (qatest)
    {
        // any errors that might have happened will have already been reported
        printf("[concurrentKernels] - Test Results:\nPASSED\n");
    }

    return 0;
}