Well, I’m new to CUDA and am testing concurrent kernel execution, but for now my results are unsatisfactory…
As a simple example, I have created a kernel that does the following:
// Busy-work kernel: each thread squares its global index, stages the value in
// a block-shared array, and then stores it to A at that same global index.
//
// A : output array; must hold at least gridDim.x * blockDim.x ints.
//
// NOTE(review): kk has nelement slots and is indexed by threadIdx.x, so this
// assumes blockDim.x <= nelement — confirm against the launch configuration.
__global__ void Teste(int *A)
{
    const int local   = threadIdx.x;
    const int gid     = blockDim.x * blockIdx.x + local;
    const int squared = gid * gid;

    __shared__ int kk[nelement];
    kk[local] = squared;

    // Rewrites the same slot nelement times; the value never changes.
    int pass = 0;
    while (pass < nelement) {
        kk[local] = squared;
        ++pass;
    }

    __syncthreads(); // it doesn't matter it is only to keep it a little busy
    A[gid] = kk[local];
}
After this I’ve created n streams, where n is variable and depends on the purpose of the test.
So the host code for this is:
#include "cuda.h"
#include "stdio.h"
#include <cuda_runtime.h>
#include <stdlib.h>
#define nKern 1
#define nelement 32

// Abort with file/line and the CUDA error string when a runtime call fails,
// so an early failure is not silently carried into the printed results.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Host driver: launches nKern instances of Teste, one per stream, gathers
// them on the extra stream streams[nKern], copies the results back, prints
// them and reports the elapsed GPU time between start_event and stop_event.
int *a_d;
int *a_h;
const size_t nbytes = (size_t)nKern * nelement * sizeof(int);

// Host-side bookkeeping: nKern kernel streams plus one collector stream,
// and one completion event per kernel.
cudaStream_t *streams = (cudaStream_t*) malloc((nKern+1) * sizeof(cudaStream_t));
cudaEvent_t *kernelEvent = (cudaEvent_t*) malloc(nKern * sizeof(cudaEvent_t));
if (streams == NULL || kernelEvent == NULL) {
    fprintf(stderr, "host allocation failed\n");
    exit(EXIT_FAILURE);
}

// Pinned host buffer (required for the asynchronous copy) and device buffer.
CUDA_CHECK(cudaMallocHost((void **)&a_h, nbytes));
CUDA_CHECK(cudaMalloc((void **)&a_d, nbytes));

for (int i = 0; i < nKern+1; i++) {
    CUDA_CHECK(cudaStreamCreate(&(streams[i])));
}

cudaEvent_t start_event, stop_event;
CUDA_CHECK(cudaEventCreate(&start_event));
CUDA_CHECK(cudaEventCreate(&stop_event));

// Create the per-kernel events BEFORE starting the timer so that their setup
// cost is not part of the measured interval. They only express dependencies,
// hence timing is disabled on them.
for (int i = 0; i < nKern; i++) {
    CUDA_CHECK(cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
}

// ---- timed region -------------------------------------------------------
CUDA_CHECK(cudaEventRecord(start_event, 0));
for (int i = 0; i < nKern; i++) {
    // One block of nelement threads per kernel, each writing its own slice.
    Teste <<< 1, nelement, 0, streams[i] >>> (&a_d[i*nelement]);
    CUDA_CHECK(cudaGetLastError()); // launch-configuration errors surface here
    CUDA_CHECK(cudaEventRecord(kernelEvent[i], streams[i]));
    // Make the collector stream wait for this kernel before it may copy.
    CUDA_CHECK(cudaStreamWaitEvent(streams[nKern], kernelEvent[i], 0));
}
CUDA_CHECK(cudaMemcpyAsync(a_h, a_d, nbytes, cudaMemcpyDeviceToHost, streams[nKern]));
// NOTE(review): recording stop_event in stream 0 bounds the work above only
// under the legacy default-stream behaviour — confirm the build does not use
// a per-thread default stream.
CUDA_CHECK(cudaEventRecord(stop_event, 0));
CUDA_CHECK(cudaEventSynchronize(stop_event));
// ---- end of timed region ------------------------------------------------

float elapsed_time = 0.0f;
CUDA_CHECK(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));

// Explicitly wait for the copy-back before the host reads a_h, instead of
// relying on the implicit default-stream synchronisation alone.
CUDA_CHECK(cudaStreamSynchronize(streams[nKern]));

for (int i = 0; i < nelement*nKern; i++) {
    printf(" %d \n", a_h[i]);
}

for (int i = 0; i < nKern; i++) {
    CUDA_CHECK(cudaEventDestroy(kernelEvent[i]));
}
// nKern+1 streams were created, so nKern+1 must be destroyed (the collector
// stream streams[nKern] included).
for (int i = 0; i < nKern+1; i++) {
    CUDA_CHECK(cudaStreamDestroy(streams[i]));
}

printf("%f ms \n", elapsed_time);

CUDA_CHECK(cudaEventDestroy(start_event));
CUDA_CHECK(cudaEventDestroy(stop_event));
CUDA_CHECK(cudaFreeHost(a_h));
CUDA_CHECK(cudaFree(a_d));
free(kernelEvent);
free(streams);
// cudaThreadExit() is deprecated; cudaDeviceReset() is its replacement.
CUDA_CHECK(cudaDeviceReset());
One thing that I don’t understand is why, when setting different values of nKern (1…15) (I have a GTX 480 on openSUSE 11.3, 64-bit), I get different execution times…
So I would appreciate it if anyone could explain why I don’t get the same execution time for nKern = 1 and nKern = 2…15.
best regards
Bruno Faria