I tested my program on a GTX 280 under Windows 7 with CUDA 3.2.
This is my test code; it is a little long:
#include <cuda_runtime.h>
#include <cutil_inline.h>
#include <Windows.h>
#include <process.h>
#include <stdio.h>
#include <stdlib.h>
// Semaphore released by the worker thread when it has finished; main() waits on it.
HANDLE signal;
// Worker thread entry point (defined below); param points to the device id to use.
void _cdecl test(void *param);
// Page-locked host buffers, indexed by device id. Must be an array of pointers:
// the entries are filled by cudaHostAlloc and used as cudaMemcpyAsync destinations.
unsigned int *host[2];
// Number of elements in each full-size buffer.
unsigned int num=50000000;
// Size in bytes of one full-size buffer (num elements of 4 bytes = 200,000,000).
unsigned int memsize=static_cast<unsigned int>(num*sizeof(unsigned int));
unsigned int t=500;//num/t equals to the total thread num in kernal
/**
 * Pointer-chasing kernel: each thread starts at input[tid] and follows the
 * index chain through `input` for 1000 steps, summing the values it visits,
 * then stores the sum in output[tid].
 *
 * Launch layout: 1D grid of 1D blocks; tid = blockDim.x*blockIdx.x+threadIdx.x.
 *
 * @param input   device array; every value in input[0..n) must itself be a
 *                valid index into input, otherwise the chase reads out of bounds
 * @param output  device array with at least n elements
 * @param n       number of valid elements; threads with tid >= n do nothing.
 *                The default disables the guard so an existing two-argument
 *                launch keeps its original behaviour.
 */
__global__ void
kernal(unsigned int *input,unsigned int *output,unsigned int n=0xFFFFFFFFu)
{
    unsigned int tid=blockDim.x*blockIdx.x+threadIdx.x;
    // The grid is rounded up to whole blocks, so the tail threads must bail out
    // before they start chasing uninitialised indices.
    if(tid>=n)
        return;
    unsigned int temp=0;// accumulator must start at zero (was uninitialised)
    unsigned int p=input[tid];
    for(int i=0;i<1000;i++)
    {
        temp+=input[p];
        p=input[p];
    }
    output[tid]=temp;
}

/**
 * Launches `kernal` on `stream` with one thread per element, 256 threads per
 * block, and reports launch-configuration errors immediately.
 *
 * @param input   device input array (see `kernal`)
 * @param output  device output array with at least `num` elements
 * @param num     number of elements / threads that do work
 * @param stream  stream the kernel is enqueued on
 */
void mykernal(unsigned int *input,unsigned int *output,unsigned int num,cudaStream_t &stream)
{
    // A grid with zero blocks is an invalid launch configuration.
    if(num==0)
        return;
    const unsigned int threadNum=256;
    const unsigned int blockNum=(num+threadNum-1)/threadNum;// ceil(num/threadNum)
    kernal<<<blockNum,threadNum,0,stream>>>(input,output,num);
    // Kernel launches return nothing; a bad configuration only shows up here.
    cutilSafeCall(cudaGetLastError());
}
/**
 * Program entry point: creates the completion semaphore, allocates the portable
 * page-locked host buffer for device 0, starts the worker thread `test` for
 * device 0 and blocks until the worker signals the semaphore.
 *
 * @return 0 on success, 1 if the semaphore or the worker thread cannot be created
 */
int main()
{
    unsigned int deviceId[2]={0,1};
    signal=CreateSemaphore(NULL,0,2,NULL);
    if(signal==NULL)
    {
        fprintf(stderr,"CreateSemaphore failed (error %lu)\n",GetLastError());
        return 1;
    }
    cutilSafeCall(cudaHostAlloc((void **)&host[0],memsize,cudaHostAllocPortable));
    // _beginthread returns -1L on failure; without this check a failed start
    // would leave the wait below blocked forever.
    if(_beginthread(test,0,&deviceId[0])==(uintptr_t)-1)
    {
        fprintf(stderr,"_beginthread failed\n");
        CloseHandle(signal);
        return 1;
    }
    // deviceId lives on this stack frame, so main must outlive the worker.
    WaitForSingleObject(signal,INFINITE);
    CloseHandle(signal);
    return 0;
}
/**
 * Worker thread: benchmarks a kernel launch overlapped with a device-to-host
 * cudaMemcpyAsync on the selected device and writes the timings to
 * "GPU_<id>_all_eventSync.txt".
 *
 * Each of the 10 iterations writes one line: the iteration number, the timer
 * value once the kernel's event has completed, and the timer value once the
 * transfer's event has completed.
 *
 * Releases the global semaphore `signal` once before returning, on every path
 * that returns (cutilSafeCall terminates the process on CUDA errors).
 *
 * @param param  pointer to an unsigned int holding the device id; it is also
 *               the index into the global `host` buffer table
 */
void _cdecl test(void *param)
{
    unsigned int deviceId=*(unsigned int *)param;
    // The device id doubles as the index into host[]; refuse ids with no buffer.
    if(deviceId>=sizeof(host)/sizeof(host[0]) || host[deviceId]==NULL)
    {
        fprintf(stderr,"no host buffer allocated for device %u\n",deviceId);
        ReleaseSemaphore(signal,1,NULL);
        return;
    }
    //open the log file (done first so a failure needs no CUDA cleanup)
    char recordbuffer[128];
    sprintf(recordbuffer,"GPU_%d_all_eventSync.txt",deviceId);
    FILE *logfile=fopen(recordbuffer,"w");
    if(logfile==NULL)
    {
        fprintf(stderr,"cannot open log file %s\n",recordbuffer);
        ReleaseSemaphore(signal,1,NULL);
        return;
    }
    cutilSafeCall(cudaSetDeviceFlags(cudaDeviceBlockingSync));
    cutilSafeCall(cudaSetDevice(deviceId));
    unsigned int *input1=NULL,*output=NULL;
    cutilSafeCall(cudaMalloc((void **)&input1,memsize));
    cutilSafeCall(cudaMalloc((void **)&output,memsize));
    cudaStream_t streamForCalc,streamForTransfer;
    cutilSafeCall(cudaStreamCreate(&streamForCalc));
    cutilSafeCall(cudaStreamCreate(&streamForTransfer));
    unsigned int timer;
    cutCreateTimer(&timer);
    cutResetTimer(timer);
    cudaEvent_t eventForCalc,eventForTransfer;
    //each element(index<num/t)point to another element in input1 array
    const unsigned int count=num/t;
    // Only the first num/t entries are ever used, so stage just those.
    unsigned int *h=new unsigned int[count];
    srand(4);
    for(unsigned int i=0;i<count;i++)
    {
        // Unsigned arithmetic: the product exceeds INT_MAX, and wrapping is only
        // well defined for unsigned types. The modulo keeps every value a valid
        // index in [0, count).
        h[i]=((unsigned int)rand()*73457u)%count;
    }
    cutilSafeCall(cudaMemcpy(input1,h,count*sizeof(unsigned int),cudaMemcpyHostToDevice));
    delete[] h;
    //Create cudaEvent
    cutilSafeCall(cudaEventCreateWithFlags(&eventForCalc,cudaEventBlockingSync));
    cutilSafeCall(cudaEventCreateWithFlags(&eventForTransfer,cudaEventBlockingSync));
    for(int i=0;i<10;i++)
    {
        fprintf(logfile,"%d",i);
        cutStartTimer(timer);
        //only data transfer
        //cutilSafeCall(cudaMemcpyAsync(host[deviceId],input1,memsize,cudaMemcpyDeviceToHost,streamForTransfer));
        //cutilSafeCall(cudaThreadSynchronize());
        //cutStopTimer(timer);
        //fprintf(logfile,",%f\n",cutGetTimerValue(timer));
        //cutResetTimer(timer);
        //only kernal launched
        //mykernal(input1,output,num/t,streamForCalc);
        //cutilSafeCall(cudaThreadSynchronize());
        //cutStopTimer(timer);
        //fprintf(logfile,",%f\n",cutGetTimerValue(timer));
        //cutResetTimer(timer);
        //kernal launched meanwhile data transfer,use cudaThreadSynchronize to synchronize
        //cutilSafeCall(cudaMemcpyAsync(host[deviceId],input1,memsize,cudaMemcpyDeviceToHost,streamForTransfer));
        //mykernal(input1,output,num/t,streamForCalc);
        //cutilSafeCall(cudaThreadSynchronize());
        //cutStopTimer(timer);
        //fprintf(logfile,",%f\n",cutGetTimerValue(timer));
        //cutResetTimer(timer);
        //kernal launched meanwhile data transfer,use cudaStreamSynchronize to synchronize
        //cutilSafeCall(cudaMemcpyAsync(host[deviceId],input1,memsize,cudaMemcpyDeviceToHost,streamForTransfer));
        //mykernal(input1,output,num/t,streamForCalc);
        //cutilSafeCall(cudaStreamSynchronize(streamForTransfer));
        //cutilSafeCall(cudaStreamSynchronize(streamForCalc));
        //cutStopTimer(timer);
        //fprintf(logfile,",%f\n",cutGetTimerValue(timer));
        //cutResetTimer(timer);
        //kernal launched meanwhile data transfer,use cudaEventRecord to synchronize
        mykernal(input1,output,count,streamForCalc);
        cutilSafeCall(cudaMemcpyAsync(host[deviceId],input1,memsize,cudaMemcpyDeviceToHost,streamForTransfer));
        // First sample: wait for the kernel's stream only.
        cutilSafeCall(cudaEventRecord(eventForCalc,streamForCalc));
        cutilSafeCall(cudaEventSynchronize(eventForCalc));
        cutStopTimer(timer);
        fprintf(logfile,",%f",cutGetTimerValue(timer));
        // Second sample: wait for the transfer's stream. The timer is restarted
        // without a reset, so this value presumably includes the first interval
        // -- confirm against cutil's timer semantics.
        cutStartTimer(timer);
        cutilSafeCall(cudaEventRecord(eventForTransfer,streamForTransfer));
        cutilSafeCall(cudaEventSynchronize(eventForTransfer));
        cutStopTimer(timer);
        fprintf(logfile,",%f",cutGetTimerValue(timer));
        cutResetTimer(timer);
        fprintf(logfile,"\n");
    }
    cutilSafeCall(cudaFreeHost(host[deviceId]));
    host[deviceId]=NULL;// do not leave a dangling pointer in the shared table
    cutilSafeCall(cudaFree(input1));
    cutilSafeCall(cudaFree(output));
    cutilSafeCall(cudaEventDestroy(eventForCalc));
    cutilSafeCall(cudaEventDestroy(eventForTransfer));
    cutilSafeCall(cudaStreamDestroy(streamForCalc));
    cutilSafeCall(cudaStreamDestroy(streamForTransfer));
    cutDeleteTimer(timer);// the timer was created above but never released
    fclose(logfile);
    ReleaseSemaphore(signal,1,NULL);
}//code end
My test results:
When only the kernel is launched, it takes about 41 ms.
When only cudaMemcpyAsync is called, it takes about 100 ms.
When the kernel is launched while cudaMemcpyAsync is in flight:
if(use cudaThreadSync to synchronize)
time is 100ms
if(use cudaStreamSync to synchronize)