Why do some synchronization functions make cudaMemcpyAsync and a kernel in different streams run sequentially?

SparkHu · March 1, 2011, 10:47am

I tested my program on a GTX 280 under Windows 7 with CUDA 3.2.
This is my test code; it's a little long:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cutil_inline.h>
#include <Windows.h>
#include <process.h>

// Semaphore released by the worker thread once its measurements are done;
// main() waits on it so the process does not exit early.
HANDLE signal;
void _cdecl test(void *param);

// Pinned host buffers, indexed by device ordinal; destination of the
// device-to-host copies issued by test().
unsigned int *host[2];
// Number of 32-bit elements in each host/device buffer.
unsigned int num=50000000;
// Size of one buffer in bytes.
unsigned int memsize=num*sizeof(int);
unsigned int t=500;//num/t equals to the total thread num in kernal

/*
 * Pointer-chasing kernel used as the compute half of the overlap test.
 * Each thread starts at p = input[tid], then 1000 times adds input[p] to a
 * running sum and advances p = input[p]; the sum is stored in output[tid].
 *
 * Launch layout: 1D grid of 1D blocks, no shared memory.
 * Preconditions: every input[i] with i < n must itself be a valid index into
 * input, and output must hold at least n elements.
 *
 * n is the number of elements to process. Threads with tid >= n return
 * without touching memory, so a grid rounded up to a whole number of blocks
 * is safe. The default leaves every thread active, so a launch that passes
 * only (input, output) still compiles.
 */
__global__ void
kernal(unsigned int *input,unsigned int *output,unsigned int n=0xFFFFFFFFu)
{
    unsigned int tid=blockDim.x*blockIdx.x+threadIdx.x;
    if(tid>=n)
        return;//tail threads of the last block have no element to process

    unsigned int temp=0;//accumulator must start from a defined value
    unsigned int p=input[tid];
    for(int i=0;i<1000;i++)
    {
        temp+=input[p];
        p=input[p];
    }
    output[tid]=temp;
}

/*
 * Launches kernal asynchronously on `stream`, one thread per element,
 * using 256-thread blocks.
 *
 * input  - device array of indices (see kernal for its preconditions)
 * output - device array receiving one value per element
 * num    - number of elements to process; 0 launches nothing
 * stream - stream the kernel is enqueued on
 */
void mykernal(unsigned int *input,unsigned int *output,unsigned int num,cudaStream_t &stream)
{
    if(num==0)
        return;//a grid with zero blocks is an invalid launch configuration

    const unsigned int threadNum=256;
    const unsigned int blockNum=(num+threadNum-1)/threadNum;
    kernal<<<blockNum,threadNum,0,stream>>>(input,output,num);
    //report launch-configuration errors; this call does not wait for the kernel
    cutilSafeCall(cudaGetLastError());
}
/*
 * Allocates the pinned host buffer for device 0, starts the worker thread
 * that runs test() on that device, and waits until the worker signals
 * completion through the global semaphore.
 *
 * Returns 0 on success, 1 if the semaphore or the worker thread could not
 * be created.
 */
int main()
{
    unsigned int deviceId[2]={0,1};
    signal=CreateSemaphore(NULL,0,2,NULL);
    if(signal==NULL)
    {
        fprintf(stderr,"CreateSemaphore failed (error %lu)\n",GetLastError());
        return 1;
    }
    cutilSafeCall(cudaHostAlloc(&host[0],memsize,cudaHostAllocPortable));
    //_beginthread returns -1L on failure; without this check the wait below
    //would block forever because nobody would release the semaphore
    if(_beginthread(test,0,&deviceId[0])==(uintptr_t)-1L)
    {
        fprintf(stderr,"_beginthread failed\n");
        //the worker normally frees this buffer; it never started, so free it here
        cutilSafeCall(cudaFreeHost(host[0]));
        CloseHandle(signal);
        return 1;
    }
    WaitForSingleObject(signal,INFINITE);
    CloseHandle(signal);
    return 0;
}
/*
 * Worker thread body. On the device whose ordinal `param` points to, it
 * runs 10 iterations of: launch the kernel on one stream, enqueue a
 * device-to-host cudaMemcpyAsync on another stream, then wait for each
 * stream through a blocking event. Two timer readings per iteration are
 * written to "GPU_<id>_all_eventSync.txt".
 *
 * The timer is not reset between the two readings of an iteration.
 * NOTE(review): cutil timers appear to accumulate across start/stop pairs,
 * so the second value presumably includes the first - confirm against cutil.
 *
 * param - pointer to an unsigned int holding the device ordinal
 *
 * Frees host[deviceId] and releases the global semaphore before returning.
 */
void _cdecl test(void *param)
{
    unsigned int deviceId=*(unsigned int *)param;
    cutilSafeCall(cudaSetDeviceFlags(cudaDeviceBlockingSync));
    cutilSafeCall(cudaSetDevice(deviceId));

    unsigned int *input1=NULL,*output=NULL;

    cutilSafeCall(cudaMalloc(&input1,memsize));
    cutilSafeCall(cudaMalloc(&output,memsize));

    cudaStream_t streamForCalc,streamForTransfer;
    cutilSafeCall(cudaStreamCreate(&streamForCalc));
    cutilSafeCall(cudaStreamCreate(&streamForTransfer));

    unsigned int timer;

    cutCreateTimer(&timer);
    cutResetTimer(timer);
    cudaEvent_t eventForCalc,eventForTransfer;
    //each element(index<num/t)point to another element in input1 array
    const unsigned int count=num/t;
    //only the first num/t elements are ever filled and uploaded
    unsigned int *h=new unsigned int[count];
    srand(4);
    for(unsigned int i=0;i<count;i++)
    {
        //rand() is never negative; multiplying as unsigned makes the
        //wrap-around well defined instead of overflowing a signed int
        h[i]=(unsigned int)rand()*73457u%count;
    }
    cutilSafeCall(cudaMemcpy(input1,h,count*sizeof(unsigned int),cudaMemcpyHostToDevice));
    delete[] h;
    //Create cudaEvent
    cutilSafeCall(cudaEventCreateWithFlags(&eventForCalc,cudaEventBlockingSync));
    cutilSafeCall(cudaEventCreateWithFlags(&eventForTransfer,cudaEventBlockingSync));
    //open the log file
    char recordbuffer[128];
    sprintf(recordbuffer,"GPU_%d_all_eventSync.txt",(int)deviceId);
    FILE *logfile=fopen(recordbuffer,"w");
    const bool ownsLog=(logfile!=NULL);
    if(!ownsLog)
    {
        //keep measuring even if the file cannot be created
        fprintf(stderr,"cannot open %s, logging to stdout\n",recordbuffer);
        logfile=stdout;
    }

    for(int i=0;i<10;i++)
    {
        fprintf(logfile,"%d",i);
        cutStartTimer(timer);
        //only data transfer
        //cutilSafeCall(cudaMemcpyAsync(host[deviceId],input1,memsize,cudaMemcpyDeviceToHost,streamForTransfer));
        //cutilSafeCall(cudaThreadSynchronize());
        //cutStopTimer(timer);
        //fprintf(logfile,",%f\n",cutGetTimerValue(timer));
        //cutResetTimer(timer);

        //only kernal launched
        //mykernal(input1,output,num/t,streamForCalc);
        //cutilSafeCall(cudaThreadSynchronize());
        //cutStopTimer(timer);
        //fprintf(logfile,",%f\n",cutGetTimerValue(timer));
        //cutResetTimer(timer);

        //kernal launched meanwhile data transfer, use cudaThreadSynchronize to synchronize
        //cutilSafeCall(cudaMemcpyAsync(host[deviceId],input1,memsize,cudaMemcpyDeviceToHost,streamForTransfer));
        //mykernal(input1,output,num/t,streamForCalc);
        //cutilSafeCall(cudaThreadSynchronize());
        //cutStopTimer(timer);
        //fprintf(logfile,",%f\n",cutGetTimerValue(timer));
        //cutResetTimer(timer);

        //kernal launched meanwhile data transfer, use cudaStreamSynchronize to synchronize
        //cutilSafeCall(cudaMemcpyAsync(host[deviceId],input1,memsize,cudaMemcpyDeviceToHost,streamForTransfer));
        //mykernal(input1,output,num/t,streamForCalc);
        //cutilSafeCall(cudaStreamSynchronize(streamForTransfer));
        //cutilSafeCall(cudaStreamSynchronize(streamForCalc));
        //cutStopTimer(timer);
        //fprintf(logfile,",%f\n",cutGetTimerValue(timer));
        //cutResetTimer(timer);

        //kernal launched meanwhile data transfer, use cudaEventRecord to synchronize
        mykernal(input1,output,count,streamForCalc);
        cutilSafeCall(cudaMemcpyAsync(host[deviceId],input1,memsize,cudaMemcpyDeviceToHost,streamForTransfer));

        //first reading: time until the kernel's stream reaches its event
        cutilSafeCall(cudaEventRecord(eventForCalc,streamForCalc));
        cutilSafeCall(cudaEventSynchronize(eventForCalc));
        cutStopTimer(timer);
        fprintf(logfile,",%f",cutGetTimerValue(timer));

        cutStartTimer(timer);

        //second reading: taken after the transfer stream reaches its event
        cutilSafeCall(cudaEventRecord(eventForTransfer,streamForTransfer));
        cutilSafeCall(cudaEventSynchronize(eventForTransfer));
        cutStopTimer(timer);
        fprintf(logfile,",%f",cutGetTimerValue(timer));

        cutResetTimer(timer);
        fprintf(logfile,"\n");
    }
    cutilSafeCall(cudaFreeHost(host[deviceId]));
    cutilSafeCall(cudaFree(input1));
    cutilSafeCall(cudaFree(output));
    cutilSafeCall(cudaEventDestroy(eventForCalc));
    cutilSafeCall(cudaEventDestroy(eventForTransfer));
    cutilSafeCall(cudaStreamDestroy(streamForCalc));
    cutilSafeCall(cudaStreamDestroy(streamForTransfer));
    cutDeleteTimer(timer);
    if(ownsLog)
        fclose(logfile);
    ReleaseSemaphore(signal,1,NULL);

}//code end
My test results:
When only the kernel is launched, it takes about 41 ms.
When only cudaMemcpyAsync is called, it takes about 100 ms.
Then, when the kernel is launched while cudaMemcpyAsync is in flight:
if (cudaThreadSynchronize is used to synchronize)
the time is 100 ms
if (cudaStreamSynchronize is used to synchronize)

SparkHu · March 1, 2011, 10:54am

the time is 141 ms // this is really strange
if (cudaEventSynchronize is used to synchronize)
{
if (cutilSafeCall(cudaEventRecord(eventForTransfer,streamForTransfer)); is called before cutilSafeCall(cudaEventSynchronize(eventForCalc));)
{
the result is 40, 100
}
if (cutilSafeCall(cudaEventRecord(eventForTransfer,streamForTransfer)); is called after cutilSafeCall(cudaEventSynchronize(eventForCalc));)
{
the result is 40, 140 // this is really strange too
}
}
In addition, I use VS2008.

SparkHu · March 1, 2011, 10:57am

Sorry, I made a mistake: I use VS2010.
Does this show a bug, or is there something wrong with my configuration?
I would greatly appreciate it if you could answer my question.

Topic		Replies	Views
Kernels launched by multiple host threads get serialized by cudaStreamSynchronize(0) when --default- CUDA Programming and Performance	7	3072	October 12, 2021
Is cudaMemcpyAsync + cudaStreamSynchronize on default stream equal to cudaMemcpy (non-async) CUDA Programming and Performance	7	4311	December 12, 2019
Fail to sync the cudaMemcpyAsync using the cudaEvent in two streams CUDA Programming and Performance	4	317	April 1, 2024
cudaMemcpyAsync CUDA Programming and Performance	10	22029	October 16, 2015
stream synchronize problem CUDA Programming and Performance	2	786	August 28, 2017
is kernel in stream 0 asynchronous? CUDA Programming and Performance	10	3825	April 23, 2011
Unable to synchronize with a specific stream CUDA Programming and Performance	1	7004	May 21, 2011
Memset/memcpyDtoD implicitly synchronizes all streams -- a way to disable it? CUDA Programming and Performance	5	676	August 23, 2023
cudaMemcpyAsync waiting for another unrelated cudaMemcpyAsync CUDA Programming and Performance cuda	10	203	December 10, 2024
Overlap cudaMemcpyAsync and kernel CUDA Programming and Performance	1	537	February 10, 2021

Why do some synchronization functions make cudaMemcpyAsync and a kernel in different streams run sequentially?

Related topics