Why do some synchronization functions make cudaMemcpyAsync and a kernel in different streams run sequentially?

I tested my program on a GTX 280 under Windows 7 with CUDA 3.2.
This is my test code; it is a little long:
#include <cuda_runtime.h>
#include <cutil_inline.h>
#include <Windows.h>
#include <process.h>

// Semaphore released by the worker thread when it is done; main() waits on it.
HANDLE signal;
// Worker thread entry point; param points to the unsigned int device id to use.
void _cdecl test(void *param);

// Host buffers, indexed by device id, that receive the device-to-host copies.
// Must be pointers: they are filled in by cudaHostAlloc and released by cudaFreeHost.
unsigned int *host[2];
// Number of elements in each device buffer.
unsigned int num=50000000;
// Size of each buffer in bytes (200,000,000 for the value above, which fits in 32 bits).
unsigned int memsize=num*sizeof(int);
unsigned int t=500;//num/t equals to the total thread num in kernal

/*
 * Pointer-chasing kernel used as the compute workload of the experiment.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per output element.
 * Each thread starts at input[tid] and follows 1000 links (p = input[p]),
 * summing the visited values into output[tid].
 *
 * input  - device array; every entry a thread can reach must be a valid
 *          index into input.
 * output - device array with at least n elements.
 * n      - number of valid elements. Threads with tid >= n do nothing.
 *          Defaults to 0xFFFFFFFF so a launch that passes only
 *          (input, output) keeps its previous unguarded behaviour.
 */
__global__ void
kernal(unsigned int *input,unsigned int *output,unsigned int n=0xFFFFFFFFu)
{
    unsigned int tid=blockDim.x*blockIdx.x+threadIdx.x;
    // The grid is rounded up to whole blocks, so the tail threads of the last
    // block have no element to process and must not dereference anything.
    if(tid>=n)
        return;

    unsigned int temp=0; // accumulator; was previously read uninitialised
    unsigned int p=input[tid];
    for(int i=0;i<1000;i++)
    {
        temp+=input[p];
        p=input[p];
    }
    output[tid]=temp;
}

/*
 * Launches kernal on `stream` with one thread per element.
 *
 * input, output - device arrays passed through to the kernel.
 * num           - number of elements to process; 0 is a no-op.
 * stream        - stream the kernel is enqueued on.
 *
 * The launch is asynchronous; only launch-configuration errors are reported
 * here (through cutilSafeCall).
 */
void mykernal(unsigned int *input,unsigned int *output,unsigned int num,cudaStream_t &stream)
{
    if(num==0)
        return; // a grid with zero blocks is an invalid launch configuration

    const unsigned int threadNum=256;
    const unsigned int blockNum=(num+threadNum-1)/threadNum; // ceil(num/threadNum)
    kernal<<<blockNum,threadNum,0,stream>>>(input,output,num);
    // Kernel launches return no status; pick up bad-configuration errors here.
    cutilSafeCall(cudaGetLastError());
}
/*
 * Allocates the host buffer for device 0, starts the worker thread that runs
 * the experiment, and waits until the worker releases the semaphore.
 *
 * Returns 0 on success, 1 if the semaphore or the worker thread could not be
 * created.
 */
int main()
{
    // Lives on main's stack; safe to hand to the worker because main blocks
    // on the semaphore until the worker has finished.
    unsigned int deviceId[2]={0,1};

    signal=CreateSemaphore(NULL,0,2,NULL);
    if(signal==NULL)
    {
        fprintf(stderr,"CreateSemaphore failed (error %lu)\n",(unsigned long)GetLastError());
        return 1;
    }

    cutilSafeCall(cudaHostAlloc(&host[0],memsize,cudaHostAllocPortable));

    // _beginthread reports failure as -1; without this check a failed start
    // would leave main waiting on the semaphore forever.
    if(_beginthread(test,0,&deviceId[0])==(uintptr_t)-1)
    {
        fprintf(stderr,"_beginthread failed\n");
        cutilSafeCall(cudaFreeHost(host[0]));
        CloseHandle(signal);
        return 1;
    }

    WaitForSingleObject(signal,INFINITE);
    CloseHandle(signal);
    return 0;
}
// Fills table[0..count) with pseudo-random indices in [0,count), so every
// entry points at another entry of the same table. The seed is fixed to keep
// runs reproducible.
static void fillChaseTable(unsigned int *table,unsigned int count)
{
    srand(4);
    for(unsigned int i=0;i<count;i++)
    {
        // Multiply in unsigned arithmetic: rand()*73457 can exceed INT_MAX,
        // and signed overflow is undefined behaviour.
        table[i]=((unsigned int)rand()*73457u)%count;
    }
}

/*
 * Worker thread: measures how a kernel and a device-to-host cudaMemcpyAsync,
 * issued to two different streams, overlap under different synchronization
 * calls.
 *
 * param - pointer to an unsigned int holding the device id to run on.
 *
 * Writes one line per iteration ("<i>,<time1>,<time2>") to
 * GPU_<id>_all_eventSync.txt, frees host[deviceId] and the device resources,
 * and releases the global semaphore before returning.
 */
void _cdecl test(void *param)
{
    unsigned int deviceId=*(unsigned int *)param;
    cutilSafeCall(cudaSetDeviceFlags(cudaDeviceBlockingSync));
    cutilSafeCall(cudaSetDevice(deviceId));

    unsigned int *input1=NULL,*output=NULL;

    cutilSafeCall(cudaMalloc(&input1,memsize));
    cutilSafeCall(cudaMalloc(&output,memsize));

    cudaStream_t streamForCalc,streamForTransfer;
    cutilSafeCall(cudaStreamCreate(&streamForCalc));
    cutilSafeCall(cudaStreamCreate(&streamForTransfer));

    unsigned int timer;

    cutCreateTimer(&timer);
    cutResetTimer(timer);
    cudaEvent_t eventForCalc,eventForTransfer;

    //each element(index<num/t)point to another element in input1 array
    // Only the first num/t entries are ever filled and uploaded, so the host
    // staging table needs just that many elements.
    const unsigned int chaseCount=num/t;
    unsigned int *h=new unsigned int[chaseCount];
    fillChaseTable(h,chaseCount);
    cutilSafeCall(cudaMemcpy(input1,h,chaseCount*sizeof(unsigned int),cudaMemcpyHostToDevice));
    delete[] h;

    //Create cudaEvent
    cutilSafeCall(cudaEventCreateWithFlags(&eventForCalc,cudaEventBlockingSync));
    cutilSafeCall(cudaEventCreateWithFlags(&eventForTransfer,cudaEventBlockingSync));

    //open the log file
    char recordbuffer[128];
    sprintf(recordbuffer,"GPU_%d_all_eventSync.txt",(int)deviceId);
    FILE *logfile=fopen(recordbuffer,"w");
    if(logfile==NULL)
    {
        // Skip the measurements but still fall through to the cleanup below,
        // so the resources are freed and main() is woken up.
        fprintf(stderr,"cannot open log file %s\n",recordbuffer);
    }
    else
    {
        for(int i=0;i<10;i++)
        {
            fprintf(logfile,"%d",i);
            cutStartTimer(timer);
            //only data transfer
            //cutilSafeCall(cudaMemcpyAsync(host[deviceId],input1,memsize,cudaMemcpyDeviceToHost,streamForTransfer));
            //cutilSafeCall(cudaThreadSynchronize());
            //cutStopTimer(timer);
            //fprintf(logfile,",%f\n",cutGetTimerValue(timer));
            //cutResetTimer(timer);

            //only kernal launched
            //mykernal(input1,output,num/t,streamForCalc);
            //cutilSafeCall(cudaThreadSynchronize());
            //cutStopTimer(timer);
            //fprintf(logfile,",%f\n",cutGetTimerValue(timer));
            //cutResetTimer(timer);

            //kernal launched meanwhile data transfer,use cudaThreadSynchronize to synchronize
            //cutilSafeCall(cudaMemcpyAsync(host[deviceId],input1,memsize,cudaMemcpyDeviceToHost,streamForTransfer));
            //mykernal(input1,output,num/t,streamForCalc);
            //cutilSafeCall(cudaThreadSynchronize());
            //cutStopTimer(timer);
            //fprintf(logfile,",%f\n",cutGetTimerValue(timer));
            //cutResetTimer(timer);

            //kernal launched meanwhile data transfer,use cudaStreamSynchronize to synchronize
            //cutilSafeCall(cudaMemcpyAsync(host[deviceId],input1,memsize,cudaMemcpyDeviceToHost,streamForTransfer));
            //mykernal(input1,output,num/t,streamForCalc);
            //cutilSafeCall(cudaStreamSynchronize(streamForTransfer));
            //cutilSafeCall(cudaStreamSynchronize(streamForCalc));
            //cutStopTimer(timer);
            //fprintf(logfile,",%f\n",cutGetTimerValue(timer));
            //cutResetTimer(timer);

            //kernal launched meanwhile data transfer,use cudaEventRecord to synchronize
            mykernal(input1,output,num/t,streamForCalc);
            cutilSafeCall(cudaMemcpyAsync(host[deviceId],input1,memsize,cudaMemcpyDeviceToHost,streamForTransfer));

            // First value: time until the kernel's stream reaches the event.
            cutilSafeCall(cudaEventRecord(eventForCalc,streamForCalc));
            cutilSafeCall(cudaEventSynchronize(eventForCalc));
            cutStopTimer(timer);
            fprintf(logfile,",%f",cutGetTimerValue(timer));

            // The timer is restarted without a reset, so the second value
            // presumably also includes the first interval - confirm against
            // the cutil timer implementation.
            cutStartTimer(timer);

            cutilSafeCall(cudaEventRecord(eventForTransfer,streamForTransfer));
            cutilSafeCall(cudaEventSynchronize(eventForTransfer));
            cutStopTimer(timer);
            fprintf(logfile,",%f",cutGetTimerValue(timer));

            cutResetTimer(timer);
            fprintf(logfile,"\n");
        }
        fclose(logfile);
    }

    cutilSafeCall(cudaFreeHost(host[deviceId]));
    cutilSafeCall(cudaFree(input1));
    cutilSafeCall(cudaFree(output));
    cutilSafeCall(cudaEventDestroy(eventForCalc));
    cutilSafeCall(cudaEventDestroy(eventForTransfer));
    cutilSafeCall(cudaStreamDestroy(streamForCalc));
    cutilSafeCall(cudaStreamDestroy(streamForTransfer));
    ReleaseSemaphore(signal,1,NULL);

}//code end
My test results:
When only the kernel is launched, the time is about 41 ms.
When only cudaMemcpyAsync is called, the time is about 100 ms.
When the kernel is launched while cudaMemcpyAsync is in flight:
if (cudaThreadSynchronize is used to synchronize)
the time is 100 ms
if (cudaStreamSynchronize is used to synchronize)

the time is 141 ms // this is really strange
if (cudaEventSynchronize is used to synchronize)
{
if (cudaEventRecord(eventForTransfer, streamForTransfer) is called before cudaEventSynchronize(eventForCalc))
{
the result is 40, 100
}
if (cudaEventRecord(eventForTransfer, streamForTransfer) is called after cudaEventSynchronize(eventForCalc))
{
the result is 40, 140 // this is really strange too
}
}
In addition, I use VS2008.

Sorry, I made a mistake: I use VS2010.
Does this indicate a bug, or is there something wrong with my configuration?
I would greatly appreciate it if you could answer my question.