In the host code, I measured the total time of these six kernel launches with the CUDA timer. On Windows 7 the result is about 1 ms, and on Linux the result is 0.17 ms. But in the output of the Visual Profiler, the total time consumed by these kernels is 0.13 ms! Why are there such big differences between these results? Thanks for your help!
// Times a six-stage GPU pipeline (FFT, transpose, solver, transpose, FFT,
// post-process) with a host-side wall-clock timer.
//
// NOTE: a host timer measures launch overhead and every synchronization
// round trip in addition to kernel execution, so its result is expected to
// be larger than the sum of per-kernel times reported by a profiler.

// Select the cache configuration before timing starts so this host-side
// call is not counted in the measurement; it only has to precede the
// tri_solver launch.
cudaFuncSetCacheConfig(tri_solver, cudaFuncCachePreferShared);

// Drain any previously queued GPU work so it is not attributed to stage 1.
cudaThreadSynchronize();

cutCreateTimer(&timer);
cutStartTimer(timer);

// Stage 1: double-precision real-to-complex FFT, cubfft -> fft_result.
cufftExecD2Z(Plan,cubfft,fft_result);
cudaThreadSynchronize();

// Stage 2: tri_transpose, fft_result -> cu_result.
tri_transpose<<<grid,threads>>>(fft_result,cu_result, argu);
cudaThreadSynchronize();

// Stage 3: tri_solver on cu_result.
tri_solver<<<grid2,threads2>>>(cu_result,cu_act_trace,cu_tri_main);
cudaThreadSynchronize();

// Stage 4: tri_transpose2, cu_result -> cubfft.
tri_transpose2<<<grid,threads>>>(cu_result, cubfft);
cudaThreadSynchronize();

// Stage 5: second real-to-complex FFT, cubfft -> fft_result.
cufftExecD2Z(Plan,cubfft,fft_result);
cudaThreadSynchronize();

// Stage 6: post_process, fft_result -> cu_result.
post_process<<<grid2,threads2>>>(fft_result,cu_result,argu);
// A kernel launch returns to the host before the kernel has finished;
// wait here so the timer actually includes post_process execution.
cudaThreadSynchronize();

cutStopTimer(timer);