...
...
...
cudaMalloc((void**)&d_total, n*(n-1)/2*sizeof(unsigned int));
for(float i=0; i<100; i+=0.0005)
{
...
...
...
RunKernel<<<blockNum, threadNum>>>(n, d_total, d_a, d_b, d_c, d_d, d_e, d_f);
cutStartTimer(hTimer);
cudaMemcpy(_total, d_total, n*(n-1)/2*sizeof(unsigned int), cudaMemcpyDeviceToHost);
cutStopTimer(hTimer);
}
cudaFree(d_total)
cudaFree...
...
...
Time consumption
.
.
.
25.351915
25.511745
25.542639
25.459280
25.367662
25.597145
25.694475
589.564331
589.044983
588.816833
592.139099
591.546204
591.162537
.
.
When "i" goes from 0 in steps of 0.0005 (i = 0, 0.0005, 0.001, 0.0015, …) up to i = 20, the time consumption for each iteration is about 25 milliseconds, but once "i" grows larger than 20, each iteration takes about 589 milliseconds.
I have a question:
The size of d_total is fixed: n*(n-1)/2*sizeof(unsigned int).
Why does the time consumption of the Memcpy change so dramatically?
Please forgive my poor English.
Many thanks.
Thank you.
:blink: