The following is a part of the transpose code in cuda sdk example
// warmup so we don’t time CUDA startup
transpose_naive<<< grid, threads >>>(d_odata, d_idata, size_x, size_y);
transpose<<< grid, threads >>>(d_odata, d_idata, size_x, size_y);
int numIterations = 1;
printf("Transposing a %d by %d matrix of floats...\n", size_x, size_y);
// execute the kernel
cutStartTimer(timer);
for (int i = 0; i < numIterations; ++i)
{
transpose_naive<<< grid, threads >>>(d_odata, d_idata, size_x, size_y); //first is naive transpose :rolleyes:
}
cudaThreadSynchronize();
cutStopTimer(timer);
float naiveTime = cutGetTimerValue(timer);
// execute the kernel
cutResetTimer(timer);
cutStartTimer(timer);
for (int i = 0; i < numIterations; ++i)
{
transpose<<< grid, threads >>>(d_odata, d_idata, size_x, size_y); //second is optimized transpose :rolleyes:
}
cudaThreadSynchronize();
cutStopTimer(timer);
float optimizedTime = cutGetTimerValue(timer);
printf("Naive transpose average time: %0.3f ms\n", naiveTime / numIterations);
printf(“Optimized transpose average time: %0.3f ms\n\n”, optimizedTime / numIterations);
:mellow: I run the code and time the naive transpose and optimized transpose.
The naiveTime is about 17.6 ms, and the optimizedTime is 0.581 ms.
Then I change the code.
// warmup so we don’t time CUDA startup
transpose<<< grid, threads >>>(d_odata, d_idata, size_x, size_y);
transpose_naive<<< grid, threads >>>(d_odata, d_idata, size_x, size_y); // I change the order :rolleyes:
int numIterations = 1;
printf("Transposing a %d by %d matrix of floats...\n", size_x, size_y);
// execute the kernel
cutStartTimer(timer);
for (int i = 0; i < numIterations; ++i)
{
transpose<<< grid, threads >>>(d_odata, d_idata, size_x, size_y); //first is optimized transpose :rolleyes:
}
cudaThreadSynchronize();
cutStopTimer(timer);
float optimizedTime = cutGetTimerValue(timer);
// execute the kernel
cutResetTimer(timer);
cutStartTimer(timer);
for (int i = 0; i < numIterations; ++i)
{
Transpose_naive<<< grid, threads >>>(d_odata, d_idata, size_x, size_y); // second is naive transpose :rolleyes:
}
cudaThreadSynchronize();
cutStopTimer(timer);
float naiveTime = cutGetTimerValue(timer);
printf("Naive transpose average time: %0.3f ms\n", naiveTime / numIterations);
printf(“Optimized transpose average time: %0.3f ms\n\n”, optimizedTime / numIterations);
:rolleyes: Then I run the code and time the naiveTime and optimizedTime.
What is a big surprise!
The naiveTime is about 8.23ms, and the optimizedTime is about 10.13ms. :unsure:
In my expectation, the results should be same.
Why it is not? :wacko:
Call for help!!! Thank you very much!