Thanks for the answers.
@txbob
I will open a new specific thread for NPP function overhead related to my problem.
@njuffa and @HannesF99
A warm-up phase is present, so I don't think that is the explanation.
Here is the code. It is partially derived from the sample HannesF99 presented in https://devtalk.nvidia.com/default/topic/963440/cudamalloc-pitch-significantly-slower-on-windows-with-geforce-drivers-gt-350-12/
#include <chrono>
#include <ctime>
#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>
#include <cuda.h>
#include <cuda_runtime_api.h>
// Minimal host-side stopwatch.
//
// The clock starts when the object is constructed. timeElapsed() may be called
// any number of times and always reports the time since construction.
class timer
{
private:
    // steady_clock is monotonic, so a measured interval cannot come out
    // negative or distorted if the system wall clock is adjusted mid-run.
    std::chrono::steady_clock::time_point m_start;

public:
    timer() :
        m_start(std::chrono::steady_clock::now())
    {}

    // Returns the number of seconds elapsed since construction.
    double timeElapsed() const
    {
        const std::chrono::duration<double> elapsed_seconds =
            std::chrono::steady_clock::now() - m_start;
        return elapsed_seconds.count();
    }
};
void testCudaMallocPitch(bool bFree)
{
// force context creation
cudaFree(0);
// width and height of image 'k', in byte -> for an image with ~ 1 MB, 10 MB, 20 MB and 400 MB
int width[4] = { 1000, 3000, 6000, 12000 };
int height[4] = { 1000, 3200, 3200, 32000 };
cudaEvent_t start, stop;
void* ptr;
size_t dummy2;
// warm-up
cudaError_t er = cudaMallocPitch(&ptr, &dummy2, 1000, 1000);
if (cudaSuccess != er)
throw std::runtime_error("cudaMallocPitch");
er = cudaFree(ptr);
if (cudaSuccess != er)
throw std::runtime_error("cudaFree");
// events
cudaEventCreate(&start);
cudaEventCreate(&stop);
// now measure the runtime for a cudaMallocPitch of an image with ~ 1 MB, 20 MB and 400 MB
// two measure are taken for each allocation size
for (int k = 0; k < sizeof(width) / sizeof(width[0]); ++k)
{
std::cout << "Allocation of one image with " << (width[k] * height[k] / 1e6) << " MB\n";
for (int jj = 0; jj < 3; ++jj)
{
timer StartCounter;
void* bufferPtr = 0;
size_t pitch = 0;
cudaEventRecord(start, 0);
er = cudaMallocPitch(&bufferPtr, &pitch, width[k], height[k]);
cudaEventRecord(stop, 0);
if (er != cudaSuccess)
throw std::runtime_error("cudaMallocPitch");
double osTime = StartCounter.timeElapsed();
float cudaTimeInMs=0;
cudaEventSynchronize(stop);
cudaEventElapsedTime(&cudaTimeInMs, start, stop);
if (bFree)
{
er = cudaFree(bufferPtr);
if (cudaSuccess != er)
throw std::runtime_error("cudaFree");
}
std::cout << "cuda time : " << cudaTimeInMs << " ms ";
std::cout << "os time : " << osTime * 1000 << " ms \n";
}
}
cudaEventDestroy(stop);
cudaEventDestroy(start);
}
void testcudaGetDeviceProperties()
{
// force context creation
cudaFree(0);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// warmup
struct cudaDeviceProp prop;
cudaError_t e = cudaGetDeviceProperties(&prop, 0);
// now measure the runtime for a cudaGetDeviceProperties of an image with ~ 1 MB, 20 MB and 400 MB
for (int k = 0; k < 10; ++k)
{
timer StartCounter;
cudaEventRecord(start);
cudaError_t e = cudaGetDeviceProperties(&prop, 0);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
double osTime = StartCounter.timeElapsed();
float cudaTimeInMs;
cudaEventElapsedTime(&cudaTimeInMs, start, stop);
std::cout << "cuda time : " << cudaTimeInMs << " ms ";
std::cout << "os time : " << osTime * 1000 << " ms \n";
}
cudaEventDestroy(stop);
cudaEventDestroy(start);
}
// Runs the three benchmarks in order: cudaGetDeviceProperties, then
// cudaMallocPitch with and without an intervening cudaFree.
//
// Returns 0 on success. If a benchmark throws, the message is written to
// stderr and 1 is returned, instead of letting the exception escape main and
// terminate the process without a diagnostic.
int main()
{
    try
    {
        std::cout << "\t cudaGetDeviceProperties\n\n";
        testcudaGetDeviceProperties();
        std::cout << "\n\n\t testCudaMallocPitch with cudaFree\n\n";
        testCudaMallocPitch(true);
        std::cout << "\n\n\t testCudaMallocPitch without cudaFree\n\n";
        testCudaMallocPitch(false);
    }
    catch (const std::exception& ex)
    {
        std::cerr << "error: " << ex.what() << '\n';
        return 1;
    }
    return 0;
}
The program output for my configuration is:
cudaGetDeviceProperties
cuda time : 0.001632 ms os time : 0.999552 ms
cuda time : 0.001632 ms os time : 0.962961 ms
cuda time : 0.001632 ms os time : 0.97724 ms
cuda time : 0.001632 ms os time : 0.958945 ms
cuda time : 0.001632 ms os time : 0.958499 ms
cuda time : 0.001632 ms os time : 1.14145 ms
cuda time : 0.001728 ms os time : 2.98393 ms
cuda time : 0.001632 ms os time : 1.34582 ms
cuda time : 0.001664 ms os time : 1.05979 ms
cuda time : 0.001664 ms os time : 1.05533 ms
testCudaMallocPitch with cudaFree
Allocation of one image with 1 MB
cuda time : 0.001632 ms os time : 1.14458 ms
cuda time : 0.001632 ms os time : 0.599731 ms
cuda time : 0.001632 ms os time : 0.589914 ms
Allocation of one image with 9.6 MB
cuda time : 0.001664 ms os time : 2.10486 ms
cuda time : 0.001632 ms os time : 5.35072 ms
cuda time : 0.001632 ms os time : 3.01918 ms
Allocation of one image with 19.2 MB
cuda time : 0.001632 ms os time : 4.22935 ms
cuda time : 0.001664 ms os time : 9.38463 ms
cuda time : 0.001632 ms os time : 5.33645 ms
Allocation of one image with 384 MB
cuda time : 0.00176 ms os time : 71.0601 ms
cuda time : 0.001632 ms os time : 98.4781 ms
cuda time : 0.001632 ms os time : 97.8061 ms
testCudaMallocPitch without cudaFree
Allocation of one image with 1 MB
cuda time : 0.001664 ms os time : 0.86479 ms
cuda time : 0.001632 ms os time : 0.556001 ms
cuda time : 0.001728 ms os time : 0.564925 ms
Allocation of one image with 9.6 MB
cuda time : 0.001664 ms os time : 2.09504 ms
cuda time : 0.001632 ms os time : 2.10798 ms
cuda time : 0.001664 ms os time : 2.13431 ms
Allocation of one image with 19.2 MB
cuda time : 0.001632 ms os time : 4.01962 ms
cuda time : 0.00176 ms os time : 5.31993 ms
cuda time : 0.001632 ms os time : 3.94778 ms
Allocation of one image with 384 MB
cuda time : 0.001632 ms os time : 75.2377 ms
cuda time : 0.001632 ms os time : 71.7473 ms
cuda time : 0.001664 ms os time : 71.6317 ms
- The average cudaGetDeviceProperties time measured with the OS high-precision timer is about 1 ms, which is too long for this function.
- Another problem arises: the same interval measured using CUDA events is very different (wrong).
- Nsight shows that the OS time is the correct one. This is an extract from the Nsight log:
function Duration(μs)
cudaEventRecord 7.279
cudaGetDeviceProperties 1036.729
cudaEventRecord 6.644
cudaEventSynchronize 48.578
- The cudaMallocPitch time depends on the allocation size and on the allocation history: the timing differs depending on whether or not a cudaFree is executed after each cudaMallocPitch.
- The same problem occurs when using CUDA events for timing. Nsight again shows that the OS time is the right one.
Did I forget something when using CUDA events?
Thanks
results.xlsx (20.3 KB)