Sorry for the super basic question, but I had a c++ application where I performed a memcopy between memory allocated with cudaMallocHost and a regular ‘newed’ array, and I noticed it took a lot longer than when I did memcopy between two memory locations which were both allocated with cudaMallocHost.
I then ran some experiments and found if I simply try and memcopy a second time between the cudaMallocHost and normal ‘newed’ memory location, it was basically as fast as the two cudaMallocHost locations.
I also ran an experiment where I tried to ‘page lock’ the ‘regular newed’ memory location, and that didn’t seem to help.
Do you folks know what would theoretically be causing this?
Note: This is being run on google colab.
%%writefile memcopytest.cu
#include <iostream>
#include <chrono>
#include "cuda_fp16.h"
#include <sys/mman.h>
using namespace std;
using namespace std::chrono;
#define TWO_GIGS (2U*1024U*1024U*1024U)
int main()
{
unsigned int numOfFloatsForFourGigs = TWO_GIGS / sizeof(float);
float *myValues = new float[numOfFloatsForFourGigs];
float *myValues2 = NULL;
float *myValues3 = NULL;
cudaMallocHost(&myValues2, sizeof(float) * numOfFloatsForFourGigs);
cudaMallocHost(&myValues3, sizeof(float) * numOfFloatsForFourGigs);
/////////////////////////////////////////////////////
// Time a memcopy between 2 cuda managed arrays
auto start = high_resolution_clock::now();
memcpy(myValues2, myValues3, sizeof(float) * numOfFloatsForFourGigs);
auto end = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(end - start);
cout << "Managed Memory Memcpy Duration: " << duration.count() / 1000 << " milliseconds" << endl;
//////////////////////////////////////////////////////
//////////////////////////////////////////////////////
// Time a memcopy between cuda managed array, and regular array.
start = high_resolution_clock::now();
memcpy(myValues, myValues3, sizeof(float) * numOfFloatsForFourGigs);
end = high_resolution_clock::now();
duration = duration_cast<microseconds>(end - start);
cout << "Regular Memcpy Duration: " << duration.count() / 1000 << " milliseconds" << endl;
//////////////////////////////////////////////////////
//////////////////////////////////////////////////////
// Copy again
// Time a memcopy between cuda managed array, and regular array.
start = high_resolution_clock::now();
memcpy(myValues, myValues3, sizeof(float) * numOfFloatsForFourGigs);
end = high_resolution_clock::now();
duration = duration_cast<microseconds>(end - start);
cout << "Second Memcpy Duration: " << duration.count() / 1000 << " milliseconds" << endl;
//////////////////////////////////////////////////////
return 0;
}
Here is the code for the pagelock attempt.
%%writefile memcopytest.cu
#include <iostream>
#include <chrono>
#include "cuda_fp16.h"
#include <sys/mman.h>
using namespace std;
using namespace std::chrono;
#define TWO_GIGS (2U*1024U*1024U*1024U)
int main()
{
unsigned int numOfFloatsForFourGigs = TWO_GIGS / sizeof(float);
float *myValues = new float[numOfFloatsForFourGigs];
float *myValues2 = NULL;
float *myValues3 = NULL;
cudaMallocHost(&myValues2, sizeof(float) * numOfFloatsForFourGigs);
cudaMallocHost(&myValues3, sizeof(float) * numOfFloatsForFourGigs);
/////////////////////////////////////////////////////
// Time a memcopy between 2 cuda managed arrays
auto start = high_resolution_clock::now();
memcpy(myValues2, myValues3, sizeof(float) * numOfFloatsForFourGigs);
auto end = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(end - start);
cout << "Managed Memory Memcpy Duration: " << duration.count() / 1000 << " milliseconds" << endl;
//////////////////////////////////////////////////////
//////////////////////////////////////////////////////
// Try mlock to see if page locked memory speeds up memcpy
mlock(myValues, sizeof(float) * numOfFloatsForFourGigs);
// Time a memcopy between cuda managed array, and regular array.
start = high_resolution_clock::now();
memcpy(myValues, myValues3, sizeof(float) * numOfFloatsForFourGigs);
end = high_resolution_clock::now();
duration = duration_cast<microseconds>(end - start);
cout << "Memlock Memcpy Duration: " << duration.count() / 1000 << " milliseconds" << endl;
//////////////////////////////////////////////////////
//////////////////////////////////////////////////////
// Time a memcopy between cuda managed array, and regular array.
start = high_resolution_clock::now();
memcpy(myValues, myValues3, sizeof(float) * numOfFloatsForFourGigs);
end = high_resolution_clock::now();
duration = duration_cast<microseconds>(end - start);
cout << "Regular Memcpy Duration: " << duration.count() / 1000 << " milliseconds" << endl;
//////////////////////////////////////////////////////
return 0;
}
Results for test 1
==12191== NVPROF is profiling process 12191, command: ./memcopytest
Managed Memory Memcpy Duration: 406 milliseconds
Regular Memcpy Duration: 1410 milliseconds
Second Memcpy Duration: 401 milliseconds
==12191== Profiling application: ./memcopytest
==12191== Profiling result:
No kernels were profiled.
Type Time(%) Time Calls Avg Min Max Name
API calls: 99.99% 2.16435s 2 1.08218s 1.00506s 1.15929s cudaHostAlloc
0.01% 166.68us 114 1.4620us 102ns 65.721us cuDeviceGetAttribute
0.00% 10.923us 1 10.923us 10.923us 10.923us cuDeviceGetName
0.00% 5.2240us 1 5.2240us 5.2240us 5.2240us cuDeviceGetPCIBusId
0.00% 1.2260us 3 408ns 145ns 885ns cuDeviceGetCount
0.00% 716ns 2 358ns 116ns 600ns cuDeviceGet
0.00% 412ns 1 412ns 412ns 412ns cuModuleGetLoadingMode
0.00% 397ns 1 397ns 397ns 397ns cuDeviceTotalMem
0.00% 217ns 1 217ns 217ns 217ns cuDeviceGetUuid
results for test 2
==11924== NVPROF is profiling process 11924, command: ./memcopytest
Managed Memory Memcpy Duration: 401 milliseconds
Memlock Memcpy Duration: 1413 milliseconds
Regular Memcpy Duration: 405 milliseconds
==11924== Profiling application: ./memcopytest
==11924== Profiling result:
No kernels were profiled.
Type Time(%) Time Calls Avg Min Max Name
API calls: 99.99% 2.15809s 2 1.07905s 999.56ms 1.15853s cudaHostAlloc
0.01% 138.46us 114 1.2140us 103ns 61.015us cuDeviceGetAttribute
0.00% 11.250us 1 11.250us 11.250us 11.250us cuDeviceGetName
0.00% 5.3160us 1 5.3160us 5.3160us 5.3160us cuDeviceGetPCIBusId
0.00% 1.6700us 3 556ns 149ns 1.3090us cuDeviceGetCount
0.00% 792ns 2 396ns 149ns 643ns cuDeviceGet
0.00% 566ns 1 566ns 566ns 566ns cuDeviceTotalMem
0.00% 480ns 1 480ns 480ns 480ns cuModuleGetLoadingMode
0.00% 248ns 1 248ns 248ns 248ns cuDeviceGetUuid
Thanks for the help!!