Hi,
I’m profiling this code and I see that every odd allocation of 1MB of pinned memory takes 600-800us, while every even allocation takes 4-5us.
Compiling it like this and then running with nvprof:
nvcc -m64 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_60,code=compute_60 -lnvToolsExt host_malloc.cu -o cudaHostAllocTest
Any idea why this is happening?
#include "cuda_runtime.h"
#include "nvToolsExt.h"
#include <stdio.h>
#include <iostream>
using namespace std;
const uint32_t colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff };
const int num_colors = sizeof(colors)/sizeof(uint32_t);
int main()
{
cout << "Main" << std::endl;
const int N = 1000;
const size_t ALLOCATION_SIZE = 1024 * 1024;
int *data[N];
// Just to warm up the cuda runtime.
int *pdummy;
cudaMalloc((void **)&pdummy, 4);
char name[100];
nvtxEventAttributes_t eventAttrib;
for (int i = 0; i < N; i++)
{
int color_id = i;
color_id = color_id % num_colors;
eventAttrib.colorType = NVTX_COLOR_ARGB;
eventAttrib.color = colors;
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
sprintf(name, "cudaHostAlloc.%d", i);
eventAttrib.message.ascii = name;
nvtxRangePushEx(&eventAttrib);
cudaHostAlloc(&(data[i]), ALLOCATION_SIZE, cudaHostAllocPortable);
nvtxRangePop();
}
for (int i = 0; i < N; i++)
{
data[i][i % 10] = i;
}
for (int i = 0; i < N; i++)
cudaFreeHost(data[i]);
cudaDeviceReset();
return 0;
}