Hi. Recently, I have been profiling the application using the nvbit instrument tool.
The goal is to use the Callback function to collect GPU device pointers in the cudamalloc() API, and to dump data that exist in addresses collected at the desired timing using the cudamemcpy.
Below is the code I wrote to collect the address of the cudamalloc() API.
std::map<const unsigned long long*, size_t> alloc_address_map;
void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid,
const char* name, void* params, CUresult* pStatus) {
pthread_mutex_lock(&mutex);
cudaDeviceSynchronize();
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
printf("CUDA error: %s\n", cudaGetErrorString(error));
} else {
printf("No CUDA error.\n");
}
assert(cudaGetLastError() == cudaSuccess);
if(cbid == API_CUDA_cuMemAlloc ||
cbid == API_CUDA_cuMemAlloc_v2 ) {
cuMemAlloc_params* p_0 = (cuMemAlloc_params*)params;
auto it = alloc_address_map.find((unsigned long long*)(p_0->dptr));
if(it != alloc_address_map.end()) {
it->second = _bytesize;
} else {
alloc_address_map.insert({(unsigned long long*)(p_0->dptr),_bytesize});
}
}
}
If I perform the cudamemcpy() with these collected addresses as src ptr, an error will occur and the code will be terminated. What’s the problem? I also considered the area that I released as cudafree().
for(auto iter : alloc_address_map)
CUDA_SAFECALL(cudaMemcpy(h_m, (void*)*(iter.first), 4096, cudaMemcpyDeviceToHost));
Thank you for reading.