If I allocate some data via dynamic global memory allocation by using operator new inside a kernel, I get cuda-memcheck errors when I try to access that data in a kernel launched in a non-default stream. Here’s the code:
#include <cuda_runtime.h>
#include <iostream>
// Kernel to allocate and initialize data on the CUDA device. This
// should be called in a single thread.
__global__ void newKernel(float** dataHandle) {
float* data = new float[32];
for (int i = 0; i < 32; ++i) data[i] = i;
*dataHandle = data;
}
// Kernel to deallocate data on the CUDA device
__global__ void deleteKernel(float* data) {
delete[] data;
}
// Kernel to use device-allocated data, just returning double the
// values in the result array
__global__ void useData(const float* data, float* result) {
result[threadIdx.x] = 2 * data[threadIdx.x];
}
// Error handling
void checkError() {
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
std::cout << "Error: " << cudaGetErrorName(err) << ": "
<< cudaGetErrorString(err) << std::endl;
}
}
// The host test routine
int main(int argc, const char** argv) {
// Allocate data on the device; copy pointer back to host for later use
float** dataHandle;
cudaMalloc(&dataHandle, sizeof(float*));
newKernel<<<1, 1>>>(dataHandle);
float* data;
cudaMemcpy(&data, dataHandle, sizeof(float*), cudaMemcpyDeviceToHost);
checkError();
// Use the data
float* devResult;
cudaMalloc(&devResult, 32 * sizeof(float));
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaDeviceSynchronize();
bool useStream = true;
if (useStream) {
useData<<<1, 32, 0, stream>>>(data, devResult);
}
else {
useData<<<1, 32>>>(data, devResult);
}
checkError();
// Clean up
cudaStreamDestroy(stream);
cudaFree(devResult);
deleteKernel<<<1, 1>>>(data);
cudaFree(dataHandle);
checkError();
return 0;
}
This runs to completion with no errors. However, cuda-memcheck reports cudaErrorIllegalAddress on the checkError in the “Clean up” block. I get a more precise error running in cuda-gdb with “set cuda memcheck on,” namely a Warp Illegal Address in the useData kernel. If I set the useStream variable to false, so useData runs in the default stream, then there are no memcheck errors. This is with a GT 1030, CUDA 8.0.61, Ubuntu 16.04.
I haven’t seen anything in the documentation or any other forum that heap memory can’t be accessed using non-default streams. What am I missing? Thanks in advance for any help.