To illustrate the issue, here is a little program showing the problem. It might be that I’m overlooking some addressing mode or something like that.
The output of this program:
__global__ void kernel(int *value) {
printf("kernel value = %p\n", value);
*value = 2;
}
int main() {
int *value = NULL, *devValue = NULL;
cudaDeviceProp prop;
cudaSetDevice(0);
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaGetDeviceProperties(&prop, 0);
printf("prof.unifiedAddressing = %i\n", prop.unifiedAddressing);
unsigned int flags = cudaHostAllocDefault;
cudaHostAlloc((void **)&value, sizeof(int), flags);
cudaHostGetDevicePointer(&devValue, value, 0);
printf("value = %p, devValue = %p\n", value, devValue);
*value = 1;
printf("value = %i\n", *value);
kernel<<<1, 1>>>(value);
cudaDeviceSynchronize();
printf("value = %i\n", *value);
cudaFreeHost(value);
}
is as expected, with cuda-memcheck not giving any errors:
========= CUDA-MEMCHECK
prof.unifiedAddressing = 1
value = 0x200100000, devValue = 0x200100000
value = 1
kernel value = 0x200100000
value = 2
========= ERROR SUMMARY: 0 errors
Changing the flag on line 17 to cudaHostAllocWriteCombined, changes the value and devValue pointers to
value = 0x2b7378b1f000, devValue = 0x200100000
Which then obviously creates errors when the kernel is being executed:
========= Invalid __global__ write of size 4
========= at 0x00000148 in /direct/nanofun/mdereus/projects/gpuMatrixMem/test.cu:11:kernel(int*)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x2b7378b1f000 is out of bounds
I don’t understand why in the first situation the pointers are equal and in the second they are different.