Illegal access when using UM

Hello, I am seeing a strange illegal access error when using UM (unified memory) in a cooperative groups kernel.

The kernel and launch code are as follows:

__global__
void kern(void * mm) {
    int * p = (int*)mm+0x3FFFF000; // last 4kb of the 1gb um buffer
    *p = 0; // illegal access here
}

int main() {
...
void * mm;
cudaMallocManaged(&mm,1024*1024*1024);
...
// setup launch params
int i;
cudaLaunchParams * lparams = ...

void ** params = (void**)malloc(1*sizeof(void*));
params[0] = (void*)&mm;

// devices is 4 here
for(i=0;i<devices;i++) {
    ...
    lparams[i].func = (void*)kern;
    lparams[i].args = params;
}

cudaLaunchCooperativeKernelMultiDevice(lparams,devices);

...

}

It is odd, because if I initialize the last 4kb page on the host before launching the kernel (see the example below), I get no access violation. Am I missing something here for UM usage?
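
For reference, the host-side initialization is something like this (illustrative, not my exact code):

// in main(), right after cudaMallocManaged(&mm, 1024*1024*1024)
// (memset from <cstring>): touch the last 4kb page of the managed buffer on the host
memset((char*)mm + (1024*1024*1024 - 4096), 0, 4096);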

This allocates 1 Gigabyte:

cudaMallocManaged(&mm,1024*1024*1024);

This appears to extend well beyond 1 Gigabyte:

(int*)mm+0x3FFFF000;

As far as I know, in C++ operator precedence, a cast binds more tightly than ordinary arithmetic. Therefore mm is first reinterpreted as an int pointer, and then it is offset by 0x3FFFF000 int quantities. That would extend out to roughly 4 Gigabytes, because my calculator tells me that:

0x3FFFF000 = 1,073,737,728

so, when that offset is applied to an int pointer, the corresponding byte offset is 1,073,737,728*sizeof(int) = 4,294,950,912, i.e. roughly 4 Gigabytes.
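
A quick host-side computation (illustration only, not part of your code) confirms that byte offset:

#include <cstdio>

int main() {
    // (int*)mm + 0x3FFFF000 offsets the pointer by 0x3FFFF000 ints,
    // i.e. by 0x3FFFF000 * sizeof(int) bytes
    unsigned long long byte_offset = 0x3FFFF000ULL * sizeof(int);
    printf("byte offset = %llu\n", byte_offset);   // prints 4294950912, well past 1 GB
    return 0;
}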

Since you’ve not provided a complete test case, I can only offer comments based on what you have posted. This may or may not be related to your observations, and I can’t explain the behavior of code you haven’t shown. I encourage people who want help to provide complete test cases.

You are right, I am sorry; it was an error when copying and pasting. The correct code is as follows:

(int*)mm+0x3FFFF000/4;
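
Equivalently, just to make the intended address explicit, the same last page can be reached by doing the byte arithmetic first, for example:

int * p = (int*)((char*)mm + 0x3FFFF000); // byte offset 0x3FFFF000, i.e. the last 4kb of the 1gb buffer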

I didn’t have any problems when running a test case built around the code you have shown:

$ nvidia-smi
Mon Mar 11 09:32:46 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.xx       Driver Version: 418.xx       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla V100-PCIE...  On   | 00000000:05:00.0 Off |                    0 |
| N/A   33C    P0    24W / 250W |      0MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   34C    P0    24W / 250W |      0MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-PCIE...  On   | 00000000:84:00.0 Off |                    0 |
| N/A   33C    P0    25W / 250W |      0MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   3  Tesla V100-PCIE...  On   | 00000000:85:00.0 Off |                    0 |
| N/A   36C    P0    30W / 250W |      0MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
$ cat t470.cu
#include <cooperative_groups.h>
#include <assert.h>

namespace cg = cooperative_groups;

__global__
void kern(void * mm) {
    int * p = (int*)mm+0x3FFFF000/4; // last 4kb of the 1gb um buffer
    *p = 1; // illegal access here
}

int main() {
  void * mm;
  cudaMallocManaged(&mm,1024*1024*1024);
// setup launch params
  const int devices = 4;
  cudaLaunchParams * lparams = (cudaLaunchParams *)malloc(sizeof(cudaLaunchParams) * devices);
  cudaStream_t streams[devices];
  int ccminor, ccmajor;
  for (int i = 0; i < devices; i++) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    if (i==0){ccminor = prop.minor; ccmajor = prop.major;}
    if (i > 0){
      assert(prop.minor == ccminor);
      assert(prop.major == ccmajor);}
    assert(prop.concurrentManagedAccess != 0);
    assert(prop.cooperativeMultiDeviceLaunch != 0);
    cudaSetDevice(i);
    cudaStreamCreate(streams+i);}
  void ** params = (void**)malloc(1*sizeof(void*));
  params[0] = (void*)&mm;

// devices is 4 here
  for(int i=0;i<devices;i++) {
    memset(lparams+i, 0, sizeof(cudaLaunchParams));
    lparams[i].func = (void*)kern;
    lparams[i].gridDim = 1;
    lparams[i].blockDim = 1;
    lparams[i].stream = streams[i];
    lparams[i].args = params;
  }
  cudaSetDevice(0);
  cudaLaunchCooperativeKernelMultiDevice(lparams,devices, cudaCooperativeLaunchMultiDeviceNoPreSync | cudaCooperativeLaunchMultiDeviceNoPostSync);
  for (int i = 0; i < devices; i++){
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
  }
  return 0;
}
$ nvcc -rdc=true -o t470 t470.cu -lcudadevrt
$ cuda-memcheck ./t470
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

My guess is the problem lies in something you haven’t shown or haven’t described.