Struct allocated with cudaMallocManaged fails up from a certain size

Please consider the following example code:

#include <stdio.h>
typedef struct{
    int x[4];
 } s1_t;

__device__ s1_t create_s1() {
   s1_t s;
   s.x[0] = 0;
   s.x[1] = 0;
   s.x[2] = 0;
   s.x[3] = 0;
   return s;
 }

typedef struct{
   s1_t *f0;
   s1_t *f1;
   s1_t *f2;
   s1_t *f3;
   s1_t *f4;
   s1_t *f5;
   s1_t *f6;
   s1_t *f7;
   s1_t *f8;
   s1_t *f9;
   s1_t *f10;
   s1_t *f11;
   s1_t *f12;
   s1_t *f13;
   s1_t *f14;
   s1_t *f15;
   s1_t *f16;
   s1_t *f17;
   s1_t *f18;
   s1_t *f19;
   s1_t *f20;
   s1_t *f21;
   s1_t *f22;
   s1_t *f23;
   s1_t *f24;
   s1_t *f25;
   s1_t *f26;
   s1_t *f27;
   s1_t *f28;
   s1_t *f29;
   s1_t *f30;
   s1_t *f31;
   s1_t *f32;
   s1_t *f33;
   s1_t *f34;
   s1_t *f35;
   s1_t *f36;
   s1_t *f37;
   s1_t *f38;
   s1_t *f39;
   s1_t *f40;
   s1_t *f41;
   s1_t *f42;
   s1_t *f43;
   s1_t *f44;
   s1_t *f45;
   s1_t *f46;
   s1_t *f47;
   s1_t *f48;
   s1_t *f49;
   s1_t *f50;
   s1_t *f51;
   s1_t *f52;
   s1_t *f53;
   s1_t *f54;
   s1_t *f55;
   s1_t *f56;
   s1_t *f57;
   s1_t *f58;
   s1_t *f59;
   s1_t *f60;
   s1_t *f61;
   s1_t *f62;
   s1_t *f63;
#ifdef _USE65
   s1_t *f64;
#endif
} f_container_t;

f_container_t *f_container = NULL;

__global__ void init_f_kernel (int N, f_container_t *fc){
  size_t tid = threadIdx.x + blockDim.x * blockIdx.x;
  if (tid >= N) return;
   fc->f0[tid] = create_s1();
   fc->f1[tid] = create_s1();
   fc->f2[tid] = create_s1();
   fc->f3[tid] = create_s1();
   fc->f4[tid] = create_s1();
   fc->f5[tid] = create_s1();
   fc->f6[tid] = create_s1();
   fc->f7[tid] = create_s1();
   fc->f8[tid] = create_s1();
   fc->f9[tid] = create_s1();
   fc->f10[tid] = create_s1();
   fc->f11[tid] = create_s1();
   fc->f12[tid] = create_s1();
   fc->f13[tid] = create_s1();
   fc->f14[tid] = create_s1();
   fc->f15[tid] = create_s1();
   fc->f16[tid] = create_s1();
   fc->f17[tid] = create_s1();
   fc->f18[tid] = create_s1();
   fc->f19[tid] = create_s1();
   fc->f20[tid] = create_s1();
   fc->f21[tid] = create_s1();
   fc->f22[tid] = create_s1();
   fc->f23[tid] = create_s1();
   fc->f24[tid] = create_s1();
   fc->f25[tid] = create_s1();
   fc->f26[tid] = create_s1();
   fc->f27[tid] = create_s1();
   fc->f28[tid] = create_s1();
   fc->f29[tid] = create_s1();
   fc->f30[tid] = create_s1();
   fc->f31[tid] = create_s1();
   fc->f32[tid] = create_s1();
   fc->f33[tid] = create_s1();
   fc->f34[tid] = create_s1();
   fc->f35[tid] = create_s1();
   fc->f36[tid] = create_s1();
   fc->f37[tid] = create_s1();
   fc->f38[tid] = create_s1();
   fc->f39[tid] = create_s1();
   fc->f40[tid] = create_s1();
   fc->f41[tid] = create_s1();
   fc->f42[tid] = create_s1();
   fc->f43[tid] = create_s1();
   fc->f44[tid] = create_s1();
   fc->f45[tid] = create_s1();
   fc->f46[tid] = create_s1();
   fc->f47[tid] = create_s1();
   fc->f48[tid] = create_s1();
   fc->f49[tid] = create_s1();
   fc->f50[tid] = create_s1();
   fc->f51[tid] = create_s1();
   fc->f52[tid] = create_s1();
   fc->f53[tid] = create_s1();
   fc->f54[tid] = create_s1();
   fc->f55[tid] = create_s1();
   fc->f56[tid] = create_s1();
   fc->f57[tid] = create_s1();
   fc->f58[tid] = create_s1();
   fc->f59[tid] = create_s1();
   fc->f60[tid] = create_s1();
   fc->f61[tid] = create_s1();
   fc->f62[tid] = create_s1();
   fc->f63[tid] = create_s1();
#ifdef _USE65
   fc->f64[tid] = create_s1();
#endif
}

int main (int argc, char *argv[]) {
   int batch_size = 5040;
   cudaMallocManaged((void**)&f_container, sizeof(f_container_t*));
   cudaMallocManaged((void**)&f_container->f0, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f1, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f2, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f3, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f4, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f5, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f6, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f7, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f8, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f9, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f10, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f11, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f12, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f13, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f14, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f15, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f16, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f17, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f18, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f19, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f20, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f21, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f22, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f23, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f24, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f25, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f26, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f27, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f28, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f29, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f30, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f31, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f32, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f33, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f34, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f35, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f36, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f37, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f38, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f39, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f40, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f41, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f42, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f43, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f44, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f45, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f46, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f47, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f48, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f49, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f50, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f51, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f52, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f53, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f54, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f55, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f56, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f57, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f58, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f59, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f60, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f61, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f62, batch_size * sizeof(s1_t));
   cudaMallocManaged((void**)&f_container->f63, batch_size * sizeof(s1_t));
#ifdef _USE65
   cudaMallocManaged((void**)&f_container->f64, batch_size * sizeof(s1_t));
#endif
   init_f_kernel<<<40,128>>> (batch_size, f_container);
   cudaDeviceSynchronize();
   printf ("cudaState: %s\n", cudaGetErrorString(cudaGetLastError()));
}

I have a struct with arrays of other structs in it. I first use cudaMallocManaged to allocate all these components on the device, then I use a kernel to initialize the fields to zero with a device function. The number of threads per block is 128 and the number of blocks is 40. These are arbitrary values that I took from the original program.
There are 64 arrays in the f_container, respectively 65 if _USE65 is set. The program works fine with 64 fields (cudaState: no error). But with 65 fields (and beyond) I get cudaState: an illegal memory access was encountered. I see no obvious error and I’m suspecting that I’m running into some limitations of managed memory (64 seems suspicious). Am I missing out on something or might this be a bug?

Additional infos:

  • I compile with nvcc version 12.6.20. No additional flags are used.
  • Reducing the number threads per block (and increasing blocks accordingly) does not have an effect,
  • Omitting the device function in favor of setting the values directly does not have an effect.

I think this gives the size of a pointer to f_container (8 bytes), can you try without the * (65 * 8 bytes)?

Perhaps you mixed it up with the also possible sizeof(*f_container).

Probably the managed memory is allocated in chunks of 512 bytes or 64 pointers with 8 bytes each and the 65th pointer does not fit, if the requested size is not large enough.

1 Like

^ Yes.

Even without defining the _USE65, the code will fail as-is under compute-sanitizer, which does more precise out-of-bounds checking. Changing this:

cudaMallocManaged((void**)&f_container, sizeof(f_container_t*));

to this

cudaMallocManaged((void**)&f_container, sizeof(f_container_t));

fixes both cases (with or without _USE65, with or without compute-sanitizer) according to my testing.

Yes, that’s it. I just wrote it like I would in a pure CPU-code, where it is obvious that it needs to be that way. Thanks a lot.

I am still a bit puzzled that it made no problems with a smaller number of fields. I checked the working program with compute-sanitizer and got 5042 errors (more than were displayed). I just assumed that this is something else because there was no CUDA error, and ignored compute-sanitizer for the failing problem then.

As said, the granularity of managed memory allocations is probably 512 bytes.