Please consider the following example code:
#include <stdio.h>
typedef struct{
int x[4];
} s1_t;
__device__ s1_t create_s1() {
s1_t s;
s.x[0] = 0;
s.x[1] = 0;
s.x[2] = 0;
s.x[3] = 0;
return s;
}
typedef struct{
s1_t *f0;
s1_t *f1;
s1_t *f2;
s1_t *f3;
s1_t *f4;
s1_t *f5;
s1_t *f6;
s1_t *f7;
s1_t *f8;
s1_t *f9;
s1_t *f10;
s1_t *f11;
s1_t *f12;
s1_t *f13;
s1_t *f14;
s1_t *f15;
s1_t *f16;
s1_t *f17;
s1_t *f18;
s1_t *f19;
s1_t *f20;
s1_t *f21;
s1_t *f22;
s1_t *f23;
s1_t *f24;
s1_t *f25;
s1_t *f26;
s1_t *f27;
s1_t *f28;
s1_t *f29;
s1_t *f30;
s1_t *f31;
s1_t *f32;
s1_t *f33;
s1_t *f34;
s1_t *f35;
s1_t *f36;
s1_t *f37;
s1_t *f38;
s1_t *f39;
s1_t *f40;
s1_t *f41;
s1_t *f42;
s1_t *f43;
s1_t *f44;
s1_t *f45;
s1_t *f46;
s1_t *f47;
s1_t *f48;
s1_t *f49;
s1_t *f50;
s1_t *f51;
s1_t *f52;
s1_t *f53;
s1_t *f54;
s1_t *f55;
s1_t *f56;
s1_t *f57;
s1_t *f58;
s1_t *f59;
s1_t *f60;
s1_t *f61;
s1_t *f62;
s1_t *f63;
#ifdef _USE65
s1_t *f64;
#endif
} f_container_t;
f_container_t *f_container = NULL;
__global__ void init_f_kernel (int N, f_container_t *fc){
size_t tid = threadIdx.x + blockDim.x * blockIdx.x;
if (tid >= N) return;
fc->f0[tid] = create_s1();
fc->f1[tid] = create_s1();
fc->f2[tid] = create_s1();
fc->f3[tid] = create_s1();
fc->f4[tid] = create_s1();
fc->f5[tid] = create_s1();
fc->f6[tid] = create_s1();
fc->f7[tid] = create_s1();
fc->f8[tid] = create_s1();
fc->f9[tid] = create_s1();
fc->f10[tid] = create_s1();
fc->f11[tid] = create_s1();
fc->f12[tid] = create_s1();
fc->f13[tid] = create_s1();
fc->f14[tid] = create_s1();
fc->f15[tid] = create_s1();
fc->f16[tid] = create_s1();
fc->f17[tid] = create_s1();
fc->f18[tid] = create_s1();
fc->f19[tid] = create_s1();
fc->f20[tid] = create_s1();
fc->f21[tid] = create_s1();
fc->f22[tid] = create_s1();
fc->f23[tid] = create_s1();
fc->f24[tid] = create_s1();
fc->f25[tid] = create_s1();
fc->f26[tid] = create_s1();
fc->f27[tid] = create_s1();
fc->f28[tid] = create_s1();
fc->f29[tid] = create_s1();
fc->f30[tid] = create_s1();
fc->f31[tid] = create_s1();
fc->f32[tid] = create_s1();
fc->f33[tid] = create_s1();
fc->f34[tid] = create_s1();
fc->f35[tid] = create_s1();
fc->f36[tid] = create_s1();
fc->f37[tid] = create_s1();
fc->f38[tid] = create_s1();
fc->f39[tid] = create_s1();
fc->f40[tid] = create_s1();
fc->f41[tid] = create_s1();
fc->f42[tid] = create_s1();
fc->f43[tid] = create_s1();
fc->f44[tid] = create_s1();
fc->f45[tid] = create_s1();
fc->f46[tid] = create_s1();
fc->f47[tid] = create_s1();
fc->f48[tid] = create_s1();
fc->f49[tid] = create_s1();
fc->f50[tid] = create_s1();
fc->f51[tid] = create_s1();
fc->f52[tid] = create_s1();
fc->f53[tid] = create_s1();
fc->f54[tid] = create_s1();
fc->f55[tid] = create_s1();
fc->f56[tid] = create_s1();
fc->f57[tid] = create_s1();
fc->f58[tid] = create_s1();
fc->f59[tid] = create_s1();
fc->f60[tid] = create_s1();
fc->f61[tid] = create_s1();
fc->f62[tid] = create_s1();
fc->f63[tid] = create_s1();
#ifdef _USE65
fc->f64[tid] = create_s1();
#endif
}
int main (int argc, char *argv[]) {
int batch_size = 5040;
cudaMallocManaged((void**)&f_container, sizeof(f_container_t*));
cudaMallocManaged((void**)&f_container->f0, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f1, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f2, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f3, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f4, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f5, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f6, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f7, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f8, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f9, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f10, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f11, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f12, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f13, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f14, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f15, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f16, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f17, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f18, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f19, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f20, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f21, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f22, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f23, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f24, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f25, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f26, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f27, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f28, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f29, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f30, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f31, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f32, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f33, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f34, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f35, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f36, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f37, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f38, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f39, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f40, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f41, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f42, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f43, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f44, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f45, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f46, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f47, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f48, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f49, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f50, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f51, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f52, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f53, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f54, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f55, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f56, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f57, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f58, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f59, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f60, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f61, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f62, batch_size * sizeof(s1_t));
cudaMallocManaged((void**)&f_container->f63, batch_size * sizeof(s1_t));
#ifdef _USE65
cudaMallocManaged((void**)&f_container->f64, batch_size * sizeof(s1_t));
#endif
init_f_kernel<<<40,128>>> (batch_size, f_container);
cudaDeviceSynchronize();
printf ("cudaState: %s\n", cudaGetErrorString(cudaGetLastError()));
}
I have a struct with arrays of other structs in it. I first use cudaMallocManaged to allocate all these components on the device, then I use a kernel to initialize the fields to zero with a device function. The number of threads per block is 128 and the number of blocks is 40. These are arbitrary values that I took from the original program.
There are 64 arrays in the f_container
, respectively 65 if _USE65
is set. The program works fine with 64 fields (cudaState: no error
). But with 65 fields (and beyond) I get cudaState: an illegal memory access was encountered
. I see no obvious error and I’m suspecting that I’m running into some limitations of managed memory (64 seems suspicious). Am I missing out on something or might this be a bug?
Additional infos:
- I compile with nvcc version 12.6.20. No additional flags are used.
- Reducing the number threads per block (and increasing blocks accordingly) does not have an effect,
- Omitting the device function in favor of setting the values directly does not have an effect.