As the Cuda document says, “It is not permitted for the CPU to access any managed allocations or variables while the GPU is active for devices with concurrentManagedAccess property set to 0. On these systems concurrent CPU/GPU accesses, even to different managed memory allocations, will cause a segmentation fault because the page is considered inaccessible to the CPU”.
I have a simple test code, memory is allocated with cudaMallocManaged and cudaMemAttachHost, CPU and GPU can concurrent access the managed memory and the code works fine。
so, why not this test code case segmentation fault ? Is it correct to use cudaMallocManaged like that?
#include <stdio.h>
#include <thread>
#include <unistd.h>
// error checking macro
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
const int DSIZE = 4096;
const int block_size = 256; // CUDA maximum is 1024
// vector add kernel: C = A + B
__global__ void vadd(const float *A, const float *B, float *C, int ds){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < ds)
C[idx] = A[idx] + B[idx];
}
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
void loop1 () {
cudaStream_t stream;
cudaStreamCreate(&stream);
while(1) {
vadd<<<(DSIZE+block_size-1)/block_size, block_size,1>>>(h_A, h_B, h_C, DSIZE);
cudaCheckErrors("kernel launch failure");
printf("A[0] = %f\n", h_A[0]);
printf("B[0] = %f\n", h_B[0]);
printf("C[0] = %f\n", h_C[0]);
}
}
int main(){
cudaMallocManaged(&h_A,DSIZE*sizeof(float),cudaMemAttachHost);
cudaMallocManaged(&h_B,DSIZE*sizeof(float),cudaMemAttachHost);
cudaMallocManaged(&h_C,DSIZE*sizeof(float),cudaMemAttachHost);
cudaMallocManaged(&d_A,DSIZE*sizeof(float),cudaMemAttachHost);
cudaMallocManaged(&d_B,DSIZE*sizeof(float),cudaMemAttachHost);
cudaCheckErrors("cudaMallocManaged fail");
for (int i = 0; i < DSIZE; i++){
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
h_C[i] = 0;}
std::thread thr(loop1);
sleep(1);
while(1) {
d_A[0] ++;
d_B[0] ++;
printf("---- A[0] = %f\n", h_A[0]);
printf("---- B[0] = %f\n", h_B[0]);
printf("---- C[0] = %f\n", h_C[0]);
printf("---- A[0] = %f\n", d_A[0]);
printf("---- B[0] = %f\n", d_B[0]);
}
while(1) {
sleep(1000);
}
return 0;
}