Hello guys .I’m trying to do call a kernel for matrix multiplication with zero copy (page-locked memory) but it seems there is a problem in calling the kernel don’t know what is it and the result on GPU calculation gave me zero 0000000.Does the cudaFreeHost ();
cause the problem here…
looking for help
//allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
//initialize host memory
Matrix_A(h_A, size_A);
Matrix_B(h_B, size_B);
//allocate device memory
float* d_A;
float* d_B;
//allocate host memory for the result C
unsigned int size_C = WC * HC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(mem_size_C);
//allocate device memory for the result
float* d_C;
cudaHostAlloc((void **)&h_A, mem_size_A , 0);
cudaHostAlloc((void **)&h_B, mem_size_B, 0);
//setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(WC / threads.x, HC / threads.y);
//execute the kernel
matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);
cudaHostAlloc((void **)&h_C, mem_size_A , 0);
cudaFreeHost(&h_A);
cudaFreeHost(&h_B);
cudaFreeHost(&h_C);
Thanks