Has anybody had successful experience with CUDA 4.0's new cudaHostRegister feature? I searched for it but could not find clear sample code. Here is my problematic code. After calling cudaHostGetDevicePointer((void **) &devPtrA, (void *) A, 0), devPtrA is still NULL, so the kernel does not run successfully.
// printf() in device code is only supported for devices of compute capability 2.0 and above.
/*
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
#define printf(f, ...) ((void)(f, __VA_ARGS__), 0)
#endif
*/
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
#include <cuda_runtime.h>

#define SIZE 10
// Element-wise vector addition kernel: C[i] = A[i] + B[i].
//
// Launch layout: a single 1-D block. Each thread handles the element whose
// index equals its threadIdx.x; blockIdx is not consulted, so launching more
// than one block makes every block redo the same elements.
//
// Precondition: there is no bounds check, so blockDim.x must not exceed the
// number of elements in A, B and C, and all three pointers must be
// dereferenceable from device code.
__global__ void vecAdd(float* A, float* B, float* C) {
    // threadIdx.x is a built-in variable provided by CUDA at runtime.
    int i = threadIdx.x;
    C[i] = A[i] + B[i];
    // Debug aid (device-side printf requires compute capability 2.0+):
    // printf("A[%d]=%f, B[%d]=%f, C[%d]=%f\n", i, A[i], i, B[i], i, C[i]);
}
// Prints which step failed and terminates when a CUDA runtime call did not
// return cudaSuccess. Without this, a failed cudaHostRegister() goes unnoticed
// and only shows up later as a NULL device pointer.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Returns the first address at or after `raw` that is a multiple of
// `alignment`. `alignment` must be a power of two.
static float* alignUp(void* raw, size_t alignment) {
    uintptr_t addr = (uintptr_t) raw;
    addr = (addr + alignment - 1) & ~((uintptr_t) alignment - 1);
    return (float*) addr;
}

// Demonstrates zero-copy access to malloc'ed host memory: three host buffers
// are pinned and mapped with cudaHostRegister(), vecAdd runs directly on their
// device aliases, and the result is printed from the host buffer.
//
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
int main() {
    // CUDA 4.0 requires both the pointer and the size passed to
    // cudaHostRegister() to be aligned to the 4 KB host page size; plain
    // malloc() guarantees neither. Later toolkits relax this, but page-aligned,
    // page-sized regions are accepted everywhere.
    const size_t kPageSize = 4096;

    int N = SIZE;
    size_t memsize = SIZE * sizeof(float);
    // Registration size: memsize rounded up to a whole number of pages.
    size_t regsize = (memsize + kPageSize - 1) / kPageSize * kPageSize;

    // Get properties and verify device 0 supports mapped memory.
    cudaDeviceProp deviceProp;
    checkCuda(cudaGetDeviceProperties(&deviceProp, 0), "cudaGetDeviceProperties");
    if (!deviceProp.canMapHostMemory) {
        fprintf(stderr, "Device %d cannot map host memory!\n", 0);
        exit(EXIT_FAILURE);
    }

    // Set the device flags for mapping host memory.
    checkCuda(cudaSetDeviceFlags(cudaDeviceMapHost), "cudaSetDeviceFlags");

    // Over-allocate by one page so a page-aligned window of regsize bytes
    // always fits inside each allocation. Giving every buffer its own whole
    // pages also keeps the three registrations from sharing a page.
    void* rawA = malloc(regsize + kPageSize);
    void* rawB = malloc(regsize + kPageSize);
    void* rawC = malloc(regsize + kPageSize);
    if (rawA == NULL || rawB == NULL || rawC == NULL) {
        fprintf(stderr, "Host memory allocation failed!\n");
        exit(EXIT_FAILURE);
    }
    float* A = alignUp(rawA, kPageSize);
    float* B = alignUp(rawB, kPageSize);
    float* C = alignUp(rawC, kPageSize);

    checkCuda(cudaHostRegister(A, regsize, cudaHostRegisterMapped), "cudaHostRegister(A)");
    checkCuda(cudaHostRegister(B, regsize, cudaHostRegisterMapped), "cudaHostRegister(B)");
    checkCuda(cudaHostRegister(C, regsize, cudaHostRegisterMapped), "cudaHostRegister(C)");

    for (int i = 0; i < SIZE; i++) {
        A[i] = B[i] = i;
    }

    // Device-side aliases of the registered host buffers.
    float* devPtrA = NULL;
    float* devPtrB = NULL;
    float* devPtrC = NULL;
    checkCuda(cudaHostGetDevicePointer((void**) &devPtrA, (void*) A, 0),
              "cudaHostGetDevicePointer(A)");
    checkCuda(cudaHostGetDevicePointer((void**) &devPtrB, (void*) B, 0),
              "cudaHostGetDevicePointer(B)");
    checkCuda(cudaHostGetDevicePointer((void**) &devPtrC, (void*) C, 0),
              "cudaHostGetDevicePointer(C)");

    // One block of N threads: vecAdd indexes by threadIdx.x only.
    vecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
    // Launch-configuration errors are reported by cudaGetLastError(); faults
    // during execution are reported by the synchronizing call that follows.
    checkCuda(cudaGetLastError(), "vecAdd launch");
    // The kernel writes straight into host memory, so wait for it to finish
    // before reading C on the host.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    for (int i = 0; i < SIZE; i++)
        printf("C[%d]=%f\n", i, C[i]);

    checkCuda(cudaHostUnregister(A), "cudaHostUnregister(A)");
    checkCuda(cudaHostUnregister(B), "cudaHostUnregister(B)");
    checkCuda(cudaHostUnregister(C), "cudaHostUnregister(C)");

    // Free the pointers malloc() returned, not the aligned views into them.
    free(rawA);
    free(rawB);
    free(rawC);
    return 0;
}