does anybody have experience on cudaHostRegister zero copy memory

Anybody has suceesful experience on cuda 4.0’s new feature cudaHostRegister? I google this, but I don’t find very clear good sample code. Here is my problematic code. After calling cudaHostGetDevicePointer((void **) &devPtrA, (void *) A, 0), I found devPtrA is still NULL. So the kernel was not called successfully.

// printf() is only supported // for devices of compute capability 2.0 and above

/*

#if defined(CUDA_ARCH) && (CUDA_ARCH < 200)

#define printf(f, …) ((void)(f, VA_ARGS),0)

#endif

*/

#include <stdio.h>

#define SIZE 10

#include <cuda.h>

// Kernel definition, see also section 4.2.3 of Nvidia Cuda Programming Guide

global void vecAdd(float* A, float* B, float* C) {

// threadIdx.x is a built-in variable  provided by CUDA at runtime

int i = threadIdx.x;

//	A[i] = 0;

//	B[i] = i;

C[i] = A[i] + B[i];

// printf(“A[%d]=%f, B[%d]=%f, C[%d]=%f\n”, i, A[i], i, B[i], i, C[i]);

}

int main() {

int N = SIZE;

int memsize = SIZE * sizeof(float);

cudaDeviceProp deviceProp;

// Get properties and verify device 0 supports mapped memory

cudaGetDeviceProperties(&deviceProp, 0);

if (!deviceProp.canMapHostMemory) {

	fprintf(stderr, "Device %d cannot map host memory!\n", 0);

	exit(EXIT_FAILURE);

}

// set the device flags for mapping host memory

cudaSetDeviceFlags(cudaDeviceMapHost);

//  float A, B, C;

float * A, *B, *C;

float *devPtrA,	 *devPtrB,  *devPtrC;

 A = (float*) malloc(memsize);

 B = (float*) malloc(memsize);

 C = (float*) malloc(memsize);

 cudaHostRegister(A, memsize, cudaHostRegisterMapped);

 cudaHostRegister(B, memsize, cudaHostRegisterMapped);

 cudaHostRegister(C, memsize, cudaHostRegisterMapped);

for (int i = 0; i < SIZE; i++) {

	A[i] = B[i] = i;

}

cudaHostGetDevicePointer((void **) &devPtrA, (void *) A, 0);

cudaHostGetDevicePointer((void **) &devPtrB, (void *) B, 0);

cudaHostGetDevicePointer((void **) &devPtrC, (void *) C, 0);



vecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);

cudaDeviceSynchronize();

for (int i = 0; i < SIZE; i++)

	printf("C[%d]=%f\n", i, C[i]);

cudaHostUnregister(A);

cudaHostUnregister(B);

cudaHostUnregister(C);

 free(A);

 free(B);

 free(C);

}

AFAIR only whole pages can be mapped, so both the address and the size need to be page aligned.

In this example, how should I align both the address and size?

Use valloc or posix_memalign, round up the size of the array to be a multiple of the page size.

Something similar to this:

printf("Registering memory \n");

  numbytes =(((int)(NBYTES)+4095)/4096)*4096;

  a_h = valloc(NBYTES);

  if ( cudaHostRegister(a_h, numbytes,0) !=0) printf( "cudaHostRegister(%d) failed\n",numbytes);

Thanks a lot. It works right now.

I post the right code here, in case somebody need.

#include <stdio.h>

#define SIZE 10

#include <cuda.h>

// Kernel definition, see also section 4.2.3 of Nvidia Cuda Programming Guide

__global__ void vecAdd(float* A, float* B, float* C) {

	// threadIdx.x is a built-in variable provided by CUDA at runtime

	int i = threadIdx.x;

	//	A[i] = 0;

	//	B[i] = i;

	C[i] = A[i] + B[i];

	printf("Kernel: A[%d]=%f, B[%d]=%f, C[%d]=%f\n", i, A[i], i, B[i], i, C[i]);

}

int main() {

	int N = SIZE;

//	round up the size of the array to be a multiple of the page size

	size_t memsize = ((SIZE * sizeof(float) + 4095) / 4096) * 4096;

	cudaDeviceProp deviceProp;

	// Get properties and verify device 0 supports mapped memory

	cudaGetDeviceProperties(&deviceProp, 0);

	if (!deviceProp.canMapHostMemory) {

		fprintf(stderr, "Device %d cannot map host memory!\n", 0);

		exit(EXIT_FAILURE);

	}

	// set the device flags for mapping host memory

	cudaSetDeviceFlags(cudaDeviceMapHost);

	float * A, *B, *C;

	float *devPtrA, *devPtrB, *devPtrC;

//	use valloc instead of malloc

	A = (float*) valloc(memsize);

	B = (float*) valloc(memsize);

	C = (float*) valloc(memsize);

	cudaHostRegister(A, memsize, cudaHostRegisterMapped);

	cudaHostRegister(B, memsize, cudaHostRegisterMapped);

	cudaHostRegister(C, memsize, cudaHostRegisterMapped);

	for (int i = 0; i < SIZE; i++) {

		A[i] = B[i] = i;

	}

	cudaHostGetDevicePointer((void **) &devPtrA, (void *) A, 0);

	cudaHostGetDevicePointer((void **) &devPtrB, (void *) B, 0);

	cudaHostGetDevicePointer((void **) &devPtrC, (void *) C, 0);

	vecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);

	cudaDeviceSynchronize();

	for (int i = 0; i < SIZE; i++)

		printf("C[%d]=%f\n", i, C[i]);

	cudaHostUnregister(A);

	cudaHostUnregister(B);

	cudaHostUnregister(C);

	free(A);

	free(B);

	free(C);

}

I migrate this approach to my big project.
I don’t need to copy the data between host and device memory this time. But I find GPU kernel runs slower than before. That means the kernel access very slow on the pinned host memory by cudaHostRegister().
So if the kernel runs pinned memory very slow, what is the benefit of this approach? Does anybody get better performance after using this approach? Or what problem could be in my kernel?

BTW, my GPU kernel is just SpMV, it is faster than CUSPARSE lib’s spmv function. Is CUSPARSE lib 4.0 optimized for pinned memory ?

The kernel is expected to run slower with mapped host memory, is the PCIe link has both far lower bandwidth and higher latency as device memory.

The performance degradation is smallest if each memory location is only read once, and if the kernel can tolerate large latencies. In that case even a small performance benefit because the PCIe bandwidth is available in addition to the device memory bandwidth. But as the device memory bandwidth is so much larger than PCIe bandwidth, the potential benefit is small.

Mapped memory is mostly used for other reasons though (avoid copying large amounts of data if only few of those values are actually used, allow a single kernel to work on more memory than available on the device, overlap host<->device transfer with kernel execution, overlap host->device and device-> host transfer on non-Tesla devices).

In my project, there are thousands of iterations. In every iteration, I need to copy a vector (very huge) from host to device, and then calculate the kernel, and finally copy result back to host. After profiling, I find the memory copy takes even longer than kernel calculation. So I am trying to use some technique to avoid frequently memory copy. It seems like this approach does not applicable for my project.

How big is the vector?
Do you need all the elements or can you process just a fraction of them? If you could do the latter, then you can use double buffering.
If v(i) is a subset of the vector:

  1. send v(1)
  2. process v(1), while you are sending v(2)
  3. process v(2), while you are retrieving v(1) and sending v(3)

    i) process v(i), while you are retrieving v(i-1) and sending v(i+1)

    n) process v(n),while you are retrieving v(n-1)
    n+1) retrieve v(n)

In this way, you can hide most of the I/O time using streams. Step 3 assumes a card with dual DMA engines ( Tesla or Quadro).
You should pick the size of v(i) such that the processing for v(i) is equal to the transfer time for v(i-1) and/or v(i+1).