Error using CUDA mapped memory: a test program for mapped memory

Hi, guys,

I wrote a test program for the use of mapped memory. However, it failed and I don’t know why.

Here is the source code:

#include "cuda_runtime.h"

#include <stdlib.h>

#include <stdio.h>

int main(void) {
	// Test program for mapped (zero-copy) pinned host memory.
	//
	// Allocates a small pinned host array with cudaHostAllocMapped, obtains
	// the matching device pointer, fills the array on the host, and then
	// copies it back through the device pointer to check that both pointers
	// refer to the same data.
	//
	// Returns EXIT_SUCCESS when every step succeeds, EXIT_FAILURE otherwise.
	// Each failure is reported together with the CUDA error string and stops
	// the program, so later steps never touch an invalid pointer or print an
	// uninitialised buffer.
	unsigned int *h_array = NULL, *d_array = NULL, *h_array_test = NULL;
	cudaError_t err;
	const int num = 5;
	const size_t size = num * sizeof(unsigned int);
	int status = EXIT_FAILURE;

	err = cudaSetDevice(0);
	if (err != cudaSuccess) {
		printf("Failed to select device 0: %s\n", cudaGetErrorString(err));
		return EXIT_FAILURE;
	}

	// set device flag: host memory mapping must be enabled before the
	// mapped allocation below
	err = cudaSetDeviceFlags(cudaDeviceMapHost);
	if (err != cudaSuccess) {
		printf("Failed to set device flags: %s\n", cudaGetErrorString(err));
		return EXIT_FAILURE;
	}

	// allocate pinned memory
	err = cudaHostAlloc((void**)&h_array, size, cudaHostAllocMapped);
	if (err != cudaSuccess) {
		printf("Failed to allocate pinned memory: %s\n", cudaGetErrorString(err));
		return EXIT_FAILURE;
	}

	// get device ptr
	err = cudaHostGetDevicePointer((void**)&d_array, h_array, 0);
	if (err != cudaSuccess) {
		printf("Failed to get device pointer: %s\n", cudaGetErrorString(err));
		cudaFreeHost(h_array);
		return EXIT_FAILURE;
	}

	// initialize test data
	for (int i = 0; i < num; i++) {
		h_array[i] = i;
	}

	// output array in host memory
	printf("array in host memory:\n");
	for (int i = 0; i < num; i++) {
		printf("%d: %u\n", i, h_array[i]);
	}
	printf("\n");

	// read the same data back through the device pointer
	h_array_test = (unsigned int*)malloc(size);
	if (h_array_test == NULL) {
		printf("Failed to allocate host test buffer\n");
		cudaFreeHost(h_array);
		return EXIT_FAILURE;
	}

	// NOTE(review): this copy was reported to fail on a compute capability
	// 1.3 device; on such setups the mapped pointer may only be usable from
	// a kernel. Confirm against the toolkit and GPU in use.
	err = cudaMemcpy(h_array_test, d_array, size, cudaMemcpyDeviceToHost);
	if (err == cudaSuccess) {
		printf("array in device memory:\n");
		for (int i = 0; i < num; i++) {
			printf("%d: %u\n", i, h_array_test[i]);
		}
		printf("\n");
		status = EXIT_SUCCESS;
	} else {
		// Do not print h_array_test here: it was never written.
		printf("Failed to copy device memory: %s\n", cudaGetErrorString(err));
	}

	free(h_array_test);
	cudaFreeHost(h_array);
	return status;
}

And the result is as follows:

array in host memory:

0: 0

1: 1

2: 2

3: 3

4: 4

Failed to copy device memory

array in device memory:

0: -1163005939

1: -1163005939

2: -1163005939

3: -1163005939

4: -1163005939

Apparently, the mapping was not successful. Can anyone help me out? Lots of thanks!

BTW: my GPU is Quadro FX3800 and the compute capability is 1.3, which supports page-locked memory mapping

There is an example in the SDK that uses pinned memory. Does it work on your system?

Yes, I just tried simpleZeroCopy, and it works! So there must be something wrong with my program. I still cannot find out what it is :verymad:

I just modified my code a little bit, by adding a kernel call which utilizes the mapped memory. This time it works!

// Writes each element's own index into arr: arr[i] = i for 0 <= i < N.
// Indexing uses only the x dimension of the grid and block; threads whose
// global index is N or larger do nothing.
__global__ void modifyArray(unsigned int *arr, int N)
{
	// Flat global thread index along x.
	const int tid = blockIdx.x*blockDim.x + threadIdx.x;

	// Guard: surplus threads past the end of the array exit immediately.
	if (tid >= N)
		return;

	arr[tid] = tid;
}

int main(void) {
	// Test program for mapped (zero-copy) pinned host memory, kernel variant.
	//
	// Allocates a small pinned host array with cudaHostAllocMapped, zeroes it
	// on the host, lets the modifyArray kernel write through the matching
	// device pointer, and then prints the array again via the host pointer.
	//
	// Returns EXIT_SUCCESS when every step succeeds, EXIT_FAILURE otherwise.
	// Each failure is reported together with the CUDA error string and stops
	// the program instead of continuing with invalid state.
	unsigned int *h_array = NULL, *d_array = NULL;
	cudaError_t err;
	const int num = 5;
	const size_t size = num * sizeof(unsigned int);
	int status = EXIT_FAILURE;

	err = cudaSetDevice(0);
	if (err != cudaSuccess) {
		printf("Failed to select device 0: %s\n", cudaGetErrorString(err));
		return EXIT_FAILURE;
	}

	// set device flag: host memory mapping must be enabled before the
	// mapped allocation below
	err = cudaSetDeviceFlags(cudaDeviceMapHost);
	if (err != cudaSuccess) {
		printf("Failed to set device flags: %s\n", cudaGetErrorString(err));
		return EXIT_FAILURE;
	}

	// allocate pinned memory
	err = cudaHostAlloc((void**)&h_array, size, cudaHostAllocMapped);
	if (err != cudaSuccess) {
		printf("Failed to allocate pinned memory: %s\n", cudaGetErrorString(err));
		return EXIT_FAILURE;
	}

	// get device ptr
	err = cudaHostGetDevicePointer((void**)&d_array, (void*)h_array, 0);
	if (err != cudaSuccess) {
		printf("Failed to get device pointer: %s\n", cudaGetErrorString(err));
		cudaFreeHost(h_array);
		return EXIT_FAILURE;
	}

	// initialize test data (h_array is a host pointer, so plain memset is fine)
	memset(h_array, 0, size);

	// output array in host memory
	printf("array in host memory:\n");
	for (int i = 0; i < num; i++) {
		printf("%d: %u\n", i, h_array[i]);
	}
	printf("\n");

	// call kernel: one block of num threads, one thread per element
	dim3 grid, block;
	grid.x = 1;
	block.x = num;
	modifyArray<<<grid,block>>>(d_array, num);

	// A launch does not return a status itself: pick up launch errors first,
	// then wait for the kernel so its writes are complete before the host
	// reads h_array. cudaThreadSynchronize is kept for old toolkits; newer
	// ones name the same operation cudaDeviceSynchronize.
	err = cudaGetLastError();
	if (err == cudaSuccess)
		err = cudaThreadSynchronize();

	if (err == cudaSuccess) {
		// output the same array through the host pointer after the kernel
		printf("array in host memory after kernel call:\n");
		for (int i = 0; i < num; i++) {
			printf("%d: %u\n", i, h_array[i]);
		}
		printf("\n");
		status = EXIT_SUCCESS;
	} else {
		printf("Kernel launch or execution failed: %s\n", cudaGetErrorString(err));
	}

	cudaFreeHost(h_array);
	return status;
}

The result is:

array in host memory:

0: 0

1: 0

2: 0

3: 0

4: 0

array in host memory after kernel call:

0: 0

1: 1

2: 2

3: 3

4: 4

Does this mean that only a kernel call initiates the automatic data transfer between the device and the host?