Hi all:
I am new to CUDA, so maybe I am just making a trivial mistake. There is some strange behavior of cudaMalloc and/or cudaMemcpy. In the program attached below, I allocate a large array in GPU global memory, set it to a fixed value using one thread per array element, and then copy it back to the CPU. The problem is that whenever the array size is larger than 128^3 — e.g. 128^3 + 64 (to make the size a multiple of CUDA_BLOCK_SIZE) — the elements after index 128^3 have strange values and the test fails. In this test, the memory required is ~8 MB and the grid size is 32769, so neither seems to be the limiting factor. Furthermore, there is no such problem if I run this code in device-emulation (deviceemu) mode.
I tested this routine on both a Quadro 5600 (Linux, Red Hat 5) and a GeForce 8600 (MacBook Pro). Both gave the same results. Any thoughts are highly appreciated!
To compile the following program, type "nvcc -lcudart main.cu".
main.cu:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>  /* fabs() is used in main's verification loop */

/* Threads per block; 64 is a multiple of the 32-thread warp size. */
#define CUDA_BLOCK_SIZE 64

/* Kernel prototype. The __global__ qualifier (the forum markup stripped the
 * double underscores, leaving "global", which does not compile) marks a
 * function launched from the host and executed on the device. */
__global__ void myKernel(float *ap, int size);
/*
 * Allocates a device array, fills it with 0.1f via myKernel (one thread per
 * element), copies it back, and verifies every element on the host.
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation, launch,
 * or copy error.
 */
int main()
{
    /* 128*128*128 + 64 elements (~8 MB of floats); the "+ 64" keeps the
     * total an exact multiple of CUDA_BLOCK_SIZE. The multiplication
     * asterisks were eaten by the forum markup in the original post. */
    const int size = 128 * 128 * 128 + 64;

    float *ap = NULL;  /* device buffer (the '*' was also lost in the post) */
    if (cudaMalloc((void **)&ap, sizeof(float) * size) != cudaSuccess) {
        printf("cudaMalloc fail.\n");
        exit(EXIT_FAILURE);
    }

    /* Ceiling division so a final partial block is still launched when size
     * is not a multiple of CUDA_BLOCK_SIZE; the kernel's bounds check
     * silences the excess tail threads. */
    const int blocks = (size + CUDA_BLOCK_SIZE - 1) / CUDA_BLOCK_SIZE;
    myKernel<<<blocks, CUDA_BLOCK_SIZE>>>(ap, size);

    /* Kernel launches are asynchronous and return no status themselves;
     * launch-configuration errors must be fetched explicitly. */
    if (cudaGetLastError() != cudaSuccess) {
        printf("kernel launch failed.\n");
        exit(EXIT_FAILURE);
    }

    float *bp = (float *)malloc(sizeof(float) * size);  /* host buffer */
    if (!bp) {
        printf("malloc fail.\n");
        exit(EXIT_FAILURE);
    }

    /* A blocking cudaMemcpy synchronizes with the preceding kernel, so any
     * asynchronous execution error also surfaces here. */
    if (cudaMemcpy(bp, ap, sizeof(float) * size,
                   cudaMemcpyDeviceToHost) != cudaSuccess) {
        printf("cudaMemcpy failed.\n");
        exit(EXIT_FAILURE);
    }

    /* Verify against the value the kernel wrote (0.1f); compare with a
     * tolerance because 0.1 is not exactly representable in binary. */
    for (int i = 0; i < size; i++)
        if (fabs(bp[i] - 0.1f) > 1e-5) printf("fail at %d, ", i);
    printf("\n");

    free(bp);
    cudaFree(ap);
    return 0;
}
/*
 * Sets every element of ap[0..size) to 0.1f, one thread per element.
 * Expects a 1-D launch with blockDim.x == CUDA_BLOCK_SIZE and
 * gridDim.x * CUDA_BLOCK_SIZE >= size; threads past the end are guarded.
 * (The __global__ qualifier lost its underscores in the forum post.)
 */
__global__ void myKernel(float *ap, int size)
{
    /* Flat global index of this thread within the 1-D grid. */
    const int igrid = blockIdx.x * CUDA_BLOCK_SIZE + threadIdx.x;

    /* Bounds guard: the grid may contain more threads than elements. */
    if (igrid < size)
        ap[igrid] = 0.1f;  /* 'f' suffix avoids a double->float conversion */
}