Question about CUDA_SAFE_CALL(cudaMemcpy(hostPx, devicePx, ...))

I've got a strange problem.

CUDA_SAFE_CALL(cudaMemcpy(hostPx, devicePx, sizeof(float) * ARRAYSIZE, cudaMemcpyDeviceToHost));

I've used this function many times, but this time I don't know why it doesn't work.
In EmuRelease mode the code works fine and I get the right result.

But when I switch to Release mode, it seems the data isn't actually copied from the device; the host array still holds the values from my initialization.

I'm wondering if anybody can give me a clue how to solve this problem.

I haven't run into this problem before, even though I've used this call in several projects already. I compared this project with the successful ones, but I couldn't spot any difference.

:-)

Thanks a lot for any comments and suggestions.

I have the same problem. I want to copy an integer array from device to host with cudaMemcpy. Everything works fine in emu mode, but when I compile a release build nothing is copied. At first I suspected the debug mode and the CUDA_SAFE_CALL macro, but all it does (as far as I understand) is check for errors and print them to stderr if something goes wrong.
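As far as I can tell from cutil.h, the macro boils down to something roughly like this (my reconstruction from memory, not the exact SDK source). Note that when _DEBUG is not defined, i.e. in release builds, it reduces to the bare call, so any error goes unreported:

#ifdef _DEBUG
#  define CUDA_SAFE_CALL(call) do {                                        \
       cudaError_t err = call;                                             \
       if (cudaSuccess != err) {                                           \
           fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",   \
                   __FILE__, __LINE__, cudaGetErrorString(err));           \
           exit(EXIT_FAILURE);                                             \
       }                                                                   \
   } while (0)
#else
#  define CUDA_SAFE_CALL(call) call
#endif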

So I don't know what else to do. Maybe it's a general problem/bug in CUDA. Any help is appreciated.

Double-check that your kernel completes successfully and doesn't crash.
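Something along these lines should catch a failed launch or a crashed kernel (just a sketch; myKernel, grid, block, and args are placeholders; kernel launches are asynchronous, so you have to synchronize before an execution error shows up):

myKernel<<<grid, block>>>(args);
cudaError_t err = cudaGetLastError();      // catches launch-configuration errors
if (err == cudaSuccess)
    err = cudaThreadSynchronize();         // waits for the kernel; catches execution errors
if (err != cudaSuccess)
    fprintf(stderr, "Kernel failed: %s\n", cudaGetErrorString(err));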

Paulius

Sorry, but I'm really lost here. I still have the problem and have checked everything for errors. Here is my code. Am I doing something wrong?

unsigned int test_size = 256;

unsigned char * d_test= new unsigned char[test_size];

CUDA_SAFE_CALL(cudaMalloc((void**) &d_test, test_size*sizeof(char)));

...

cudaProcess<<<grid, block, sbytes>>>(d_test);

...

unsigned char* h_test = new unsigned char[test_size];

CUDA_SAFE_CALL(cudaMemcpy(h_test, d_test, test_size*sizeof(unsigned char), cudaMemcpyDeviceToHost));

for (int i= 0; i<test_size; i++) {  

    	fprintf(stderr, " Test %d \n",h_test[i]);

}    	

...

__global__ void cudaProcess(unsigned char * g_test)

{	

    int thid = threadIdx.x;

    g_test[thid]=5;

}

Now h_test should be an array full of 5s, and that's what should be printed. But I get random numbers, whether or not I modify g_test in the kernel.

Any ideas, or am I just too stupid?

Thx Picknick3r

A couple of things.

  1. You allocate memory for the pointer d_test twice: once on the host (with the new operator) and once on the device (by calling cudaMalloc). Since you don't call the delete operator in between, you leak the host memory.

  2. Your cudaProcess code will not initialize the entire array to 5s unless you launch only one block, and the block has 256 threads in the x-dimension. That's because currently threads with the same IDs but from different blocks will write to the same location. You can fix that for one-dimensional blocks with something like:

int thid = blockIdx.x*blockDim.x+threadIdx.x;
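For instance, a fixed version might look roughly like this (a sketch assuming a 1D grid of 1D blocks and passing the array length n so out-of-range threads do nothing):

__global__ void cudaProcess(unsigned char *g_test, int n)
{
    int thid = blockIdx.x*blockDim.x + threadIdx.x;
    if (thid < n)              // guard in case the grid covers more threads than elements
        g_test[thid] = 5;
}

// launched with enough blocks to cover the whole array, e.g.
// cudaProcess<<<(test_size + 255)/256, 256>>>(d_test, test_size);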

What are your grid, block, and sbytes values? When I run your (slightly modified) code, I get all 5s in the output. My modifications are:

  • changing new to malloc (new is a C++ operator, and on Win32 nvcc doesn't seem to link it easily).

  • not allocating host memory for d_test.

Code is included below:

Paulius

#include <stdio.h>

// _DEBUG must be defined before including cutil.h, otherwise CUDA_SAFE_CALL compiles to a bare call
#define _DEBUG

#include <cutil.h>

__global__ void cudaProcess(unsigned char * g_test)

{ 

   int thid = threadIdx.x;

   g_test[thid]=5;

}

int main()

{

    unsigned int test_size = 256;

    unsigned char *d_test=0;

    CUDA_SAFE_CALL(cudaMalloc((void**) &d_test, test_size*sizeof(unsigned char)));

   cudaProcess<<<1, 256, 0>>>(d_test);

   unsigned char *h_test = (unsigned char*)malloc(test_size*sizeof(unsigned char));

    CUDA_SAFE_CALL(cudaMemcpy(h_test, d_test, test_size*sizeof(unsigned char), cudaMemcpyDeviceToHost));

   for (int i= 0; i<test_size; i++)

    {  

        fprintf(stderr, " Test %d \n",h_test[i]);

    }

   free(h_test);

    CUDA_SAFE_CALL(cudaFree(d_test));

   return 1;

}

That seems to be the root of all evil ;-)

Runs like a charm now. Thanks a lot, paulius, you made my day.

Picknick3r

I have a similar problem to the one described above; my code is as follows:

__global__ void vecAdd( float *c, float *a, float *b )
{
int i = threadIdx.x;
c[i] = a[i] + b[i];
}

int main( int argc, char** argv)
{
CUT_DEVICE_INIT(argc, argv);

unsigned int N = 128;
unsigned int memsz = N*sizeof(float);

float* h_A = (float*) malloc( memsz );
float* h_B = (float*) malloc( memsz );
h_A[0] = 1.0; h_A[1] = 2.0; h_A[2] = 3.0; h_A[3] = 4.0;
h_B[0] = 2.0; h_B[1] = 3.0; h_B[2] = 4.0; h_B[3] = 5.0;

float *d_A;
float *d_B;
float *d_C;
CUDA_SAFE_CALL( cudaMalloc((void **) &d_A, memsz) );
CUDA_SAFE_CALL( cudaMalloc((void **) &d_B, memsz) );
CUDA_SAFE_CALL( cudaMalloc((void **) &d_C, memsz) );

CUDA_SAFE_CALL( cudaMemcpy(d_A, h_A, memsz,cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(d_B, h_B, memsz,cudaMemcpyHostToDevice) );
                        
vecAdd<<<1, N>>>( d_C, d_A, d_B );

// check if kernel execution generated an error
CUT_CHECK_ERROR("Kernel execution failed");

float* h_C = (float*) malloc( memsz );
// copy result from device to host
CUDA_SAFE_CALL( cudaMemcpy(h_C, d_C, memsz,cudaMemcpyDeviceToHost) );

for( int i = 0; i < 4; i++ )
fprintf(stderr, "%1.2f\t", h_C[i] );
	
// clean up memory
free(h_A);
free(h_B);
free(h_C);
CUDA_SAFE_CALL(cudaFree(d_A));
CUDA_SAFE_CALL(cudaFree(d_B));
CUDA_SAFE_CALL(cudaFree(d_C));

CUT_EXIT(argc, argv);

}