I have used this function so many times, but this time I don't know why it doesn't work.
In EmuRelease mode the code works fine and I get the right result.
But when I switch to Release mode, it seems the data is never actually copied back from the device: the host buffer still holds the same values I initialized it with.
I am wondering if anybody can give me a clue about how to solve this problem.
I never ran into this problem before, even though I have already used this function in several projects. I compared this code with all the successful projects, but I couldn't find any difference.
I have the same problem. I want to copy an integer array from the device to the host with cudaMemcpy. Everything works fine in emu mode, but when I compile a release build, nothing is copied. At first I suspected the debug mode and the CUDA_SAFE_CALL macro, but all it does (as far as I understand) is check for errors and print them to stderr if something goes wrong.
So I don't know what else to do. Maybe it's a general problem/bug in CUDA. Any help is appreciated.
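For reference, an error-checking wrapper in the spirit of CUDA_SAFE_CALL usually looks roughly like the sketch below (the exact macro in the SDK's cutil.h may differ; the name CUDA_CHECK is mine):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Minimal sketch: run the call, and if it did not return cudaSuccess,
   print the error string with file/line information and bail out. */
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",              \
                    __FILE__, __LINE__, cudaGetErrorString(err));     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

So the wrapper itself should not change what gets copied; it only reports failures.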
Sorry, but I am really lost here. I still have the problem and I have checked everything for errors. Here is my code. Am I doing something wrong?
unsigned int test_size = 256;
unsigned char * d_test= new unsigned char[test_size];
CUDA_SAFE_CALL(cudaMalloc((void**) &d_test, test_size*sizeof(char)));
...
cudaProcess<<<grid, block, sbytes>>>(d_test);
...
unsigned char* h_test = new unsigned char[test_size];
CUDA_SAFE_CALL(cudaMemcpy(h_test, d_test, test_size*sizeof(unsigned char), cudaMemcpyDeviceToHost));
for (int i= 0; i<test_size; i++) {
fprintf(stderr, " Test %d \n",h_test[i]);
}
...
__global__ void cudaProcess(unsigned char * g_test)
{
int thid = threadIdx.x;
g_test[thid]=5;
}
Now h_test should be an array full of 5s, and that is what should be printed. But I get random numbers, whether or not I modify g_test in the kernel.
You allocate memory for the pointer d_test twice: once on the host (with the new operator) and once on the device (by calling cudaMalloc). Since you never call delete[] on the host allocation before cudaMalloc overwrites the pointer, you leak that host memory.
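In other words, keep separate pointers for host and device memory. A minimal sketch of the usual pattern (buffer names are just illustrative):

unsigned int test_size = 256;

/* Host buffer: allocated with new (or malloc), released with delete[] (or free). */
unsigned char *h_test = new unsigned char[test_size];

/* Device buffer: allocated only with cudaMalloc, released with cudaFree. */
unsigned char *d_test = 0;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_test, test_size * sizeof(unsigned char)));

/* ... launch the kernel, then copy the result back ... */
CUDA_SAFE_CALL(cudaMemcpy(h_test, d_test, test_size * sizeof(unsigned char),
                          cudaMemcpyDeviceToHost));

cudaFree(d_test);
delete[] h_test;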
Your cudaProcess kernel will not initialize the entire array to 5s unless you launch only one block and that block has 256 threads in the x-dimension. That is because, as written, threads with the same thread ID in different blocks write to the same location. For one-dimensional blocks you can fix that with something like:
int thid = blockIdx.x*blockDim.x+threadIdx.x;
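Put together, the kernel could look like the sketch below. Note that I also pass the array length as an extra parameter and add a bounds check, which is not in the original code but protects you if the launch configuration creates more threads than there are elements:

__global__ void cudaProcess(unsigned char *g_test, unsigned int n)
{
    /* Global thread index across all blocks of a 1D grid. */
    unsigned int thid = blockIdx.x * blockDim.x + threadIdx.x;

    /* Only threads that map to a valid element write anything. */
    if (thid < n)
        g_test[thid] = 5;
}

The launch then becomes cudaProcess<<<grid, block, sbytes>>>(d_test, test_size);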
What are your grid, block, and sbytes values? When I run your (slightly modified) code, I get all 5s in the output. My modifications are:
changing new to malloc (new is a C++ operator, and on Win32 nvcc doesn't seem to link it easily).
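For example, the host allocation written with malloc instead of new (just a sketch of that substitution, not the exact code I ran):

#include <stdlib.h>

unsigned char *h_test = (unsigned char*)malloc(test_size * sizeof(unsigned char));
/* ... use h_test as before ... */
free(h_test);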