cudaMemcpy problem problem with using cudaMemcpy

Hi,

I am facing a strange problem with cudaMemcpy. please take a look at the following code

[codebox]#include

dim3 gl_BlockSize(256);

dim3 gl_GridSize(782);

// not source of the problem

inline device host float3 Chartofloat3 (unsigned char a, unsigned char b, unsigned char c){

float x;

x = (float)(a/255.0);

float y;

y = (float)(b/255.0);

float z; //=c/255.0;

z = (float)(c/255.0);

return make_float3(x, y, z);

}

// seems to cause problems when executed the second time in combination with cudaMemcpy

global void CharTofloat3Image(unsigned char *a, float3 *out, int pixels){

int i = (blockIdx.x * blockDim.x)+ threadIdx.x;

	if(i < pixels){

	out[i] = Chartofloat3(a[4*i], a[(4*i)+1], a[(4*i)+2]);

	}

}

void image4BTofloat3(unsigned char *src, float3 *out, int size){

unsigned char *d_uc1;

cudaMalloc ((void**)&d_uc1, size*sizeof(unsigned char));

std::cout << "image to f3 before MemcpyToDevice" << std::endl;

cudaMemcpy (d_uc1,src,size*sizeof(unsigned char), cudaMemcpyHostToDevice);

std::cout << "image to f3 after MemcpyToDevice" << std::endl;

CharTofloat3Image<<<gl_GridSize,gl_BlockSize>>>(d_uc1,out,size/3);

cudaFree(d_uc1);

}

// not source of the problem

inline device host unsigned char capedfloat3Touchar(float a){

unsigned char x;

if (a < 0.0){x = 0;}

if (a > 1.0){x = 255;}

else {

	x = 100;

}

return x;	

}

// seems to cause problems when executed the second time in combination with cudaMemcpy

global void f3ToucharImage(float3 *a, unsigned char *out, int pixels){

int i = (blockIdx.x * blockDim.x)+ threadIdx.x;	

if(i < pixels){

	float3 src = a[i];

	out[4*i] = capedfloat3Touchar(src.x);	

	out[(4*i)+1] = capedfloat3Touchar(src.y);

	out[(4*i)+2] = capedfloat3Touchar(src.z);

}

}

void f3ToImage4B(float3 *a, unsigned char *out, int size){

unsigned char *d_uc;

cudaMalloc ((void**)&d_uc,size*sizeof(unsigned char));

f3ToucharImage<<<gl_GridSize,gl_BlockSize>>>(a,d_uc,size/3);

cudaMemcpy (out, d_uc,size*sizeof(unsigned char), cudaMemcpyDeviceToHost);	

cudaFree(d_uc);

std::cout << "f3touchar fertig" << std::endl;

}

int main(){

int size = 600000;

unsigned char *h_uc;

h_uc = (unsigned char*) malloc(size);

unsigned char *h_uc2;

h_uc2 = (unsigned char*) malloc(size);

float3 *d_f3;

cudaMalloc((void**)&d_f3, size*sizeof(float3));

for(int n = 0; n < 5; n++){

for (int i = 0; i < size; i++){h_uc[i]= 'a';}

image4BTofloat3(h_uc, d_f3,size);

f3ToImage4B(d_f3,h_uc2,size);

std::cout << "loop index: " << n << std::endl;

}

free(h_uc);

free(h_uc2);

cudaFree(d_f3);

}[/codebox]

the code runs perfect the first time, and then crashes because of

nvcc error : ‘./“a.out”’ died due to signal 11 (Invalid memory reference)

the second time

if you remove or comment either the cudaMemcpy in f3ToImage4B or image4BTofloat3

and/or

global void f3ToucharImage

global void CharTofloat3Image

the code dosen’t crash. I am thankfull for every advice

Modify the function calls to:
void f3ToImage4B(float3 a, unsigned char* out, int size){
AND
void image4BTofloat3(unsigned char src, float3* out, int size){

ie. pass the address of the pointers, rather than just the pointers. This should fix your problems…

not problem of cudaMemcpy but problem of out of array bound in kernel function

modify your kernel function as

void image4BTofloat3(unsigned char *src, float3 *out, int size)

{	

	unsigned char *d_uc1;	

	cutilSafeCall( cudaMalloc((void**)&d_uc1, size*sizeof(unsigned char)) );	

	std::cout << "image to f3 before MemcpyToDevice" << std::endl;	

	cutilSafeCall( cudaMemcpy (d_uc1,src,size*sizeof(unsigned char), cudaMemcpyHostToDevice) );	

	std::cout << "image to f3 after MemcpyToDevice" << std::endl;	

	CharTofloat3Image<<<gl_GridSize,gl_BlockSize>>>(d_uc1, out, size/3);	

	cutilSafeCall( cudaFree(d_uc1) );

}

and

void f3ToImage4B(float3 *a, unsigned char *out, int size)

{	

	unsigned char *d_uc;	

	cutilSafeCall( cudaMalloc ((void**)&d_uc,size*sizeof(unsigned char)) );	

	f3ToucharImage<<<gl_GridSize,gl_BlockSize>>>(a,d_uc,size/3);	

	cutilSafeCall( cudaMemcpy (out, d_uc,size*sizeof(unsigned char), cudaMemcpyDeviceToHost) );	

	cutilSafeCall( cudaFree(d_uc) );	

	std::cout << "f3touchar fertig" << std::endl;

}

Then you will have obtain error message.

The out of array bound comes from stride = 4 in your kernel

I cannot understand why you use stride = 4 in your kerel

__global__ void CharTofloat3Image(unsigned char *a, float3 *out, int pixels)

{	

	int i = (blockIdx.x * blockDim.x)+ threadIdx.x;		

	if(i < pixels){		

		out[i] = Chartofloat3(a[4*i], a[(4*i)+1], a[(4*i)+2]);		

	}

}

why not use tride = 3 since type(out) = float3

__global__ void CharTofloat3Image(unsigned char *a, float3 *out, int pixels)

{	

	int i = (blockIdx.x * blockDim.x)+ threadIdx.x;		

	if(i < pixels){		

		out[i] = Chartofloat3(a[3*i], a[(3*i)+1], a[(3*i)+2]);		

	}

}

similarly, modify kernel “f3ToucharImage” as

__global__ void f3ToucharImage(float3 *a, unsigned char *out, int pixels)

{		

	int i = (blockIdx.x * blockDim.x)+ threadIdx.x;	

	if(i < pixels){		

		float3 src = a[i];		

		out[3*i] = capedfloat3Touchar(src.x);			

		out[(3*i)+1] = capedfloat3Touchar(src.y);		

		out[(3*i)+2] = capedfloat3Touchar(src.z);	

	}						

}

if you modify stride =3, then your kernel works

@ teju

thanx for your suggestion, but it dosen’t solve the problem just gives you one looprun more, so it is probably the right direction.

@LSChien

thanx for your solution, it realy makes the posted code run perfect. the posted code is just a little extraction of a bigger project, and when i try to apply your solution to the real project i surprisingly get the same old error

to your question why i use a stride of 4 and not of three.

i have to expand a programm written by somebody else. its all about evolutioionary imageprocessing. the chararry represents an image, and the base version of the programm uses 4 uchar to store one pixel, of witch i need just the first 3 of every pixel

new aproach to the problem:

can somebody tell me a reason why the following pice of code could possibly result in a crash(invalid memory accses), and why it does so just after the second time the code is executed, and why qorking with double pointers results in crashin the third time?

[codebox]void image4BTofloat3(unsigned char *src, float3 *out, int size){

unsigned char *d_uc;

cudaMalloc ((void**)&d_uc,size);

cudaMemcpy (d_uc, src,size, cudaMemcpyHostToDevice);

cudaFree(d_uc);

}[/codebox]

[quot]

the posted code is just a little extraction of a bigger project, and when i try to apply your solution to the real project

i surprisingly get the same old error

[/quot]

I think that you need to

(1) find out which kernel fails and

(2) check if out of array bound occurs in the kernel

sometimes it does not report error if you run out of array bound, but sometimes it would.

second, the host function “image4BTofloat3” does not invoke any kernel code,

void image4BTofloat3(unsigned char *src, float3 *out, int size)

{	

	unsigned char *d_uc;	

	cudaMalloc ((void**)&d_uc,size);	

	cudaMemcpy (d_uc, src,size, cudaMemcpyHostToDevice);	

	cudaFree(d_uc);

}

I cannot understand “why the following pice of code could possibly result in a crash(invalid memory accses)”.

@LSChien
thanx again for your fast and precise help.
three things:
i am aware that the latest posted code does not call a kernel, it resuts in a crash anyway.
i am not a native english speacker, and i am not sure what “out of array bond” means exactly. does it describe the situation that a kernel or funktion uses a memory area it is not supposed to use?
the “big” programm run without crashing if you remove the cudamemcpys

one more question
do you have any idea why it crashes the second time you try to execute the code, or the third time when you pass double pointer instead od normal pointers?

i am not a native english speacker, and i am not sure what “out of array bond” means exactly. does it describe the situation that a kernel or funktion uses a memory area it is not supposed to use?
Yes.
int a[2] = {0,0};
a[0] = 1; //Ok
a[1] = 2; //Ok
a[2] = 3; //Out of bounds. Bounds == Limits, if you understand that better. You have written outside of the arrays. If you’re lucky, this attempts to write to someone elses memory, and your program dies immediately. If you’re unlucky, you just wrote to adress of another of your variables. Chaos likely ensues.

I found a surprising solution: the following code works and produces the right results(and does not crash of course), if you add the cuda free it crashes the second time.

[codebox]void f3ToImage4B(float3 *a, Image4B *out){

unsigned char *d_uc;

cudaMalloc ((void**)&d_uc, out->size()*sizeof(unsigned char));

f3ToucharImage<<<gl_GridSize,gl_BlockSize>>>(a,d_uc,gl_Pixels);

dim3 grids = GetGridSize(gl_BlockSize,out->size());

uChar3TouChar4<<<grids ,gl_BlockSize>>>(d_uc,d_uc);

cudaMemcpy(out->charBuffer(),d_uc,out->size(),cudaMemcpyDeviceToHost);

//cudaFree(d_uc);

}[/codebox]

I am aware that it is unlikely that the cudaFree comand really causes this sort of trouble, and the root of all evil is probably somewhere else. Allthough the problem is (superficialy) solved I am still curious whats going on. Any speculations will be read with high interesst. Thanx @all who took time to help me.

“out of array bond” means that you access invalid array element.

for example, if you have array A[0:10], but in your kernel, if you access A[11], then it is “out of array bond”.

At this time, your kernel may crash.

I think that there may be hidden bug in your code,

there should not be any error if you use cudaFree(d_uc).

try following code, find location of error.

#include <cuda_runtime_api.h>

	#include <cutil.h>

	#include <cutil_inline.h>

void f3ToImage4B(float3 *a, Image4B *out)

{	

	unsigned char *d_uc;	

	cutilSafeCall( cudaMalloc ((void**)&d_uc, out->size()*sizeof(unsigned char)) );	

	f3ToucharImage<<<gl_GridSize,gl_BlockSize>>>(a,d_uc,gl_Pixels);	

	cutilCheckMsg("Kernel execution failed");

	dim3 grids = GetGridSize(gl_BlockSize,out->size());	

	uChar3TouChar4<<<grids ,gl_BlockSize>>>(d_uc,d_uc);	

	cutilCheckMsg("Kernel execution failed");

	cutilSafeCall( cudaMemcpy(out->charBuffer(),d_uc,out->size(),cudaMemcpyDeviceToHost) );	

	cutilSafeCall( cudaFree(d_uc) );

}

How can I use
#include <cuda_runtime_api.h>
#include <cutil.h>
#include <cutil_inline.h>

when I try to include them the compiler complains that he cannot find them. I am working under Linux

what is you makefile …

any ways imy make file is …

SDK_ROOT = /home/bibrak/NVIDIA_CUDA_SDK
CUDA_LIB_PATH = /usr/local/cuda/lib

INLCUDE = -I$(SDK_ROOT)/common/inc -I/usr/local/cuda/include

LIBS = -L$(SDK_ROOT)/lib -lcutil
LIBS += -L$(CUDA_LIB_PATH) -lcudart -lcuda

SRC_CU = test.cu

SRC_CXX =

all:
nvcc -run $(INLCUDE) $(LIBS) $(SRC_CU) $(SRC_CXX)


here test.cu will be your source file.

/home/bibrak —> will be your path were NVIDIA_CUDA_SDK is

i hope this might help

I take SDK 2.3 as an example

cutil.h and cutil_inline.h are in directory /usr/local/NVIDIA_GPU_Computing_SDK/C/common/inc

you need to link with library /usr/local/NVIDIA_GPU_Computing_SDK/C/lib/libcutil.a

cuda_runtime_api.h is in directory /usr/local/cuda/include

I am not sure if this problem was ever solved, but I encountered something very similar. The previous advice from LS is correct, there is most likely an out of bounds memory access. I am not sure why CUDA does not throw an error, but I was clearly accessing data out of bounds. The problem was the out of bounds access did not corrupt any of my other data. The only indication I had that something went wrong was that cudaFree took nearly one second to process.

Hopefully this is fixed in CUDA 3.0 or perhaps in OpenCL??

Happy hacking …

-Mike