cudaMemcpy problem problem with using cudaMemcpy

Prometheus80 · October 12, 2009, 4:48pm

Hi,

I am facing a strange problem with cudaMemcpy. please take a look at the following code

dim3 gl_BlockSize(256);

dim3 gl_GridSize(782);

// not source of the problem

inline device host float3 Chartofloat3 (unsigned char a, unsigned char b, unsigned char c){

float x;

x = (float)(a/255.0);

float y;

y = (float)(b/255.0);

float z; //=c/255.0;

z = (float)(c/255.0);

return make_float3(x, y, z);

}

// seems to cause problems when executed the second time in combination with cudaMemcpy

global void CharTofloat3Image(unsigned char *a, float3 *out, int pixels){

int i = (blockIdx.x * blockDim.x)+ threadIdx.x;

	if(i < pixels){

	out[i] = Chartofloat3(a[4*i], a[(4*i)+1], a[(4*i)+2]);

	}

}

void image4BTofloat3(unsigned char *src, float3 *out, int size){

unsigned char *d_uc1;

cudaMalloc ((void**)&d_uc1, size*sizeof(unsigned char));

std::cout << "image to f3 before MemcpyToDevice" << std::endl;

cudaMemcpy (d_uc1,src,size*sizeof(unsigned char), cudaMemcpyHostToDevice);

std::cout << "image to f3 after MemcpyToDevice" << std::endl;

CharTofloat3Image<<<gl_GridSize,gl_BlockSize>>>(d_uc1,out,size/3);

cudaFree(d_uc1);

}

// not source of the problem

inline device host unsigned char capedfloat3Touchar(float a){

unsigned char x;

if (a < 0.0){x = 0;}

if (a > 1.0){x = 255;}

else {

	x = 100;

}

return x;

}

// seems to cause problems when executed the second time in combination with cudaMemcpy

global void f3ToucharImage(float3 *a, unsigned char *out, int pixels){

int i = (blockIdx.x * blockDim.x)+ threadIdx.x;	

if(i < pixels){

	float3 src = a[i];

	out[4*i] = capedfloat3Touchar(src.x);	

	out[(4*i)+1] = capedfloat3Touchar(src.y);

	out[(4*i)+2] = capedfloat3Touchar(src.z);

}

}

void f3ToImage4B(float3 *a, unsigned char *out, int size){

unsigned char *d_uc;

cudaMalloc ((void**)&d_uc,size*sizeof(unsigned char));

f3ToucharImage<<<gl_GridSize,gl_BlockSize>>>(a,d_uc,size/3);

cudaMemcpy (out, d_uc,size*sizeof(unsigned char), cudaMemcpyDeviceToHost);	

cudaFree(d_uc);

std::cout << "f3touchar fertig" << std::endl;

}

int main(){

int size = 600000;

unsigned char *h_uc;

h_uc = (unsigned char*) malloc(size);

unsigned char *h_uc2;

h_uc2 = (unsigned char*) malloc(size);

float3 *d_f3;

cudaMalloc((void**)&d_f3, size*sizeof(float3));

for(int n = 0; n < 5; n++){

for (int i = 0; i < size; i++){h_uc[i]= 'a';}

image4BTofloat3(h_uc, d_f3,size);

f3ToImage4B(d_f3,h_uc2,size);

std::cout << "loop index: " << n << std::endl;

}

free(h_uc);

free(h_uc2);

cudaFree(d_f3);

}[/codebox]

the code runs perfect the first time, and then crashes because of

nvcc error : ‘./“a.out”’ died due to signal 11 (Invalid memory reference)

the second time

if you remove or comment either the cudaMemcpy in f3ToImage4B or image4BTofloat3

and/or

global void f3ToucharImage

global void CharTofloat3Image

the code dosen’t crash. I am thankfull for every advice

_teju · October 12, 2009, 5:02pm

Modify the function calls to:
void f3ToImage4B(float3 a, unsigned char* out, int size){
AND
void image4BTofloat3(unsigned char src, float3* out, int size){

ie. pass the address of the pointers, rather than just the pointers. This should fix your problems…

LSChien · October 13, 2009, 12:23am

not problem of cudaMemcpy but problem of out of array bound in kernel function

modify your kernel function as

void image4BTofloat3(unsigned char *src, float3 *out, int size)

{	

	unsigned char *d_uc1;	

	cutilSafeCall( cudaMalloc((void**)&d_uc1, size*sizeof(unsigned char)) );	

	std::cout << "image to f3 before MemcpyToDevice" << std::endl;	

	cutilSafeCall( cudaMemcpy (d_uc1,src,size*sizeof(unsigned char), cudaMemcpyHostToDevice) );	

	std::cout << "image to f3 after MemcpyToDevice" << std::endl;	

	CharTofloat3Image<<<gl_GridSize,gl_BlockSize>>>(d_uc1, out, size/3);	

	cutilSafeCall( cudaFree(d_uc1) );

}

and

void f3ToImage4B(float3 *a, unsigned char *out, int size)

{	

	unsigned char *d_uc;	

	cutilSafeCall( cudaMalloc ((void**)&d_uc,size*sizeof(unsigned char)) );	

	f3ToucharImage<<<gl_GridSize,gl_BlockSize>>>(a,d_uc,size/3);	

	cutilSafeCall( cudaMemcpy (out, d_uc,size*sizeof(unsigned char), cudaMemcpyDeviceToHost) );	

	cutilSafeCall( cudaFree(d_uc) );	

	std::cout << "f3touchar fertig" << std::endl;

}

Then you will have obtain error message.

The out of array bound comes from stride = 4 in your kernel

I cannot understand why you use stride = 4 in your kerel

__global__ void CharTofloat3Image(unsigned char *a, float3 *out, int pixels)

{	

	int i = (blockIdx.x * blockDim.x)+ threadIdx.x;		

	if(i < pixels){		

		out[i] = Chartofloat3(a[4*i], a[(4*i)+1], a[(4*i)+2]);		

	}

}

why not use tride = 3 since type(out) = float3

__global__ void CharTofloat3Image(unsigned char *a, float3 *out, int pixels)

{	

	int i = (blockIdx.x * blockDim.x)+ threadIdx.x;		

	if(i < pixels){		

		out[i] = Chartofloat3(a[3*i], a[(3*i)+1], a[(3*i)+2]);		

	}

}

similarly, modify kernel “f3ToucharImage” as

__global__ void f3ToucharImage(float3 *a, unsigned char *out, int pixels)

{		

	int i = (blockIdx.x * blockDim.x)+ threadIdx.x;	

	if(i < pixels){		

		float3 src = a[i];		

		out[3*i] = capedfloat3Touchar(src.x);			

		out[(3*i)+1] = capedfloat3Touchar(src.y);		

		out[(3*i)+2] = capedfloat3Touchar(src.z);	

	}						

}

if you modify stride =3, then your kernel works

Prometheus80 · October 13, 2009, 2:13pm

@ teju

thanx for your suggestion, but it dosen’t solve the problem just gives you one looprun more, so it is probably the right direction.

@LSChien

thanx for your solution, it realy makes the posted code run perfect. the posted code is just a little extraction of a bigger project, and when i try to apply your solution to the real project i surprisingly get the same old error

to your question why i use a stride of 4 and not of three.

i have to expand a programm written by somebody else. its all about evolutioionary imageprocessing. the chararry represents an image, and the base version of the programm uses 4 uchar to store one pixel, of witch i need just the first 3 of every pixel

new aproach to the problem:

can somebody tell me a reason why the following pice of code could possibly result in a crash(invalid memory accses), and why it does so just after the second time the code is executed, and why qorking with double pointers results in crashin the third time?

[codebox]void image4BTofloat3(unsigned char *src, float3 *out, int size){

unsigned char *d_uc;

cudaMalloc ((void**)&d_uc,size);

cudaMemcpy (d_uc, src,size, cudaMemcpyHostToDevice);

cudaFree(d_uc);

}[/codebox]

LSChien · October 13, 2009, 3:06pm

[quot]

the posted code is just a little extraction of a bigger project, and when i try to apply your solution to the real project

i surprisingly get the same old error

[/quot]

I think that you need to

(1) find out which kernel fails and

(2) check if out of array bound occurs in the kernel

sometimes it does not report error if you run out of array bound, but sometimes it would.

second, the host function “image4BTofloat3” does not invoke any kernel code,

void image4BTofloat3(unsigned char *src, float3 *out, int size)

{	

	unsigned char *d_uc;	

	cudaMalloc ((void**)&d_uc,size);	

	cudaMemcpy (d_uc, src,size, cudaMemcpyHostToDevice);	

	cudaFree(d_uc);

}

I cannot understand “why the following pice of code could possibly result in a crash(invalid memory accses)”.

Prometheus80 · October 13, 2009, 4:00pm

@LSChien
thanx again for your fast and precise help.
three things:
i am aware that the latest posted code does not call a kernel, it resuts in a crash anyway.
i am not a native english speacker, and i am not sure what “out of array bond” means exactly. does it describe the situation that a kernel or funktion uses a memory area it is not supposed to use?
the “big” programm run without crashing if you remove the cudamemcpys

one more question
do you have any idea why it crashes the second time you try to execute the code, or the third time when you pass double pointer instead od normal pointers?

Letharion · October 14, 2009, 7:59am

i am not a native english speacker, and i am not sure what “out of array bond” means exactly. does it describe the situation that a kernel or funktion uses a memory area it is not supposed to use?
Yes.
int a[2] = {0,0};
a[0] = 1; //Ok
a[1] = 2; //Ok
a[2] = 3; //Out of bounds. Bounds == Limits, if you understand that better. You have written outside of the arrays. If you’re lucky, this attempts to write to someone elses memory, and your program dies immediately. If you’re unlucky, you just wrote to adress of another of your variables. Chaos likely ensues.

Prometheus80 · October 14, 2009, 8:50am

I found a surprising solution: the following code works and produces the right results(and does not crash of course), if you add the cuda free it crashes the second time.

[codebox]void f3ToImage4B(float3 *a, Image4B *out){

unsigned char *d_uc;

cudaMalloc ((void**)&d_uc, out->size()*sizeof(unsigned char));

f3ToucharImage<<<gl_GridSize,gl_BlockSize>>>(a,d_uc,gl_Pixels);

dim3 grids = GetGridSize(gl_BlockSize,out->size());

uChar3TouChar4<<<grids ,gl_BlockSize>>>(d_uc,d_uc);

cudaMemcpy(out->charBuffer(),d_uc,out->size(),cudaMemcpyDeviceToHost);

//cudaFree(d_uc);

}[/codebox]

I am aware that it is unlikely that the cudaFree comand really causes this sort of trouble, and the root of all evil is probably somewhere else. Allthough the problem is (superficialy) solved I am still curious whats going on. Any speculations will be read with high interesst. Thanx @all who took time to help me.

LSChien · October 14, 2009, 9:08am

“out of array bond” means that you access invalid array element.

for example, if you have array A[0:10], but in your kernel, if you access A[11], then it is “out of array bond”.

At this time, your kernel may crash.

I think that there may be hidden bug in your code,

there should not be any error if you use cudaFree(d_uc).

try following code, find location of error.

#include <cuda_runtime_api.h>

	#include <cutil.h>

	#include <cutil_inline.h>

void f3ToImage4B(float3 *a, Image4B *out)

{	

	unsigned char *d_uc;	

	cutilSafeCall( cudaMalloc ((void**)&d_uc, out->size()*sizeof(unsigned char)) );	

	f3ToucharImage<<<gl_GridSize,gl_BlockSize>>>(a,d_uc,gl_Pixels);	

	cutilCheckMsg("Kernel execution failed");

	dim3 grids = GetGridSize(gl_BlockSize,out->size());	

	uChar3TouChar4<<<grids ,gl_BlockSize>>>(d_uc,d_uc);	

	cutilCheckMsg("Kernel execution failed");

	cutilSafeCall( cudaMemcpy(out->charBuffer(),d_uc,out->size(),cudaMemcpyDeviceToHost) );	

	cutilSafeCall( cudaFree(d_uc) );

}

Prometheus80 · October 14, 2009, 12:19pm

How can I use
#include <cuda_runtime_api.h>
#include <cutil.h>
#include <cutil_inline.h>

when I try to include them the compiler complains that he cannot find them. I am working under Linux

biebo · October 14, 2009, 1:00pm

what is you makefile …

any ways imy make file is …

SDK_ROOT = /home/bibrak/NVIDIA_CUDA_SDK
CUDA_LIB_PATH = /usr/local/cuda/lib

INLCUDE = -I$(SDK_ROOT)/common/inc -I/usr/local/cuda/include

LIBS = -L$(SDK_ROOT)/lib -lcutil
LIBS += -L$(CUDA_LIB_PATH) -lcudart -lcuda

SRC_CU = test.cu

SRC_CXX =

all:
nvcc -run $(INLCUDE) $(LIBS) $(SRC_CU) $(SRC_CXX)

here test.cu will be your source file.

/home/bibrak —> will be your path were NVIDIA_CUDA_SDK is

i hope this might help

LSChien · October 14, 2009, 1:07pm

I take SDK 2.3 as an example

cutil.h and cutil_inline.h are in directory /usr/local/NVIDIA_GPU_Computing_SDK/C/common/inc

you need to link with library /usr/local/NVIDIA_GPU_Computing_SDK/C/lib/libcutil.a

cuda_runtime_api.h is in directory /usr/local/cuda/include

n00b4Life · January 5, 2010, 4:35am

I am not sure if this problem was ever solved, but I encountered something very similar. The previous advice from LS is correct, there is most likely an out of bounds memory access. I am not sure why CUDA does not throw an error, but I was clearly accessing data out of bounds. The problem was the out of bounds access did not corrupt any of my other data. The only indication I had that something went wrong was that cudaFree took nearly one second to process.

Hopefully this is fixed in CUDA 3.0 or perhaps in OpenCL??

Happy hacking …

-Mike

Hi,

I am facing a strange problem with cudaMemcpy. please take a look at the following code

[codebox]include

dim3 gl_BlockSize(256);

dim3 gl_GridSize(782);

// not source of the problem

inline device host float3 Chartofloat3 (unsigned char a, unsigned char b, unsigned char c){
float x;

x = (float)(a/255.0);

float y;

y = (float)(b/255.0);

float z; //=c/255.0;

z = (float)(c/255.0);

return make_float3(x, y, z);
}

// seems to cause problems when executed the second time in combination with cudaMemcpy

global void CharTofloat3Image(unsigned char *a, float3 *out, int pixels){
int i = (blockIdx.x * blockDim.x)+ threadIdx.x;

	if(i < pixels){

	out[i] = Chartofloat3(a[4*i], a[(4*i)+1], a[(4*i)+2]);

	}
}

void image4BTofloat3(unsigned char *src, float3 *out, int size){
unsigned char *d_uc1;

cudaMalloc ((void**)&d_uc1, size*sizeof(unsigned char));

std::cout << "image to f3 before MemcpyToDevice" << std::endl;

cudaMemcpy (d_uc1,src,size*sizeof(unsigned char), cudaMemcpyHostToDevice);

std::cout << "image to f3 after MemcpyToDevice" << std::endl;

CharTofloat3Image<<<gl_GridSize,gl_BlockSize>>>(d_uc1,out,size/3);

cudaFree(d_uc1);
}

// not source of the problem

inline device host unsigned char capedfloat3Touchar(float a){
unsigned char x;

if (a < 0.0){x = 0;}

if (a > 1.0){x = 255;}

else {

	x = 100;

}

return x;	
}

// seems to cause problems when executed the second time in combination with cudaMemcpy

global void f3ToucharImage(float3 *a, unsigned char *out, int pixels){
int i = (blockIdx.x * blockDim.x)+ threadIdx.x;	

if(i < pixels){

	float3 src = a[i];

	out[4*i] = capedfloat3Touchar(src.x);	

	out[(4*i)+1] = capedfloat3Touchar(src.y);

	out[(4*i)+2] = capedfloat3Touchar(src.z);

}
}

void f3ToImage4B(float3 *a, unsigned char *out, int size){
unsigned char *d_uc;

cudaMalloc ((void**)&d_uc,size*sizeof(unsigned char));

f3ToucharImage<<<gl_GridSize,gl_BlockSize>>>(a,d_uc,size/3);

cudaMemcpy (out, d_uc,size*sizeof(unsigned char), cudaMemcpyDeviceToHost);	

cudaFree(d_uc);

std::cout << "f3touchar fertig" << std::endl;
}

int main(){
int size = 600000;

unsigned char *h_uc;

h_uc = (unsigned char*) malloc(size);

unsigned char *h_uc2;

h_uc2 = (unsigned char*) malloc(size);

float3 *d_f3;

cudaMalloc((void**)&d_f3, size*sizeof(float3));

for(int n = 0; n < 5; n++){

for (int i = 0; i < size; i++){h_uc[i]= 'a';}

image4BTofloat3(h_uc, d_f3,size);

f3ToImage4B(d_f3,h_uc2,size);

std::cout << "loop index: " << n << std::endl;

}

free(h_uc);

free(h_uc2);

cudaFree(d_f3);
}[/codebox]

the code runs perfect the first time, and then crashes because of

nvcc error : ‘./“a.out”’ died due to signal 11 (Invalid memory reference)

the second time

if you remove or comment either the cudaMemcpy in f3ToImage4B or image4BTofloat3

and/or

global void f3ToucharImage

global void CharTofloat3Image

the code dosen’t crash. I am thankfull for every advice

Topic		Replies	Views
cudaMemcpy() returns success and copy incorrect data CUDA Programming and Performance	3	2134	March 4, 2017
cudaMemcpy error, all data not being transferred. but cudaMemcpy returns cudaSuccess CUDA Programming and Performance	5	15213	August 7, 2019
CUDA 2.1 beta CUDA Programming and Performance	49	67161	December 3, 2008
strange error in summation memory problems CUDA Programming and Performance	14	16609	July 21, 2010
Troubles converting float array to float2 array CUDA Programming and Performance	11	3519	July 3, 2017
Getting an Error Using CudaMalloc3d CUDA Programming and Performance	10	2476	December 10, 2015
starting cuda function from a .cc file CUDA Programming and Performance	17	6981	January 3, 2010
Concurrent Kernel executions & Data Transfers CUDA Programming and Performance cuda	3	597	March 8, 2023
Second call to cudaMemcpy3D crashes my app First call is successful CUDA Programming and Performance	14	8807	November 30, 2010
Error when attempting to use cudaMemCpy() CUDA Programming and Performance	5	53	August 9, 2024

cudaMemcpy problem problem with using cudaMemcpy

any ways imy make file is …

Related topics