Hi,
I am facing a strange problem with cudaMemcpy. please take a look at the following code
[codebox]#include
dim3 gl_BlockSize(256);
dim3 gl_GridSize(782);
// not source of the problem
inline device host float3 Chartofloat3 (unsigned char a, unsigned char b, unsigned char c){
float x;
x = (float)(a/255.0);
float y;
y = (float)(b/255.0);
float z; //=c/255.0;
z = (float)(c/255.0);
return make_float3(x, y, z);
}
// seems to cause problems when executed the second time in combination with cudaMemcpy
global void CharTofloat3Image(unsigned char *a, float3 *out, int pixels){
int i = (blockIdx.x * blockDim.x)+ threadIdx.x;
if(i < pixels){
out[i] = Chartofloat3(a[4*i], a[(4*i)+1], a[(4*i)+2]);
}
}
void image4BTofloat3(unsigned char *src, float3 *out, int size){
unsigned char *d_uc1;
cudaMalloc ((void**)&d_uc1, size*sizeof(unsigned char));
std::cout << "image to f3 before MemcpyToDevice" << std::endl;
cudaMemcpy (d_uc1,src,size*sizeof(unsigned char), cudaMemcpyHostToDevice);
std::cout << "image to f3 after MemcpyToDevice" << std::endl;
CharTofloat3Image<<<gl_GridSize,gl_BlockSize>>>(d_uc1,out,size/3);
cudaFree(d_uc1);
}
// not source of the problem
inline device host unsigned char capedfloat3Touchar(float a){
unsigned char x;
if (a < 0.0){x = 0;}
if (a > 1.0){x = 255;}
else {
x = 100;
}
return x;
}
// seems to cause problems when executed the second time in combination with cudaMemcpy
global void f3ToucharImage(float3 *a, unsigned char *out, int pixels){
int i = (blockIdx.x * blockDim.x)+ threadIdx.x;
if(i < pixels){
float3 src = a[i];
out[4*i] = capedfloat3Touchar(src.x);
out[(4*i)+1] = capedfloat3Touchar(src.y);
out[(4*i)+2] = capedfloat3Touchar(src.z);
}
}
void f3ToImage4B(float3 *a, unsigned char *out, int size){
unsigned char *d_uc;
cudaMalloc ((void**)&d_uc,size*sizeof(unsigned char));
f3ToucharImage<<<gl_GridSize,gl_BlockSize>>>(a,d_uc,size/3);
cudaMemcpy (out, d_uc,size*sizeof(unsigned char), cudaMemcpyDeviceToHost);
cudaFree(d_uc);
std::cout << "f3touchar fertig" << std::endl;
}
int main(){
int size = 600000;
unsigned char *h_uc;
h_uc = (unsigned char*) malloc(size);
unsigned char *h_uc2;
h_uc2 = (unsigned char*) malloc(size);
float3 *d_f3;
cudaMalloc((void**)&d_f3, size*sizeof(float3));
for(int n = 0; n < 5; n++){
for (int i = 0; i < size; i++){h_uc[i]= 'a';}
image4BTofloat3(h_uc, d_f3,size);
f3ToImage4B(d_f3,h_uc2,size);
std::cout << "loop index: " << n << std::endl;
}
free(h_uc);
free(h_uc2);
cudaFree(d_f3);
}[/codebox]
the code runs perfect the first time, and then crashes because of
nvcc error : ‘./“a.out”’ died due to signal 11 (Invalid memory reference)
the second time
if you remove or comment either the cudaMemcpy in f3ToImage4B or image4BTofloat3
and/or
global void f3ToucharImage
global void CharTofloat3Image
the code dosen’t crash. I am thankfull for every advice