Cannot correctly set a value in device memory: strange error with four statements, removing any three makes it work

I’m new to CUDA. Until recently I used HLSL, and this code worked well in HLSL, but when I ported it to CUDA I ran into several strange errors.
Can anyone help me?
The following function is the first one that failed. It is something like ray casting, but I want to store the attenuation of the rays in 16 DCT coefficients. I check the depth by converting it to an image and rendering it. I find that when I remove any three of the four statements in caldctcoeffs(), it renders correctly; otherwise the image is all dark (all zero). Even if I set a non-zero value at the end of attenMap_kernel(), it is still dark.

// This is the kernel that fails.
//
// Casts one ray per map pixel from lightPos through the box
// [d_box_min, d_box_max], marches attenSteps samples along it, accumulates
// the attenuation and projects it onto 16 DCT coefficients.
//
// Outputs, all indexed by y*mapWidth + x:
//   depth_map, near_map, steplen_map : one float per pixel
//   dct_coeffs_map1                  : 16 consecutive planes of
//                                      mapWidth*mapHeight floats; plane k
//                                      receives coefficient k
//
// Threads whose pixel lies outside the map, or whose ray misses the box,
// return without writing anything.
//
// NOTE(review): the reported symptom (every output stays zero, even for an
// unconditional store) would be consistent with the launch itself failing,
// e.g. because the kernel needs more registers than the chosen block size
// allows. Presumably the host code does not check cudaGetLastError() after
// the launch -- confirm there before changing the kernel.
__global__
void attenMap_kernel(float* depth_map, float* near_map, float* steplen_map, float* dct_coeffs_map1)
{
    uint x = d_map_xoffset + blockIdx.x*blockDim.x + threadIdx.x;
    uint y = d_map_yoffset + blockIdx.y*blockDim.y + threadIdx.y;
    if ((x >= mapWidth) || (y >= mapHeight)) return;

    // Pixel position relative to the map centre, v pointing up.
    float u = x - mapWidth / 2.0f;
    float v = mapHeight / 2.0f - y;
    Ray eyeRay;
    eyeRay.o = lightPos;
    eyeRay.d = normalize(make_float3(u, v, -mapFocal));
    eyeRay.d = mul(light_invMatrix, eyeRay.d);

    float tnear, tfar;
    int hit = intersectBox(eyeRay, d_box_min, d_box_max, &tnear, &tfar);
    if (!hit) return;
    if (tnear < 0.0f) tnear = 0.0f;     // clamp to near plane
    float tstep = (tfar - tnear)/attenSteps;

    float3 pos   = eyeRay.o + eyeRay.d*tnear;
    float3 step  = eyeRay.d*tstep;
    float  depth = tnear;

    float4 outcoeff1 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    float4 outcoeff2 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    float4 outcoeff3 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    float4 outcoeff4 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);

    float tmp_atten = 1.0f;
    const float pi = 3.1415926535f;

    for (int i = 0; i < attenSteps; i++)
    {
        // Sample position normalised to the box extent.
        float3 coord = (pos - d_box_min)/d_box_size;
        uint occupyed = (uint)tex3D(tex_occupy, coord.x, coord.y, coord.z);
        if (occupyed > 127)
            break;

        // Angle (2i+1)*pi / (2*attenSteps) for sample i.
        float angle = (2.0f*(float)i + 1.0f)*pi/(2.0f*(float)attenSteps);
        float dat = tex3D(tex_data, coord.x, coord.y, coord.z);
        tmp_atten = tmp_atten * expf(-dat*tstep*dat);
        float dis = plattcoeffa + plattcoeffb * depth;
        float att_intensity = tmp_atten / (dis*dis);

        pos = pos + step;
        depth = depth + tstep;
        caldctcoeffs(&outcoeff1, &outcoeff2, &outcoeff3, &outcoeff4, att_intensity, angle);
    }

    // Flat pixel index, computed once and shared by every output below.
    const uint pixel = y*mapWidth + x;
    depth_map[pixel]   = (depth - 3.0f*1.732f) * 0.3f;
    near_map[pixel]    = tnear;
    steplen_map[pixel] = tstep;

    uint map_size = mapWidth*mapHeight;

    dct_coeffs_map1[pixel              ] = outcoeff1.x;
    dct_coeffs_map1[pixel + map_size   ] = outcoeff1.y;
    dct_coeffs_map1[pixel + map_size*2 ] = outcoeff1.z;
    dct_coeffs_map1[pixel + map_size*3 ] = outcoeff1.w;
    dct_coeffs_map1[pixel + map_size*4 ] = outcoeff2.x;
    dct_coeffs_map1[pixel + map_size*5 ] = outcoeff2.y;
    dct_coeffs_map1[pixel + map_size*6 ] = outcoeff2.z;
    dct_coeffs_map1[pixel + map_size*7 ] = outcoeff2.w;
    dct_coeffs_map1[pixel + map_size*8 ] = outcoeff3.x;
    dct_coeffs_map1[pixel + map_size*9 ] = outcoeff3.y;
    dct_coeffs_map1[pixel + map_size*10] = outcoeff3.z;
    dct_coeffs_map1[pixel + map_size*11] = outcoeff3.w;
    dct_coeffs_map1[pixel + map_size*12] = outcoeff4.x;
    dct_coeffs_map1[pixel + map_size*13] = outcoeff4.y;
    dct_coeffs_map1[pixel + map_size*14] = outcoeff4.z;
    dct_coeffs_map1[pixel + map_size*15] = outcoeff4.w;
}

// Removing any three of the four statements in this function makes it work.
//
// Adds one sample's contribution to the 16 running DCT coefficients.
//   coeff1..coeff4 : accumulators, updated in place through the pointers;
//                    coeff1 holds terms 0-3, coeff2 terms 4-7,
//                    coeff3 terms 8-11, coeff4 terms 12-15
//   att_intensity  : weight applied to every term of this sample
//   angle          : base angle; term k uses cosf(k * angle)
// Term 0 is scaled by u0, every other term by u1.
inline __device__
void caldctcoeffs(float4* coeff1, float4* coeff2, float4* coeff3, float4* coeff4,
                  float att_intensity, float angle)
{
    // The accumulators must be dereferenced: "coeff1 += ..." would act on the
    // pointer itself, not on the float4 it points to.
    *coeff1 += att_intensity * make_float4(
        u0 * cosf(0.0f),
        u1 * cosf(angle),
        u1 * cosf(2.0f*angle),
        u1 * cosf(3.0f*angle));

    // Terms 4-15 share the same scalar factor.
    const float weight = att_intensity * u1;

    *coeff2 += weight * make_float4(
        cosf(4.0f*angle),
        cosf(5.0f*angle),
        cosf(6.0f*angle),
        cosf(7.0f*angle));
    *coeff3 += weight * make_float4(
        cosf(8.0f*angle),
        cosf(9.0f*angle),
        cosf(10.0f*angle),
        cosf(11.0f*angle));
    *coeff4 += weight * make_float4(
        cosf(12.0f*angle),
        cosf(13.0f*angle),
        cosf(14.0f*angle),
        cosf(15.0f*angle));
}

I’m new to CUDA. Until recently I used HLSL, and this code worked well in HLSL, but when I ported it to CUDA I ran into several strange errors.
Can anyone help me?
The following function is the first one that failed. It is something like ray casting, but I want to store the attenuation of the rays in 16 DCT coefficients. I check the depth by converting it to an image and rendering it. I find that when I remove any three of the four statements in caldctcoeffs(), it renders correctly; otherwise the image is all dark (all zero). Even if I set a non-zero value at the end of attenMap_kernel(), it is still dark.

// This is the kernel that fails.
//
// Casts one ray per map pixel from lightPos through the box
// [d_box_min, d_box_max], marches attenSteps samples along it, accumulates
// the attenuation and projects it onto 16 DCT coefficients.
//
// Outputs, all indexed by y*mapWidth + x:
//   depth_map, near_map, steplen_map : one float per pixel
//   dct_coeffs_map1                  : 16 consecutive planes of
//                                      mapWidth*mapHeight floats; plane k
//                                      receives coefficient k
//
// Threads whose pixel lies outside the map, or whose ray misses the box,
// return without writing anything.
//
// NOTE(review): the reported symptom (every output stays zero, even for an
// unconditional store) would be consistent with the launch itself failing,
// e.g. because the kernel needs more registers than the chosen block size
// allows. Presumably the host code does not check cudaGetLastError() after
// the launch -- confirm there before changing the kernel.
__global__
void attenMap_kernel(float* depth_map, float* near_map, float* steplen_map, float* dct_coeffs_map1)
{
    uint x = d_map_xoffset + blockIdx.x*blockDim.x + threadIdx.x;
    uint y = d_map_yoffset + blockIdx.y*blockDim.y + threadIdx.y;
    if ((x >= mapWidth) || (y >= mapHeight)) return;

    // Pixel position relative to the map centre, v pointing up.
    float u = x - mapWidth / 2.0f;
    float v = mapHeight / 2.0f - y;
    Ray eyeRay;
    eyeRay.o = lightPos;
    eyeRay.d = normalize(make_float3(u, v, -mapFocal));
    eyeRay.d = mul(light_invMatrix, eyeRay.d);

    float tnear, tfar;
    int hit = intersectBox(eyeRay, d_box_min, d_box_max, &tnear, &tfar);
    if (!hit) return;
    if (tnear < 0.0f) tnear = 0.0f;     // clamp to near plane
    float tstep = (tfar - tnear)/attenSteps;

    float3 pos   = eyeRay.o + eyeRay.d*tnear;
    float3 step  = eyeRay.d*tstep;
    float  depth = tnear;

    float4 outcoeff1 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    float4 outcoeff2 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    float4 outcoeff3 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    float4 outcoeff4 = make_float4(0.0f, 0.0f, 0.0f, 0.0f);

    float tmp_atten = 1.0f;
    const float pi = 3.1415926535f;

    for (int i = 0; i < attenSteps; i++)
    {
        // Sample position normalised to the box extent.
        float3 coord = (pos - d_box_min)/d_box_size;
        uint occupyed = (uint)tex3D(tex_occupy, coord.x, coord.y, coord.z);
        if (occupyed > 127)
            break;

        // Angle (2i+1)*pi / (2*attenSteps) for sample i.
        float angle = (2.0f*(float)i + 1.0f)*pi/(2.0f*(float)attenSteps);
        float dat = tex3D(tex_data, coord.x, coord.y, coord.z);
        tmp_atten = tmp_atten * expf(-dat*tstep*dat);
        float dis = plattcoeffa + plattcoeffb * depth;
        float att_intensity = tmp_atten / (dis*dis);

        pos = pos + step;
        depth = depth + tstep;
        caldctcoeffs(&outcoeff1, &outcoeff2, &outcoeff3, &outcoeff4, att_intensity, angle);
    }

    // Flat pixel index, computed once and shared by every output below.
    const uint pixel = y*mapWidth + x;
    depth_map[pixel]   = (depth - 3.0f*1.732f) * 0.3f;
    near_map[pixel]    = tnear;
    steplen_map[pixel] = tstep;

    uint map_size = mapWidth*mapHeight;

    dct_coeffs_map1[pixel              ] = outcoeff1.x;
    dct_coeffs_map1[pixel + map_size   ] = outcoeff1.y;
    dct_coeffs_map1[pixel + map_size*2 ] = outcoeff1.z;
    dct_coeffs_map1[pixel + map_size*3 ] = outcoeff1.w;
    dct_coeffs_map1[pixel + map_size*4 ] = outcoeff2.x;
    dct_coeffs_map1[pixel + map_size*5 ] = outcoeff2.y;
    dct_coeffs_map1[pixel + map_size*6 ] = outcoeff2.z;
    dct_coeffs_map1[pixel + map_size*7 ] = outcoeff2.w;
    dct_coeffs_map1[pixel + map_size*8 ] = outcoeff3.x;
    dct_coeffs_map1[pixel + map_size*9 ] = outcoeff3.y;
    dct_coeffs_map1[pixel + map_size*10] = outcoeff3.z;
    dct_coeffs_map1[pixel + map_size*11] = outcoeff3.w;
    dct_coeffs_map1[pixel + map_size*12] = outcoeff4.x;
    dct_coeffs_map1[pixel + map_size*13] = outcoeff4.y;
    dct_coeffs_map1[pixel + map_size*14] = outcoeff4.z;
    dct_coeffs_map1[pixel + map_size*15] = outcoeff4.w;
}

// Removing any three of the four statements in this function makes it work.
//
// Adds one sample's contribution to the 16 running DCT coefficients.
//   coeff1..coeff4 : accumulators, updated in place through the pointers;
//                    coeff1 holds terms 0-3, coeff2 terms 4-7,
//                    coeff3 terms 8-11, coeff4 terms 12-15
//   att_intensity  : weight applied to every term of this sample
//   angle          : base angle; term k uses cosf(k * angle)
// Term 0 is scaled by u0, every other term by u1.
inline __device__
void caldctcoeffs(float4* coeff1, float4* coeff2, float4* coeff3, float4* coeff4,
                  float att_intensity, float angle)
{
    // The accumulators must be dereferenced: "coeff1 += ..." would act on the
    // pointer itself, not on the float4 it points to.
    *coeff1 += att_intensity * make_float4(
        u0 * cosf(0.0f),
        u1 * cosf(angle),
        u1 * cosf(2.0f*angle),
        u1 * cosf(3.0f*angle));

    // Terms 4-15 share the same scalar factor.
    const float weight = att_intensity * u1;

    *coeff2 += weight * make_float4(
        cosf(4.0f*angle),
        cosf(5.0f*angle),
        cosf(6.0f*angle),
        cosf(7.0f*angle));
    *coeff3 += weight * make_float4(
        cosf(8.0f*angle),
        cosf(9.0f*angle),
        cosf(10.0f*angle),
        cosf(11.0f*angle));
    *coeff4 += weight * make_float4(
        cosf(12.0f*angle),
        cosf(13.0f*angle),
        cosf(14.0f*angle),
        cosf(15.0f*angle));
}

My first guess would be that the kernel runs too long because of all the cosf calls and gets aborted or something.

Perhaps you should check if runtime limit for kernels is off in device capabilities (also known as kernel exec timeout).

For debugging purposes the watchdog also needs to be turned off; I think that setting was in the Nsight Monitor.

My first guess would be that the kernel runs too long because of all the cosf calls and gets aborted or something.

Perhaps you should check if runtime limit for kernels is off in device capabilities (also known as kernel exec timeout).

For debugging purposes the watchdog also needs to be turned off; I think that setting was in the Nsight Monitor.

Thank you for your reply. But I have tried changing the loop to run only 2 times, and it still does not work.

I even change the code like this :

// Debug variant of the marching loop inside attenMap_kernel: the trip count
// is forced to 1 (the original bound is commented out) and depth_map is
// written on entry to the loop body, before any texture fetch.
for(int i = 0;i < 1;/*attenSteps;*/i++)
{
	depth_map[y*mapWidth+x] =  1.0f; // I set the depth_map here to 1.0f

	coord = (pos-d_box_min)/d_box_size;
	uint occupyed = (uint)tex3D(tex_occupy,coord.x,coord.y,coord.z);
	if( occupyed > 127)
		break;

	float angle = (2.0f*(float)i+1.0f)*pi/(2.0f*(float)attenSteps);
	float dat = tex3D(tex_data,coord.x,coord.y,coord.z);
	tmp_atten = tmp_atten * expf(-dat*tstep*dat);
	float dis = plattcoeffa + plattcoeffb * depth;
	float att_intensity = tmp_atten  / (dis*dis);

	pos = pos + step;
	depth = depth + tstep;
	caldctcoeffs(&outcoeff1,&outcoeff2,&outcoeff3,&outcoeff4,att_intensity,angle);
}

but the image of depth_map is still dark.

Thank you for your reply. But I have tried changing the loop to run only 2 times, and it still does not work.

I even change the code like this :

// Debug variant of the marching loop inside attenMap_kernel: the trip count
// is forced to 1 (the original bound is commented out) and depth_map is
// written on entry to the loop body, before any texture fetch.
for(int i = 0;i < 1;/*attenSteps;*/i++)
{
	depth_map[y*mapWidth+x] =  1.0f; // I set the depth_map here to 1.0f

	coord = (pos-d_box_min)/d_box_size;
	uint occupyed = (uint)tex3D(tex_occupy,coord.x,coord.y,coord.z);
	if( occupyed > 127)
		break;

	float angle = (2.0f*(float)i+1.0f)*pi/(2.0f*(float)attenSteps);
	float dat = tex3D(tex_data,coord.x,coord.y,coord.z);
	tmp_atten = tmp_atten * expf(-dat*tstep*dat);
	float dis = plattcoeffa + plattcoeffb * depth;
	float att_intensity = tmp_atten  / (dis*dis);

	pos = pos + step;
	depth = depth + tstep;
	caldctcoeffs(&outcoeff1,&outcoeff2,&outcoeff3,&outcoeff4,att_intensity,angle);
}

but the image of depth_map is still dark.