I’m new to CUDA. Until recently I used HLSL, and this code worked well there, but after porting it to CUDA I ran into several strange errors.
Can anyone help me?
The following function is the first one that failed. It is something like ray casting, but I want to store the attenuation of the rays as 16 DCT coefficients. I check the depth by converting it to an image and rendering it. I find that when I remove any three of the four statements in function caldctcoeffs(), it renders correctly; otherwise the output is all dark (all zero). Even if I write a non-zero value at the end of function attenMap_kernel(), it is still dark.
// This is the function that fails
// Prototype for the DCT accumulation helper; its definition appears later in
// the file, and C++ requires a declaration before the call site.
inline __device__
void caldctcoeffs(float4 * coeff1, float4* coeff2, float4 * coeff3, float4* coeff4,
                  float att_intensity, float angle);

/*
 * Casts one ray per map pixel from lightPos through the bounding box
 * [d_box_min, d_box_max], marches attenSteps samples along it and accumulates
 * the attenuated intensity into 16 DCT coefficients.
 *
 * Thread layout: 2D launch; each thread handles the pixel
 *   (d_map_xoffset + blockIdx.x*blockDim.x + threadIdx.x,
 *    d_map_yoffset + blockIdx.y*blockDim.y + threadIdx.y)
 * and threads outside mapWidth x mapHeight exit immediately.
 *
 * Outputs (all indexed by y*mapWidth + x):
 *   depth_map        - scaled depth reached when marching stopped
 *   near_map         - entry distance into the box (clamped to >= 0)
 *   steplen_map      - marching step length for this ray
 *   dct_coeffs_map1  - 16 consecutive planes of mapWidth*mapHeight floats,
 *                      one per DCT coefficient; the buffer must therefore hold
 *                      at least 16*mapWidth*mapHeight floats.
 *
 * Pixels whose ray misses the box are left untouched in every output buffer.
 *
 * NOTE(review): the "all dark" symptom described for this kernel is consistent
 * with the launch itself failing (e.g. too many registers requested for the
 * chosen block size) - check cudaGetLastError() after the launch to confirm.
 */
__global__
void attenMap_kernel(float* depth_map, float* near_map, float* steplen_map, float* dct_coeffs_map1)
{
    uint x = d_map_xoffset + blockIdx.x*blockDim.x + threadIdx.x;
    uint y = d_map_yoffset + blockIdx.y*blockDim.y + threadIdx.y;
    if ((x >= mapWidth) || (y >= mapHeight)) return;

    // Pixel position relative to the map centre (v grows upwards).
    float u = x - mapWidth / 2.0f;
    float v = mapHeight / 2.0f - y;

    // Build the ray in light space, then rotate it with the light's inverse matrix.
    Ray eyeRay;
    eyeRay.o = lightPos;
    eyeRay.d = normalize(make_float3(u, v, -mapFocal));
    eyeRay.d = mul(light_invMatrix,eyeRay.d);

    float tnear, tfar;
    int hit = intersectBox(eyeRay, d_box_min, d_box_max, &tnear, &tfar);
    if (!hit) return;
    if (tnear < 0.0f) tnear = 0.0f; // clamp to near plane

    float tstep = (tfar-tnear)/attenSteps;
    float3 pos = eyeRay.o + eyeRay.d*tnear;
    float3 step = eyeRay.d*tstep;
    float depth = tnear;

    float4 outcoeff1 = make_float4(0.0f,0.0f,0.0f,0.0f);
    float4 outcoeff2 = make_float4(0.0f,0.0f,0.0f,0.0f);
    float4 outcoeff3 = make_float4(0.0f,0.0f,0.0f,0.0f);
    float4 outcoeff4 = make_float4(0.0f,0.0f,0.0f,0.0f);
    float3 coord;
    float tmp_atten = 1.0f;
    const float pi = 3.1415926535f;

    for(int i = 0;i < attenSteps;i++)
    {
        // Normalised texture coordinate of the current sample inside the box.
        coord = (pos-d_box_min)/d_box_size;

        // Stop marching as soon as the occupancy texture reports a value above 127.
        uint occupyed = (uint)tex3D(tex_occupy,coord.x,coord.y,coord.z);
        if( occupyed > 127)
            break;

        // DCT basis angle for sample i: (2i+1)*pi / (2*attenSteps).
        float angle = (2.0f*(float)i+1.0f)*pi/(2.0f*(float)attenSteps);

        float dat = tex3D(tex_data,coord.x,coord.y,coord.z);
        tmp_atten = tmp_atten * expf(-dat*tstep*dat);

        // Distance falloff uses the depth *before* it is advanced for this step.
        float dis = plattcoeffa + plattcoeffb * depth;
        float att_intensity = tmp_atten / (dis*dis);

        pos = pos + step;
        depth = depth + tstep;
        caldctcoeffs(&outcoeff1,&outcoeff2,&outcoeff3,&outcoeff4,att_intensity,angle);
    }

    // Flat pixel index shared by every output buffer.
    uint idx = y*mapWidth+x;

    depth_map[idx] = (depth - 3.0f*1.732f) * 0.3f;
    near_map[idx] = tnear;
    steplen_map[idx] = tstep;

    // Coefficient k lives in plane k of dct_coeffs_map1 (plane stride = map_size).
    uint map_size = mapWidth*mapHeight;
    dct_coeffs_map1[idx            ] = outcoeff1.x;
    dct_coeffs_map1[idx+map_size   ] = outcoeff1.y;
    dct_coeffs_map1[idx+map_size*2 ] = outcoeff1.z;
    dct_coeffs_map1[idx+map_size*3 ] = outcoeff1.w;
    dct_coeffs_map1[idx+map_size*4 ] = outcoeff2.x;
    dct_coeffs_map1[idx+map_size*5 ] = outcoeff2.y;
    dct_coeffs_map1[idx+map_size*6 ] = outcoeff2.z;
    dct_coeffs_map1[idx+map_size*7 ] = outcoeff2.w;
    dct_coeffs_map1[idx+map_size*8 ] = outcoeff3.x;
    dct_coeffs_map1[idx+map_size*9 ] = outcoeff3.y;
    dct_coeffs_map1[idx+map_size*10] = outcoeff3.z;
    dct_coeffs_map1[idx+map_size*11] = outcoeff3.w;
    dct_coeffs_map1[idx+map_size*12] = outcoeff4.x;
    dct_coeffs_map1[idx+map_size*13] = outcoeff4.y;
    dct_coeffs_map1[idx+map_size*14] = outcoeff4.z;
    dct_coeffs_map1[idx+map_size*15] = outcoeff4.w;
}
// If any three of the four statements in this function are removed, it works
/*
 * Adds one sample's contribution to the 16 running DCT coefficients.
 *
 * coeff1..coeff4  - in/out accumulators; together they hold coefficients
 *                   0..15 (four per float4, in x,y,z,w order).
 * att_intensity   - weight of this sample.
 * angle           - basis angle of this sample; coefficient k receives
 *                   att_intensity * scale * cosf(k * angle), where scale is
 *                   u0 for k == 0 and u1 for every other k.
 *
 * The accumulators are updated through the pointers (note the dereference);
 * adding to the pointer itself would leave the caller's values untouched.
 * Relies on float4 `+=` and `float * float4` overloads being in scope
 * (presumably from the project's vector-math helper header - confirm).
 */
inline __device__
void caldctcoeffs(float4 * coeff1, float4* coeff2, float4 * coeff3, float4* coeff4,
                  float att_intensity, float angle)
{
    // Coefficients 0..3: the DC term uses u0, the rest use u1.
    *coeff1 += att_intensity * make_float4(
        u0 * cosf(0.0f),
        u1 * cosf(angle),
        u1 * cosf(2.0f*angle),
        u1 * cosf(3.0f*angle));
    // Coefficients 4..7.
    *coeff2 += att_intensity * u1 * make_float4(
        cosf(4.0f*angle),
        cosf(5.0f*angle),
        cosf(6.0f*angle),
        cosf(7.0f*angle));
    // Coefficients 8..11.
    *coeff3 += att_intensity * u1 * make_float4(
        cosf(8.0f*angle),
        cosf(9.0f*angle),
        cosf(10.0f*angle),
        cosf(11.0f*angle));
    // Coefficients 12..15.
    *coeff4 += att_intensity * u1 * make_float4(
        cosf(12.0f*angle),
        cosf(13.0f*angle),
        cosf(14.0f*angle),
        cosf(15.0f*angle));
}