I have this huge function that i ported from a c application to run on cuda as a proof of concept for potential speedups.

The initial function isnt by me, and im inserting it back into a program that isnt mine, so im not able to provide a test application.

So this is really a shot in the dark, maybe itll jump at someones eyes but im not expecting miracle.

So basicaly, when i run it in emudebug, it is in concordance with the results i had with the original c function.

In debug mode, yes yes, the function does run, im not getting all 0’s or anything like that. But the results differ from slightly (3-4%) to a lot (50-100%).

The application is used to calculate the dose received in radiotherapy treatment and it outputs its results over a tomographic map of the patient, so i test results visually and i output them in text files and use the compare tool from office word to compare them in a more precise fashion.

So anyway… yup, difference between emudebug and debug/release.

Whats weird is that im pretty sure i did get the “wrong” result (that is, the one im getting constantly from debug/release") at least once in emudebug. Then, trying to figure out the source i changed all floats to doubles, and i got the “correct” result. When i switched back to floats, i continued to get the “correct” results.

Ill be pasting the function below, but as ive said, im not expecting any miracle. Maybe someone has had a similar experience.

This is running on a 8800gt, so no double precision. Id be happy to know it IS a float error, but what troubles me is that it works in emudebug and that, as far as i know, there isnt any “by default” double precision code in there.

I can say that ive tested values up to this block:

} else {

```
iip = (int)ifp;
idp = ifp - iip;
}
```

(its near the end) by utilizing all variables in the output array to make sure they didnt get optimized away.

```
__global__ void convolveCuda(float3 *points,PHOTON_DOSE *pdose,UNIT *unit,THREED_MAP *terma_map
ï¿½ Â ï¿½ Â ï¿½ Â Â ,short* tdata,unsigned char *ddata, short *ldata,float* udata,float *funcx, float* funcy, short* edata,
ï¿½ Â ï¿½ Â ï¿½ Â Â float *kernel_phi,float* kernel_r,float *kernel_dr,float *outval)
{
float3 point;
point.x=point.y=point.z=-10000.0f;
if(idx<params.numpoints)
Â point = points[idx];
float x = point.x;float y = point.y;float z = point.z;
if(!(x<-9000.0f&&y<-9000.0f))
{
Â float phi_shift,theta_shift,phi,sin_phi,cos_phi,theta,sin_theta,cos_theta,rx,ry,rz,rad_dist,
ï¿½ Â r,z_scale,bx,fx,ix,by,iy,fy,fz,iz,bz,t1,l1,d1,t2,l2,d2,terma=0,density=0.0f,rd=0.0f,x1,y1,z1,dot_product,phi_t,ifp,idp,
ï¿½ Â energy=0.0f,dval,dx,dy,dz;
Â float val=0.0f;
Â float c = 0.0f,ysum,t;
Â int offset,index,rindex,d_offset,iip,last_rindex;
Â int x_dim,xy_dim;
Â float tphi[3];
Â float t_kernel[4][3][2];
Â float angle_en[3];
Â /* cos and sin of the zenith and azimuth angle of PtD on CAX, */
Â float ï¿½ Â ï¿½ Â c1 = 1.0f, s1 = 0.0f, c2 = 1.0f, s2 = 0.0f;
Â float ï¿½ Â ï¿½ Â r1, r2, rx2, ry2, rz2;
Â int num_angles; /* # of angles to interpolate kernel value for TKA */
Â int tp; /* index of bounding kernal angles of phi_t */
Â int tr;
Â num_angles = 2; /* hard-coded for now. pdose->num_angles = 3 to be tested */
Â /* initialize the kernel tilting arrary */
Â for(int a = 0; a < 3; a++) Â //NUM_ANGLES
Â {
ï¿½ Â tphi[a] = 0.0f;
ï¿½ Â for (int d = 0; d < params.num; d++)
ï¿½ Â {
ï¿½ Â Â t_kernel[d][a][0] = 0.0f;
ï¿½ Â Â t_kernel[d][a][1] = 0.0f;
ï¿½ Â }
Â }
Â if(unit->source_ratio >0){
ï¿½ Â z_scale = pdose->z_projection/z;
Â } else
Â {
ï¿½ Â z_scale = (pdose->z_projection - unit->PSD)/(z-unit->PSD);
Â }
Â float nx = terma_map->inv_x_scale*(x * z_scale - terma_map->x_start);
Â float ny = terma_map->inv_y_scale*(y * z_scale - terma_map->y_start);
Â float nz = terma_map->inv_z_scale*(z - terma_map->z_start);
Â /* use this debug line only when there are a few points to be calced.
Â Otherwise, there will be a stream of data on the screen that won't
Â be helpful. Normally, comment this line out. */
Â //printf("x y z %f %f %f nx ny nz %f %f %f\n", x, y, z, nx, ny, nz);
Â
Â /* any point that isn't in the terma map, do not compute its dose */
Â
Â if (ny < 0.0f || ny > terma_map->y_dim-1)
ï¿½ Â {
ï¿½ Â if(idx<params.numpoints)
ï¿½ Â Â outval[idx]=0;
ï¿½ Â return;
Â }
Â if (nz < 0.0f || nz > terma_map->z_dim-1)
Â {
ï¿½ Â if(idx<params.numpoints)
ï¿½ Â Â outval[idx]=0; ï¿½ Â
ï¿½ Â return;
Â }
Â if (nx < 0.0f || nx > terma_map->x_dim-1)
Â {
ï¿½ Â if(idx<params.numpoints)
ï¿½ Â Â outval[idx]=0;
ï¿½ Â return;
Â }
Â x_dim = terma_map->x_dim;
Â xy_dim = x_dim*terma_map->y_dim;
Â /* computes the depth for PtD from ecd_map */
Â fx = nx;
Â fy = ny;
Â fz = nz;
Â ix = (int)fx;
Â iy = (int)fy;
Â iz = (int)fz;
Â index = ix + x_dim*iy + xy_dim*iz;
Â dx = fx - ix;
Â dy = fy - iy;
Â dz = fz - iz;
Â float e1 =
ï¿½ Â (1.0f-dx)*(1.0f-dy)*edata[index] +
ï¿½ Â ( ï¿½ Â Â dx)*(1.0f-dy)*edata[index+1] +
ï¿½ Â (1.0f-dx)*( ï¿½ Â Â dy)*edata[index+x_dim] +
ï¿½ Â ( ï¿½ Â Â dx)*( ï¿½ Â Â dy)*edata[index+1+x_dim];
Â index += xy_dim;
Â float e2 =
ï¿½ Â (1.0f-dx)*(1.0f-dy)*edata[index] +
ï¿½ Â ( ï¿½ Â Â dx)*(1.0f-dy)*edata[index+1] +
ï¿½ Â (1.0f-dx)*( ï¿½ Â Â dy)*edata[index+x_dim] +
ï¿½ Â ( ï¿½ Â Â dx)*( ï¿½ Â Â dy)*edata[index+1+x_dim];
Â float ec_depth = (1.0f-dz)*e1 + dz*e2;
Â ec_depth *= 0.005f;
Â r1 = sqrtf((float)(x*x + y*y)); /* distance from PtD to CAX */
Â r2 = z; /* distance from PtD to source */
Â if( r1 >= 1.0E-3f) /* i.e. r1 is near zero, PtD is near CAX */
Â {
ï¿½ Â /* r1 is distance from PtD to the CAX, perpendicularly */
ï¿½ Â /* r2 is distance from PtD calc to the beam source */
ï¿½ Â r2 = sqrtf(r1*r1 + z*z);
ï¿½ Â /* cos and sin of the zenith angle phi of the vector from the
ï¿½ Â source to PtD and CAX, which is from the source to SAD. */
ï¿½ Â c1 = z/r2;
ï¿½ Â s1 = r1/r2;
ï¿½ Â /* cos and sin of the azimuth angle (theta) of the vector from
ï¿½ Â the source to PtD.
ï¿½ Â 0 degree at -x axis, and increase towards -y axis. */
ï¿½ Â c2 = x/r1;
ï¿½ Â s2 = y/r1;
Â }
Â float d_phi = kernel_phi[1] - kernel_phi[0];
Â /* if(x == 0 && (int)ny == 0){
Â fh2=fopen("points.csv", "a");
Â fprintf(fh2, "%f, %f, %f, %f, %f, %f\n", x, y, z, nx, ny, nz);
Â fclose(fh2);
Â }
Â */
Â
Â for(int i=0;i<params.knphi;i++)
Â {
ï¿½ Â if(pdose->angle_shift) {
ï¿½ Â Â //printf("angle_shift is %d\n", pdose->angle_shift);
ï¿½ Â Â phi_shift = 0.125f * d_phi;
ï¿½ Â Â theta_shift = 0.125f*(i&7);
ï¿½ Â }else
ï¿½ Â {
ï¿½ Â Â //printf("no angle_shift\n");
ï¿½ Â Â phi_shift = 0;
ï¿½ Â Â theta_shift = 0;
ï¿½ Â }
ï¿½ Â for(int j=0;j<params.kntheta;j++)
ï¿½ Â {
ï¿½ Â Â phi = kernel_phi[i] + phi_shift *(j - 3);
ï¿½ Â Â sin_phi = __sinf(phi);
ï¿½ Â Â cos_phi = __cosf(phi);
ï¿½ Â Â /* offset the theta[0] by theta_shift with each phi iteration,
ï¿½ Â Â starting from 0 */
ï¿½ Â Â theta = (theta_shift + j)*2.0*PI/params.kntheta;
ï¿½ Â Â sin_theta = __sinf(theta);
ï¿½ Â Â cos_theta = __cosf(theta);
ï¿½ Â Â /* unit vector from PtD to PtI Â with angles phi and theta */
ï¿½ Â Â rx = sin_phi*cos_theta;
ï¿½ Â Â ry = sin_phi*sin_theta;
ï¿½ Â Â rz = -cos_phi;
ï¿½ Â Â rad_dist = 0.0f;
ï¿½ Â Â last_rindex = 0;
ï¿½ Â Â for(int k=1;k<params.knr;k++)
ï¿½ Â Â {
ï¿½ Â ï¿½ Â r = 0.5f*(kernel_r[k] + kernel_r[k-1]);
ï¿½ Â ï¿½ Â /* r of given(phi, theta) transformed to z in BEAM CS. */
ï¿½ Â ï¿½ Â bz = r*rz;
ï¿½ Â ï¿½ Â if(unit->source_ratio >0){
ï¿½ Â ï¿½ Â Â z_scale = pdose->z_projection/(z+bz);
ï¿½ Â ï¿½ Â } else
ï¿½ Â ï¿½ Â {
ï¿½ Â ï¿½ Â Â z_scale = (pdose->z_projection - unit->PSD)/(z-unit->PSD+bz);
ï¿½ Â ï¿½ Â }
ï¿½ Â ï¿½ Â /* r of given(phi, theta) transformed to x in BEAM CS. */
ï¿½ Â ï¿½ Â bx = r*rx;
ï¿½ Â ï¿½ Â /* fx is the index in x dimension of the terma map */
ï¿½ Â ï¿½ Â fx = terma_map->inv_x_scale*(z_scale*(bx+x)-terma_map->x_start);
ï¿½ Â ï¿½ Â /* ix is the integer part of fx */
ï¿½ Â ï¿½ Â ix = (int)fx;
ï¿½ Â ï¿½ Â /* no dose contribution from PtI out of TERMA map */
ï¿½ Â ï¿½ Â if (fx < 0 || ix > terma_map->x_dim-2) break;
ï¿½ Â ï¿½ Â /* similar as the above quantities in y and z dimension */
ï¿½ Â ï¿½ Â by = r*ry;
ï¿½ Â ï¿½ Â fy = terma_map->inv_y_scale*(z_scale*(by+y)-terma_map->y_start);
ï¿½ Â ï¿½ Â iy = (int)fy;
ï¿½ Â ï¿½ Â if (fy < 0 || iy > terma_map->y_dim-2) break;
ï¿½ Â ï¿½ Â fz = terma_map->inv_z_scale*(z + bz - terma_map->z_start);
ï¿½ Â ï¿½ Â iz = (int)fz;
ï¿½ Â ï¿½ Â if (fz < 0 || iz > terma_map->z_dim-2) break;
ï¿½ Â ï¿½ Â index = ix + x_dim*iy + xy_dim*iz;
ï¿½ Â ï¿½ Â if(r < pdose->max_r)
ï¿½ Â ï¿½ Â {
ï¿½ Â ï¿½ Â Â dx = fx - ix;
ï¿½ Â ï¿½ Â Â dy = fy - iy;
ï¿½ Â ï¿½ Â Â dz = fz - iz;
ï¿½ Â ï¿½ Â Â t1 =
ï¿½ Â ï¿½ Â ï¿½ Â (1.0f-dx)*(1.0f-dy)*tdata[index] +
ï¿½ Â ï¿½ Â ï¿½ Â ( ï¿½ Â Â dx)*(1.0f-dy)*tdata[index+1] +
ï¿½ Â ï¿½ Â ï¿½ Â (1.0f-dx)*( ï¿½ Â Â dy)*tdata[index+x_dim] +
ï¿½ Â ï¿½ Â ï¿½ Â ( ï¿½ Â Â dx)*( ï¿½ Â Â dy)*tdata[index+1+x_dim];
ï¿½ Â ï¿½ Â Â d1 =
ï¿½ Â ï¿½ Â ï¿½ Â (1.0f-dx)*(1.0f-dy)*ddata[index] +
ï¿½ Â ï¿½ Â ï¿½ Â ( ï¿½ Â Â dx)*(1.0f-dy)*ddata[index+1] +
ï¿½ Â ï¿½ Â ï¿½ Â (1.0f-dx)*( ï¿½ Â Â dy)*ddata[index+x_dim] +
ï¿½ Â ï¿½ Â ï¿½ Â ( ï¿½ Â Â dx)*( ï¿½ Â Â dy)*ddata[index+1+x_dim];
ï¿½ Â ï¿½ Â Â l1 =
ï¿½ Â ï¿½ Â ï¿½ Â (1.0f-dx)*(1.0f-dy)*ldata[index] +
ï¿½ Â ï¿½ Â ï¿½ Â ( ï¿½ Â Â dx)*(1.0f-dy)*ldata[index+1] +
ï¿½ Â ï¿½ Â ï¿½ Â (1.0f-dx)*( ï¿½ Â Â dy)*ldata[index+x_dim] +
ï¿½ Â ï¿½ Â ï¿½ Â ( ï¿½ Â Â dx)*( ï¿½ Â Â dy)*ldata[index+1+x_dim];
ï¿½ Â ï¿½ Â Â index += xy_dim;
ï¿½ Â ï¿½ Â Â t2 =
ï¿½ Â ï¿½ Â ï¿½ Â (1.0f-dx)*(1.0f-dy)*tdata[index] +
ï¿½ Â ï¿½ Â ï¿½ Â ( ï¿½ Â Â dx)*(1.0f-dy)*tdata[index+1] +
ï¿½ Â ï¿½ Â ï¿½ Â (1.0f-dx)*( ï¿½ Â Â dy)*tdata[index+x_dim] +
ï¿½ Â ï¿½ Â ï¿½ Â ( ï¿½ Â Â dx)*( ï¿½ Â Â dy)*tdata[index+1+x_dim];
ï¿½ Â ï¿½ Â Â d2 =
ï¿½ Â ï¿½ Â ï¿½ Â (1.0f-dx)*(1.0f-dy)*ddata[index] +
ï¿½ Â ï¿½ Â ï¿½ Â ( ï¿½ Â Â dx)*(1.0f-dy)*ddata[index+1] +
ï¿½ Â ï¿½ Â ï¿½ Â (1.0f-dx)*( ï¿½ Â Â dy)*ddata[index+x_dim] +
ï¿½ Â ï¿½ Â ï¿½ Â ( ï¿½ Â Â dx)*( ï¿½ Â Â dy)*ddata[index+1+x_dim];
ï¿½ Â ï¿½ Â Â l2 =
ï¿½ Â ï¿½ Â ï¿½ Â (1.0f-dx)*(1.0f-dy)*ldata[index] +
ï¿½ Â ï¿½ Â ï¿½ Â ( ï¿½ Â Â dx)*(1.0f-dy)*ldata[index+1] +
ï¿½ Â ï¿½ Â ï¿½ Â (1.0f-dx)*( ï¿½ Â Â dy)*ldata[index+x_dim] +
ï¿½ Â ï¿½ Â ï¿½ Â ( ï¿½ Â Â dx)*( ï¿½ Â Â dy)*ldata[index+1+x_dim];
ï¿½ Â ï¿½ Â Â terma = (1.0f-dz)*t1 + dz*t2;
ï¿½ Â ï¿½ Â Â density = (1.0f-dz)*d1 + dz*d2;
ï¿½ Â ï¿½ Â Â rd = (1.0f-dz)*l1 + dz*l2;
ï¿½ Â ï¿½ Â }
ï¿½ Â ï¿½ Â else
ï¿½ Â ï¿½ Â {
ï¿½ Â ï¿½ Â Â terma = tdata[index];
ï¿½ Â ï¿½ Â Â density = ddata[index];
ï¿½ Â ï¿½ Â Â rd = ldata[index];
ï¿½ Â ï¿½ Â }
ï¿½ Â ï¿½ Â terma = terma/pdose->terma_scale;
ï¿½ Â ï¿½ Â density = 0.01f*density; // scale back to the actual density
ï¿½ Â ï¿½ Â rd = 0.005f*rd; // scale back to the actual radiological depth
ï¿½ Â ï¿½ Â /* use the kernel of chosen depth depth for convolution */
ï¿½ Â ï¿½ Â if(pdose->no_kh > 0) rd = funcx[pdose->no_kh];
ï¿½ Â ï¿½ Â rad_dist += density*kernel_dr[k]; /* dr[k] distance increment of kth r element */
ï¿½ Â ï¿½ Â rindex = (int)(rad_dist + 0.001f); // index to uniform radiological dist. array
ï¿½ Â ï¿½ Â //add 0.001 to avoid 1.999... to be round up to 1. It is better to be 2.
ï¿½ Â ï¿½ Â if (rindex >= params.knur) break; Â
ï¿½ Â ï¿½ Â /* BEAM coordinates of PtI */ ï¿½ Â Â
ï¿½ Â ï¿½ Â x1 = x + bx;
ï¿½ Â ï¿½ Â y1 = y + by;
ï¿½ Â ï¿½ Â z1 = z + bz;
ï¿½ Â ï¿½ Â dot_product = (bx*x1 + by*y1 +bz*z1)/(sqrtf(bx*bx+by*by+bz*bz)*sqrtf(x1*x1+y1*y1+z1*z1));
ï¿½ Â ï¿½ Â if(dot_product > 1) dot_product = 1.0f;
ï¿½ Â ï¿½ Â else if(dot_product < -1) dot_product = -1.0f;
ï¿½ Â ï¿½ Â
ï¿½ Â ï¿½ Â /* new kernel angle after tilting for volume element (phi, theta, r) */
ï¿½ Â ï¿½ Â phi_t = acosf(-dot_product);
ï¿½ Â ï¿½ Â
ï¿½ Â ï¿½ Â ifp = phi_t/d_phi - 0.5f; /* float index of phi_t */
ï¿½ Â ï¿½ Â if(ifp <= 0) {
ï¿½ Â ï¿½ Â Â iip = 0; /* integer index of phi_t 's small bounding angle */
ï¿½ Â ï¿½ Â Â idp = 0.0f;
ï¿½ Â ï¿½ Â } else if (ifp >= params.knphi - 1) {
ï¿½ Â ï¿½ Â Â iip = params.knphi - 2;
ï¿½ Â ï¿½ Â Â idp = 1.0f;
ï¿½ Â ï¿½ Â } else {
ï¿½ Â ï¿½ Â Â iip = (int)ifp;
ï¿½ Â ï¿½ Â Â idp = ifp - iip;
ï¿½ Â ï¿½ Â }
ï¿½ Â ï¿½ Â for (int a = 0; a < 2; a++)
ï¿½ Â ï¿½ Â {
ï¿½ Â ï¿½ Â Â /* neighboring angles for interpolation */
ï¿½ Â ï¿½ Â Â tphi[a] = kernel_phi[iip + a];
ï¿½ Â ï¿½ Â Â //printf("%f ", tphi[a]);
ï¿½ Â ï¿½ Â }
ï¿½ Â ï¿½ Â float *p_udata = &udata[iip * params.knur];
ï¿½ Â ï¿½ Â d_offset = params.knphi * params.knur;
ï¿½ Â ï¿½ Â /* Determine kernel values for the bounding voxels around (phi_t, r) */
ï¿½ Â ï¿½ Â for (int d = 0; d<params.num; d++){
ï¿½ Â ï¿½ Â Â for (int tp = 0; tp < 2; tp++){
ï¿½ Â ï¿½ Â ï¿½ Â offset = d*d_offset + tp * params.knur;
ï¿½ Â ï¿½ Â ï¿½ Â t_kernel[d][tp][0] = p_udata[offset + last_rindex];
ï¿½ Â ï¿½ Â ï¿½ Â t_kernel[d][tp][1] = p_udata[offset + rindex];
ï¿½ Â ï¿½ Â ï¿½ Â /* kernel energy for (phi_t, r)'s neighbouring voxels,
ï¿½ Â ï¿½ Â ï¿½ Â normalized by the voxel's solid angle */
ï¿½ Â ï¿½ Â ï¿½ Â angle_en[tp] = (t_kernel[d][tp][1] - t_kernel[d][tp][0])/__sinf(tphi[tp]);
ï¿½ Â ï¿½ Â ï¿½ Â //printf("%d %f %f %f\n", offset, t_kernel[d][tp][0], t_kernel[d][tp][1], angle_en[tp]);
ï¿½ Â ï¿½ Â Â }
ï¿½ Â ï¿½ Â Â funcy[d] = (1-idp) * angle_en[0] + idp * angle_en[1];
ï¿½ Â ï¿½ Â }
ï¿½ Â ï¿½ Â /* interpolate for depth hardened kernel, */
ï¿½ Â ï¿½ Â energy = d_interp_func(rd,params.num,funcx,funcy,params.clamp)*__sinf (phi);
ï¿½ Â ï¿½ Â //energy = __sinf (phi)/10000.0f*phi_t*iip*idp;
ï¿½ Â ï¿½ Â /* kernel energy is specified for 360 degree ring at phi_t.
ï¿½ Â ï¿½ Â DOSE space is divided into num_theta sections for the 360 degree ring at phi. */
ï¿½ Â ï¿½ Â dval = terma*energy/params.kntheta;
ï¿½ Â ï¿½ Â val+=dval;
ï¿½ Â ï¿½ Â last_rindex = rindex; ï¿½ Â ï¿½ Â
ï¿½ Â Â }
ï¿½ Â }
Â }
Â //printf("val: %d\n",outval[idx]);
Â if(idx<params.numpoints)
ï¿½ Â outval[idx]=val;
}
else
{
Â if(idx<params.numpoints)
ï¿½ Â outval[idx]=0;
}
}
```

```
__device__ float d_interp_func(float x, int num, float *X, float *Y, int clamp)
{ Â int Â i;
Â Â float dx;
Â Â if (num == 0) return(0.0f);
Â Â if (x < X[0]) {
if (clamp) return(Y[0]);
dx = (x - X[0])/(X[1] - X[0]);
return((1.0f - dx)*Y[0] + dx*Y[1]);
Â Â }
Â Â for (i = 1; i < num; i++) {
if (x < X[i]) {
Â Â dx = (x - X[i-1])/(X[i] - X[i-1]);
Â Â return((1.0f - dx)*Y[i-1] + dx*Y[i]);
}
Â Â }
Â Â /* x is greater than or equal to X[num-1] */
Â Â if (clamp) return(Y[num-1]);
Â Â i = num - 1;
Â Â dx = (x - X[i-1])/(X[i] - X[i-1]);
Â Â return((1.0f - dx)*Y[i-1] + dx*Y[i]);
}
```

Yes… its huge.