I have this huge function that i ported from a c application to run on cuda as a proof of concept for potential speedups.
The initial function isnt by me, and im inserting it back into a program that isnt mine, so im not able to provide a test application.
So this is really a shot in the dark, maybe itll jump at someones eyes but im not expecting miracle.
So basicaly, when i run it in emudebug, it is in concordance with the results i had with the original c function.
In debug mode, yes yes, the function does run, im not getting all 0’s or anything like that. But the results differ from slightly (3-4%) to a lot (50-100%).
The application is used to calculate the dose received in radiotherapy treatment and it outputs its results over a tomographic map of the patient, so i test results visually and i output them in text files and use the compare tool from office word to compare them in a more precise fashion.
So anyway… yup, difference between emudebug and debug/release.
Whats weird is that im pretty sure i did get the “wrong” result (that is, the one im getting constantly from debug/release") at least once in emudebug. Then, trying to figure out the source i changed all floats to doubles, and i got the “correct” result. When i switched back to floats, i continued to get the “correct” results.
Ill be pasting the function below, but as ive said, im not expecting any miracle. Maybe someone has had a similar experience.
This is running on a 8800gt, so no double precision. Id be happy to know it IS a float error, but what troubles me is that it works in emudebug and that, as far as i know, there isnt any “by default” double precision code in there.
I can say that ive tested values up to this block:
} else {
iip = (int)ifp;
idp = ifp - iip;
}
(its near the end) by utilizing all variables in the output array to make sure they didnt get optimized away.
__global__ void convolveCuda(float3 *points,PHOTON_DOSE *pdose,UNIT *unit,THREED_MAP *terma_map
�  �  �   ,short* tdata,unsigned char *ddata, short *ldata,float* udata,float *funcx, float* funcy, short* edata,
�  �  �   float *kernel_phi,float* kernel_r,float *kernel_dr,float *outval)
{
float3 point;
point.x=point.y=point.z=-10000.0f;
if(idx<params.numpoints)
 point = points[idx];
float x = point.x;float y = point.y;float z = point.z;
if(!(x<-9000.0f&&y<-9000.0f))
{
 float phi_shift,theta_shift,phi,sin_phi,cos_phi,theta,sin_theta,cos_theta,rx,ry,rz,rad_dist,
�  r,z_scale,bx,fx,ix,by,iy,fy,fz,iz,bz,t1,l1,d1,t2,l2,d2,terma=0,density=0.0f,rd=0.0f,x1,y1,z1,dot_product,phi_t,ifp,idp,
�  energy=0.0f,dval,dx,dy,dz;
 float val=0.0f;
 float c = 0.0f,ysum,t;
 int offset,index,rindex,d_offset,iip,last_rindex;
 int x_dim,xy_dim;
 float tphi[3];
 float t_kernel[4][3][2];
 float angle_en[3];
 /* cos and sin of the zenith and azimuth angle of PtD on CAX, */
 float �  �  c1 = 1.0f, s1 = 0.0f, c2 = 1.0f, s2 = 0.0f;
 float �  �  r1, r2, rx2, ry2, rz2;
 int num_angles; /* # of angles to interpolate kernel value for TKA */
 int tp; /* index of bounding kernal angles of phi_t */
 int tr;
 num_angles = 2; /* hard-coded for now. pdose->num_angles = 3 to be tested */
 /* initialize the kernel tilting arrary */
 for(int a = 0; a < 3; a++)  //NUM_ANGLES
 {
�  tphi[a] = 0.0f;
�  for (int d = 0; d < params.num; d++)
�  {
�   t_kernel[d][a][0] = 0.0f;
�   t_kernel[d][a][1] = 0.0f;
�  }
 }
 if(unit->source_ratio >0){
�  z_scale = pdose->z_projection/z;
 } else
 {
�  z_scale = (pdose->z_projection - unit->PSD)/(z-unit->PSD);
 }
 float nx = terma_map->inv_x_scale*(x * z_scale - terma_map->x_start);
 float ny = terma_map->inv_y_scale*(y * z_scale - terma_map->y_start);
 float nz = terma_map->inv_z_scale*(z - terma_map->z_start);
 /* use this debug line only when there are a few points to be calced.
 Otherwise, there will be a stream of data on the screen that won't
 be helpful. Normally, comment this line out. */
 //printf("x y z %f %f %f nx ny nz %f %f %f\n", x, y, z, nx, ny, nz);
Â
 /* any point that isn't in the terma map, do not compute its dose */
Â
 if (ny < 0.0f || ny > terma_map->y_dim-1)
�  {
�  if(idx<params.numpoints)
�   outval[idx]=0;
�  return;
 }
 if (nz < 0.0f || nz > terma_map->z_dim-1)
 {
�  if(idx<params.numpoints)
�   outval[idx]=0; � Â
�  return;
 }
 if (nx < 0.0f || nx > terma_map->x_dim-1)
 {
�  if(idx<params.numpoints)
�   outval[idx]=0;
�  return;
 }
 x_dim = terma_map->x_dim;
 xy_dim = x_dim*terma_map->y_dim;
 /* computes the depth for PtD from ecd_map */
 fx = nx;
 fy = ny;
 fz = nz;
 ix = (int)fx;
 iy = (int)fy;
 iz = (int)fz;
 index = ix + x_dim*iy + xy_dim*iz;
 dx = fx - ix;
 dy = fy - iy;
 dz = fz - iz;
 float e1 =
�  (1.0f-dx)*(1.0f-dy)*edata[index] +
�  ( �   dx)*(1.0f-dy)*edata[index+1] +
�  (1.0f-dx)*( �   dy)*edata[index+x_dim] +
�  ( �   dx)*( �   dy)*edata[index+1+x_dim];
 index += xy_dim;
 float e2 =
�  (1.0f-dx)*(1.0f-dy)*edata[index] +
�  ( �   dx)*(1.0f-dy)*edata[index+1] +
�  (1.0f-dx)*( �   dy)*edata[index+x_dim] +
�  ( �   dx)*( �   dy)*edata[index+1+x_dim];
 float ec_depth = (1.0f-dz)*e1 + dz*e2;
 ec_depth *= 0.005f;
 r1 = sqrtf((float)(x*x + y*y)); /* distance from PtD to CAX */
 r2 = z; /* distance from PtD to source */
 if( r1 >= 1.0E-3f) /* i.e. r1 is near zero, PtD is near CAX */
 {
�  /* r1 is distance from PtD to the CAX, perpendicularly */
�  /* r2 is distance from PtD calc to the beam source */
�  r2 = sqrtf(r1*r1 + z*z);
�  /* cos and sin of the zenith angle phi of the vector from the
�  source to PtD and CAX, which is from the source to SAD. */
�  c1 = z/r2;
�  s1 = r1/r2;
�  /* cos and sin of the azimuth angle (theta) of the vector from
�  the source to PtD.
�  0 degree at -x axis, and increase towards -y axis. */
�  c2 = x/r1;
�  s2 = y/r1;
 }
 float d_phi = kernel_phi[1] - kernel_phi[0];
 /* if(x == 0 && (int)ny == 0){
 fh2=fopen("points.csv", "a");
 fprintf(fh2, "%f, %f, %f, %f, %f, %f\n", x, y, z, nx, ny, nz);
 fclose(fh2);
 }
 */
Â
 for(int i=0;i<params.knphi;i++)
 {
�  if(pdose->angle_shift) {
�   //printf("angle_shift is %d\n", pdose->angle_shift);
�   phi_shift = 0.125f * d_phi;
�   theta_shift = 0.125f*(i&7);
�  }else
�  {
�   //printf("no angle_shift\n");
�   phi_shift = 0;
�   theta_shift = 0;
�  }
�  for(int j=0;j<params.kntheta;j++)
�  {
�   phi = kernel_phi[i] + phi_shift *(j - 3);
�   sin_phi = __sinf(phi);
�   cos_phi = __cosf(phi);
�   /* offset the theta[0] by theta_shift with each phi iteration,
�   starting from 0 */
�   theta = (theta_shift + j)*2.0*PI/params.kntheta;
�   sin_theta = __sinf(theta);
�   cos_theta = __cosf(theta);
�   /* unit vector from PtD to PtI  with angles phi and theta */
�   rx = sin_phi*cos_theta;
�   ry = sin_phi*sin_theta;
�   rz = -cos_phi;
�   rad_dist = 0.0f;
�   last_rindex = 0;
�   for(int k=1;k<params.knr;k++)
�   {
�  �  r = 0.5f*(kernel_r[k] + kernel_r[k-1]);
�  �  /* r of given(phi, theta) transformed to z in BEAM CS. */
�  �  bz = r*rz;
�  �  if(unit->source_ratio >0){
�  �   z_scale = pdose->z_projection/(z+bz);
�  �  } else
�  �  {
�  �   z_scale = (pdose->z_projection - unit->PSD)/(z-unit->PSD+bz);
�  �  }
�  �  /* r of given(phi, theta) transformed to x in BEAM CS. */
�  �  bx = r*rx;
�  �  /* fx is the index in x dimension of the terma map */
�  �  fx = terma_map->inv_x_scale*(z_scale*(bx+x)-terma_map->x_start);
�  �  /* ix is the integer part of fx */
�  �  ix = (int)fx;
�  �  /* no dose contribution from PtI out of TERMA map */
�  �  if (fx < 0 || ix > terma_map->x_dim-2) break;
�  �  /* similar as the above quantities in y and z dimension */
�  �  by = r*ry;
�  �  fy = terma_map->inv_y_scale*(z_scale*(by+y)-terma_map->y_start);
�  �  iy = (int)fy;
�  �  if (fy < 0 || iy > terma_map->y_dim-2) break;
�  �  fz = terma_map->inv_z_scale*(z + bz - terma_map->z_start);
�  �  iz = (int)fz;
�  �  if (fz < 0 || iz > terma_map->z_dim-2) break;
�  �  index = ix + x_dim*iy + xy_dim*iz;
�  �  if(r < pdose->max_r)
�  �  {
�  �   dx = fx - ix;
�  �   dy = fy - iy;
�  �   dz = fz - iz;
�  �   t1 =
�  �  �  (1.0f-dx)*(1.0f-dy)*tdata[index] +
�  �  �  ( �   dx)*(1.0f-dy)*tdata[index+1] +
�  �  �  (1.0f-dx)*( �   dy)*tdata[index+x_dim] +
�  �  �  ( �   dx)*( �   dy)*tdata[index+1+x_dim];
�  �   d1 =
�  �  �  (1.0f-dx)*(1.0f-dy)*ddata[index] +
�  �  �  ( �   dx)*(1.0f-dy)*ddata[index+1] +
�  �  �  (1.0f-dx)*( �   dy)*ddata[index+x_dim] +
�  �  �  ( �   dx)*( �   dy)*ddata[index+1+x_dim];
�  �   l1 =
�  �  �  (1.0f-dx)*(1.0f-dy)*ldata[index] +
�  �  �  ( �   dx)*(1.0f-dy)*ldata[index+1] +
�  �  �  (1.0f-dx)*( �   dy)*ldata[index+x_dim] +
�  �  �  ( �   dx)*( �   dy)*ldata[index+1+x_dim];
�  �   index += xy_dim;
�  �   t2 =
�  �  �  (1.0f-dx)*(1.0f-dy)*tdata[index] +
�  �  �  ( �   dx)*(1.0f-dy)*tdata[index+1] +
�  �  �  (1.0f-dx)*( �   dy)*tdata[index+x_dim] +
�  �  �  ( �   dx)*( �   dy)*tdata[index+1+x_dim];
�  �   d2 =
�  �  �  (1.0f-dx)*(1.0f-dy)*ddata[index] +
�  �  �  ( �   dx)*(1.0f-dy)*ddata[index+1] +
�  �  �  (1.0f-dx)*( �   dy)*ddata[index+x_dim] +
�  �  �  ( �   dx)*( �   dy)*ddata[index+1+x_dim];
�  �   l2 =
�  �  �  (1.0f-dx)*(1.0f-dy)*ldata[index] +
�  �  �  ( �   dx)*(1.0f-dy)*ldata[index+1] +
�  �  �  (1.0f-dx)*( �   dy)*ldata[index+x_dim] +
�  �  �  ( �   dx)*( �   dy)*ldata[index+1+x_dim];
�  �   terma = (1.0f-dz)*t1 + dz*t2;
�  �   density = (1.0f-dz)*d1 + dz*d2;
�  �   rd = (1.0f-dz)*l1 + dz*l2;
�  �  }
�  �  else
�  �  {
�  �   terma = tdata[index];
�  �   density = ddata[index];
�  �   rd = ldata[index];
�  �  }
�  �  terma = terma/pdose->terma_scale;
�  �  density = 0.01f*density; // scale back to the actual density
�  �  rd = 0.005f*rd; // scale back to the actual radiological depth
�  �  /* use the kernel of chosen depth depth for convolution */
�  �  if(pdose->no_kh > 0) rd = funcx[pdose->no_kh];
�  �  rad_dist += density*kernel_dr[k]; /* dr[k] distance increment of kth r element */
�  �  rindex = (int)(rad_dist + 0.001f); // index to uniform radiological dist. array
�  �  //add 0.001 to avoid 1.999... to be round up to 1. It is better to be 2.
�  �  if (rindex >= params.knur) break; Â
�  �  /* BEAM coordinates of PtI */ �  Â
�  �  x1 = x + bx;
�  �  y1 = y + by;
�  �  z1 = z + bz;
�  �  dot_product = (bx*x1 + by*y1 +bz*z1)/(sqrtf(bx*bx+by*by+bz*bz)*sqrtf(x1*x1+y1*y1+z1*z1));
�  �  if(dot_product > 1) dot_product = 1.0f;
�  �  else if(dot_product < -1) dot_product = -1.0f;
�  � Â
�  �  /* new kernel angle after tilting for volume element (phi, theta, r) */
�  �  phi_t = acosf(-dot_product);
�  � Â
�  �  ifp = phi_t/d_phi - 0.5f; /* float index of phi_t */
�  �  if(ifp <= 0) {
�  �   iip = 0; /* integer index of phi_t 's small bounding angle */
�  �   idp = 0.0f;
�  �  } else if (ifp >= params.knphi - 1) {
�  �   iip = params.knphi - 2;
�  �   idp = 1.0f;
�  �  } else {
�  �   iip = (int)ifp;
�  �   idp = ifp - iip;
�  �  }
�  �  for (int a = 0; a < 2; a++)
�  �  {
�  �   /* neighboring angles for interpolation */
�  �   tphi[a] = kernel_phi[iip + a];
�  �   //printf("%f ", tphi[a]);
�  �  }
�  �  float *p_udata = &udata[iip * params.knur];
�  �  d_offset = params.knphi * params.knur;
�  �  /* Determine kernel values for the bounding voxels around (phi_t, r) */
�  �  for (int d = 0; d<params.num; d++){
�  �   for (int tp = 0; tp < 2; tp++){
�  �  �  offset = d*d_offset + tp * params.knur;
�  �  �  t_kernel[d][tp][0] = p_udata[offset + last_rindex];
�  �  �  t_kernel[d][tp][1] = p_udata[offset + rindex];
�  �  �  /* kernel energy for (phi_t, r)'s neighbouring voxels,
�  �  �  normalized by the voxel's solid angle */
�  �  �  angle_en[tp] = (t_kernel[d][tp][1] - t_kernel[d][tp][0])/__sinf(tphi[tp]);
�  �  �  //printf("%d %f %f %f\n", offset, t_kernel[d][tp][0], t_kernel[d][tp][1], angle_en[tp]);
�  �   }
�  �   funcy[d] = (1-idp) * angle_en[0] + idp * angle_en[1];
�  �  }
�  �  /* interpolate for depth hardened kernel, */
�  �  energy = d_interp_func(rd,params.num,funcx,funcy,params.clamp)*__sinf (phi);
�  �  //energy = __sinf (phi)/10000.0f*phi_t*iip*idp;
�  �  /* kernel energy is specified for 360 degree ring at phi_t.
�  �  DOSE space is divided into num_theta sections for the 360 degree ring at phi. */
�  �  dval = terma*energy/params.kntheta;
�  �  val+=dval;
�  �  last_rindex = rindex; �  � Â
�   }
�  }
 }
 //printf("val: %d\n",outval[idx]);
 if(idx<params.numpoints)
�  outval[idx]=val;
}
else
{
 if(idx<params.numpoints)
�  outval[idx]=0;
}
}
__device__ float d_interp_func(float x, int num, float *X, float *Y, int clamp)
{  int  i;
  float dx;
  if (num == 0) return(0.0f);
  if (x < X[0]) {
if (clamp) return(Y[0]);
dx = (x - X[0])/(X[1] - X[0]);
return((1.0f - dx)*Y[0] + dx*Y[1]);
  }
  for (i = 1; i < num; i++) {
if (x < X[i]) {
  dx = (x - X[i-1])/(X[i] - X[i-1]);
  return((1.0f - dx)*Y[i-1] + dx*Y[i]);
}
  }
  /* x is greater than or equal to X[num-1] */
  if (clamp) return(Y[num-1]);
  i = num - 1;
  dx = (x - X[i-1])/(X[i] - X[i-1]);
  return((1.0f - dx)*Y[i-1] + dx*Y[i]);
}
Yes… its huge.