I rewrote a code I had orriginally done in Cg with cuda. With my Cg implementation, I was getting over 300x speedup vs CPU execution, but on my CUDA version, I am only getting about 100x speedup. 100x is still good, but I would think that the preformance should be at least close for a simular implementation. Does anyone have and idea what is going on?
CUDA version
__global__ void e_update(float3 *e, float3 *h, uint3 dim){
int idx = blockDim.x * blockIdx.x + threadIdx.x;
uint3 address = convertAddress_1d_to_3d(idx, dim);
int idx_minus_1x = idx - 1;
int idx_minus_1y = convertAddress_3d_to_1d(make_uint3(address.x, address.y - 1, address.z), dim);
int idx_minus_1z = convertAddress_3d_to_1d(make_uint3(address.x, address.y, address.z - 1), dim);
// these will be changed when upgrade to an anisotropic media
float esctc = 1.0;
float eincc = 0.0;
float edevcn = 0.0;
float ecrl_x = 217.51;
float ecrl_y = 217.51;
float ecrl_z = 217.51;
float ei = 1.0;
float dei = 1.0;
//
e[idx].x = e[idx].x * esctc - eincc * ei - edevcn * dei + \
((h[idx].z - h[idx_minus_1y].z) * ecrl_y) - \
((h[idx].y - h[idx_minus_1z].y) * ecrl_z);
e[idx].y = e[idx].y * esctc - eincc * ei - edevcn * dei + \
((h[idx].x - h[idx_minus_1z].x) * ecrl_z) - \
((h[idx].z - h[idx_minus_1x].z) * ecrl_x);
e[idx].z = e[idx].z * esctc - eincc * ei - edevcn * dei + \
((h[idx].y - h[idx_minus_1x].y) * ecrl_x) - \
((h[idx].x - h[idx_minus_1y].x) * ecrl_y);
}
Cg version
char *e_update_fp_source = \
"float3 e_update(in float2 coords:TEXCOORD0, uniform samplerRECT e_field, uniform samplerRECT h_field, uniform samplerRECT tissues,"\
" samplerRECT ecrl_x, uniform samplerRECT ecrl_y, uniform samplerRECT ecrl_z,"\
" uniform samplerRECT esctc, uniform samplerRECT eincc, uniform samplerRECT edevcn,"\
" uniform float numSlices_x, uniform float numSlices_y, uniform float dim_x, uniform float dim_y):COLOR{"\
" float tissueType = texRECT(tissues, coords);"\
" float2 tissueIndex = float2(0.5, tissueType+0.5);"\
" float2 coords_minus_1x = coords;"\
" coords_minus_1x.x -= 1.0;"\
" float2 coords_minus_1y = coords;"\
" coords_minus_1y.y -= 1.0;"\
" float z = floor(round(coords.x-0.5)/dim_x) + floor(round(coords.y-0.5)/dim_y)*numSlices_x;"\
" float2 coords_minus_1z = coords;"\
" float tmp = fmod(z,numSlices_x);"\
" if(tmp > 0.0) coords_minus_1z.x -= dim_x;"\
" else if(z > 0.0) coords_minus_1z += (dim_x*(numSlices_x-1.0), -dim_y);"\
" float esctc_ = 1.0;"\
" float eincc_ = 0.0;"\
" float edevcn_ = 0.0;"\
" float ecrl_x_ = 217.51;"\
" float ecrl_y_ = 217.51;"\
" float ecrl_z_ = 217.51;"\
" float ei = 1;"\
" float dei = 1;"\
" float3 e = texRECT(e_field, coords);"\
" float3 h = texRECT(h_field, coords);"\
" float h1 = h.z;"\
" float h2 = texRECT(h_field, coords_minus_1y).z;"\
" float h3 = h.y;"\
" float h4 = texRECT(h_field, coords_minus_1z).y;"\
" e.x = e.x * esctc_ - eincc_ * ei - edevcn_ * dei + ((h1 - h2) * ecrl_y_) - ((h3 - h4) * ecrl_z_);"\
" h1 = h.x;"\
" h2 = texRECT(h_field, coords_minus_1z).x;"\
" h3 = h.z;"\
" h4 = texRECT(h_field, coords_minus_1x).z;"\
" e.y = e.y * esctc_ - eincc_ * ei - edevcn_ * dei + ((h1 - h2) * ecrl_z_) - ((h3 - h4) * ecrl_x_);"\
" h1 = h.y;"\
" h2 = texRECT(h_field, coords_minus_1x).y;"\
" h3 = h.x;"\
" h4 = texRECT(h_field, coords_minus_1y).x;"\
" e.z = e.z * esctc_ - eincc_ * ei - edevcn_ * dei + ((h1 - h2) * ecrl_x_) - ((h3 - h4) * ecrl_y_);"\
" if(z == 1.0 && round(coords.y - 0.5) == 1.0 && round(coords.x - 0.5) == 4.0) e.x = 9.0;"\
" return e;"\
"}";