cuda code much slower than Cg version

I rewrote a code I had orriginally done in Cg with cuda. With my Cg implementation, I was getting over 300x speedup vs CPU execution, but on my CUDA version, I am only getting about 100x speedup. 100x is still good, but I would think that the preformance should be at least close for a simular implementation. Does anyone have and idea what is going on?

CUDA version

__global__ void e_update(float3 *e, float3 *h, uint3 dim){

        int idx = blockDim.x * blockIdx.x + threadIdx.x;

        uint3 address = convertAddress_1d_to_3d(idx, dim);

        int idx_minus_1x = idx - 1;

        int idx_minus_1y = convertAddress_3d_to_1d(make_uint3(address.x, address.y - 1, address.z), dim);

        int idx_minus_1z = convertAddress_3d_to_1d(make_uint3(address.x, address.y, address.z - 1), dim);

        // these will be changed when upgrade to an anisotropic media

        float esctc = 1.0;

        float eincc = 0.0;

        float edevcn = 0.0;

        float ecrl_x = 217.51;

        float ecrl_y = 217.51;

        float ecrl_z = 217.51;

        float ei = 1.0;

        float dei = 1.0;

        //

        e[idx].x = e[idx].x * esctc - eincc * ei - edevcn * dei + \

                        ((h[idx].z - h[idx_minus_1y].z) * ecrl_y) - \

                        ((h[idx].y - h[idx_minus_1z].y) * ecrl_z);

        e[idx].y = e[idx].y * esctc - eincc * ei - edevcn * dei + \

                        ((h[idx].x - h[idx_minus_1z].x) * ecrl_z) - \

                        ((h[idx].z - h[idx_minus_1x].z) * ecrl_x);

        e[idx].z = e[idx].z * esctc - eincc * ei - edevcn * dei + \

                        ((h[idx].y - h[idx_minus_1x].y) * ecrl_x) - \

                        ((h[idx].x - h[idx_minus_1y].x) * ecrl_y);

}

Cg version

char *e_update_fp_source = \

"float3 e_update(in float2 coords:TEXCOORD0, uniform samplerRECT e_field, uniform samplerRECT h_field, uniform samplerRECT tissues,"\

"               samplerRECT ecrl_x, uniform samplerRECT ecrl_y, uniform samplerRECT ecrl_z,"\

"               uniform samplerRECT esctc, uniform samplerRECT eincc, uniform samplerRECT edevcn,"\

"               uniform float numSlices_x, uniform float numSlices_y, uniform float dim_x, uniform float dim_y):COLOR{"\

"       float tissueType = texRECT(tissues, coords);"\

"       float2 tissueIndex = float2(0.5, tissueType+0.5);"\

"       float2 coords_minus_1x = coords;"\

"       coords_minus_1x.x -= 1.0;"\

"       float2 coords_minus_1y = coords;"\

"       coords_minus_1y.y -= 1.0;"\

"       float z = floor(round(coords.x-0.5)/dim_x) + floor(round(coords.y-0.5)/dim_y)*numSlices_x;"\

"       float2 coords_minus_1z = coords;"\

"       float tmp = fmod(z,numSlices_x);"\

"       if(tmp > 0.0) coords_minus_1z.x -= dim_x;"\

"       else if(z > 0.0) coords_minus_1z += (dim_x*(numSlices_x-1.0), -dim_y);"\

"       float esctc_ = 1.0;"\

"       float eincc_ = 0.0;"\

"       float edevcn_ = 0.0;"\

"       float ecrl_x_ = 217.51;"\

"       float ecrl_y_ = 217.51;"\

"       float ecrl_z_ = 217.51;"\

"       float ei = 1;"\

"       float dei = 1;"\

"       float3 e = texRECT(e_field, coords);"\

"       float3 h = texRECT(h_field, coords);"\

"       float h1 = h.z;"\

"       float h2 = texRECT(h_field, coords_minus_1y).z;"\

"       float h3 = h.y;"\

"       float h4 = texRECT(h_field, coords_minus_1z).y;"\

"       e.x = e.x * esctc_ - eincc_ * ei - edevcn_ * dei + ((h1 - h2) * ecrl_y_) - ((h3 - h4) * ecrl_z_);"\

"       h1 = h.x;"\

"       h2 = texRECT(h_field, coords_minus_1z).x;"\

"       h3 = h.z;"\

"       h4 = texRECT(h_field, coords_minus_1x).z;"\

"       e.y = e.y * esctc_ - eincc_ * ei - edevcn_ * dei + ((h1 - h2) * ecrl_z_) - ((h3 - h4) * ecrl_x_);"\

"       h1 = h.y;"\

"       h2 = texRECT(h_field, coords_minus_1x).y;"\

"       h3 = h.x;"\

"       h4 = texRECT(h_field, coords_minus_1y).x;"\

"       e.z = e.z * esctc_ - eincc_ * ei - edevcn_ * dei + ((h1 - h2) * ecrl_x_) - ((h3 - h4) * ecrl_y_);"\

"       if(z == 1.0 && round(coords.y - 0.5) ==  1.0 && round(coords.x - 0.5) == 4.0) e.x = 9.0;"\

"       return e;"\

"}";

You are reading/writing float3, this will result in uncoalesced read/write.
You can use shared memory to achieve coalesced read/write, look around slide 15 in the optimization talk at SC07 ( [url=“http://www.gpgpu.org/sc2007/SC07_CUDA_5_Optimization_Harris.pdf”]http://www.gpgpu.org/sc2007/SC07_CUDA_5_Op...tion_Harris.pdf[/url] )

Looking at slide 19, I see how that works when we are taking g_in[index] += 2 and writing it to g_out[index]. For my code I think I might have a problem with this though because it needs to gather memory from indexes that would be out side of the index range of the current thread block. Perhaps the best I could do is to do coalesced reads and writes for everything that matches read-write index to index, and for the three gather reads (that I don’t know how to coaless), read them uncoalesced with a penalty hit. I think that should help.

Reading even a single float per block uncoalesced will hurt performance quite a bit. You would be better off using a texture to read something you can’t coalesce.