Hi All,
I’m trying to use shared memory to speed up my raytracer by storing some elements of my grid in shared memory. Problem is it’s running slightly slower than before. I profiled it and it is because the following code is serialising the warp. But I’m too noobish to work out why, can anyone help? I’m getting about 2 million warps being serialised lol. If I take out the for loop then the warp serialisations go, so its definitely that piece of code.
[codebox]
shared float s_lbbox[24];
shared float s_bbox[24];
shared int s_ln[12];
shared int s_n[12];
shared int s_loffset[12];
global void render(int* g_odata)
{
for(int i = 0; i < numGrids; i++)
{
s_lbbox[i*6] = tex1Dfetch(gridLBBox, i*6 );
s_lbbox[i*6+1] = tex1Dfetch(gridLBBox, i*6 + 1);
s_lbbox[i*6+2] = tex1Dfetch(gridLBBox, i*6 + 2);
s_lbbox[i*6+3] = tex1Dfetch(gridLBBox, i*6 + 3);
s_lbbox[i*6+4] = tex1Dfetch(gridLBBox, i*6 + 4);
s_lbbox[i*6+5] = tex1Dfetch(gridLBBox, i*6 + 5);
s_bbox[i*6] = tex1Dfetch(gridBBox, i*6 );
s_bbox[i*6+1] = tex1Dfetch(gridBBox, i*6 + 1);
s_bbox[i*6+2] = tex1Dfetch(gridBBox, i*6 + 2);
s_bbox[i*6+3] = tex1Dfetch(gridBBox, i*6 + 3);
s_bbox[i*6+4] = tex1Dfetch(gridBBox, i*6 + 4);
s_bbox[i*6+5] = tex1Dfetch(gridBBox, i*6 + 5);
s_ln[i*3] = tex1Dfetch(gridLN, i*3 );
s_ln[i*3+1] = tex1Dfetch(gridLN, i*3 + 1);
s_ln[i*3+2] = tex1Dfetch(gridLN, i*3 + 2);
s_n[i*3] = tex1Dfetch(gridN, i*3 );
s_n[i*3+1] = tex1Dfetch(gridN, i*3 + 1);
s_n[i*3+2] = tex1Dfetch(gridN, i*3 + 2);
s_loffset[i*3] = tex1Dfetch(gridLOffset, i*3 );
s_loffset[i*3+1] = tex1Dfetch(gridLOffset, i*3 + 1);
s_loffset[i*3+2] = tex1Dfetch(gridLOffset, i*3 + 2);
}
RGBColour L;
L.r = 0; L.g = 0; L.b = 0;
Ray ray;
float2 pp;
ray.d.x = 0;
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
ray.o = camera.eye;
pp.x = x - 0.5 * vp.hres + 0.5;
pp.y = y - 0.5 * vp.vres + 0.5;
ray.d = camera.getDirection(pp);
L = traceRay(ray);
g_odata[y*vp.hres+x] = rgbToInt(L.r * 255, L.g * 255, L.b * 255);
}[/codebox]