Hi,
I recently split my raytracing kernel into two separate kernels, and I am using an intermediate array stored in global memory as the source data for the second kernel, which does the shading. I’ve simplified the code below:
[codebox]
/*
 * Shading pass: one thread per pixel on a 2D grid.
 *
 * Reads the pixel's record from device_shadeRecs and writes a packed colour
 * (via rgbToInt) into g_odata. In this simplified version the actual shading
 * logic is commented out, so every pixel is written as black (L = 0,0,0).
 *
 * g_odata          - output buffer, indexed as y * vp.hres + x.
 * device_shadeRecs - per-pixel records written by the render kernel, same
 *                    indexing.
 *
 * NOTE(review): assumes both buffers hold at least vp.hres * vp.vres
 * elements - confirm against the allocation and the mapped PBO size.
 */
__global__ void shade(int* g_odata, ShadeRec* device_shadeRecs)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that fall outside the viewport must not touch either buffer;
    // this keeps the kernel safe when the grid is rounded up to whole blocks.
    if (x >= vp.hres || y >= vp.vres)
        return;

    const int pixel = y * vp.hres + x;

    RGBColour backgroundColour, L;
    backgroundColour.r = 0.2f; backgroundColour.g = 0.2f; backgroundColour.b = 0.2f;
    L.r = 0.0f; L.g = 0.0f; L.b = 0.0f;

    // Bind by reference rather than copying the whole struct, so only the
    // fields that are actually read need to be fetched from global memory.
    // NOTE(review): if the commented-out shade(sr, ...) helper takes a
    // non-const ShadeRec&, this binding needs adjusting when it is re-enabled.
    const ShadeRec& sr = device_shadeRecs[pixel];
    (void)sr;  // currently unused: the shading logic below is disabled

    //if (sr.hitAnObject)
    //{
    // if(sr.material.y == 1)
    // L.r = 1;
    // //else
    // // L = max_to_one(shade(sr, backgroundColour));
    //}
    //else
    //{
    // L = backgroundColour;
    //}

    g_odata[pixel] = rgbToInt(L.r * 255, L.g * 255, L.b * 255);
}
/*
 * Primary-ray pass: one thread per pixel on a 2D grid.
 *
 * Builds the ray for pixel (x, y) from camera.eye and
 * camera.getDirection(pp), stores it in a ShadeRec, and writes that record
 * to device_shadeRecs[y * vp.hres + x] for the shade kernel to consume.
 *
 * device_shadeRecs - output buffer of per-pixel records.
 *
 * NOTE(review): assumes the buffer holds at least vp.hres * vp.vres
 * elements - confirm against createShadeRecs' caller.
 * NOTE(review): with hitObjects() disabled, only sr.ray is initialised, yet
 * the whole ShadeRec is written to global memory per pixel; that full-struct
 * store is the bulk of this kernel's memory traffic.
 */
__global__ void render(ShadeRec* device_shadeRecs)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the viewport must not write; keeps the kernel safe
    // when the grid is rounded up to whole blocks.
    if (x >= vp.hres || y >= vp.vres)
        return;

    Ray ray;
    ray.o = camera.eye;

    // Pixel centre relative to the middle of the view plane. Float literals
    // keep the arithmetic in single precision (a bare 0.5 is a double).
    float2 pp;
    pp.x = x - 0.5f * vp.hres + 0.5f;
    pp.y = y - 0.5f * vp.vres + 0.5f;
    ray.d = camera.getDirection(pp);

    ShadeRec sr;// = hitObjects(ray);
    sr.ray = ray;
    device_shadeRecs[y * vp.hres + x] = sr;
}
// --ptxas-options=-v
/*
 * Allocates the device buffer of per-pixel ShadeRec records and stores the
 * pointer in the file-level device_shadeRecs.
 *
 * width, height - image dimensions in pixels; the buffer holds
 *                 width * height records.
 *
 * NOTE(review): any previous allocation held in device_shadeRecs is not
 * freed here - confirm this is called only once, or that the caller frees it.
 */
void createShadeRecs(int width, int height)
{
    // Do the size arithmetic in size_t so width * height cannot overflow int
    // before being widened for the byte count.
    const size_t pixelCount = static_cast<size_t>(width) * static_cast<size_t>(height);
    CUDA_SAFE_CALL(cudaMalloc((void**)&device_shadeRecs, sizeof(ShadeRec) * pixelCount));
}
/*
 * Renders one frame into the OpenGL pixel buffer object pbo_out.
 *
 * Maps pbo_out for CUDA access, launches render followed by shade on a grid
 * of 16x16 blocks, waits for completion and unmaps the buffer.
 *
 * pbo_in        - not used by this function.
 * pbo_out       - GL buffer object that receives the packed pixel colours.
 * width, height - image dimensions in pixels.
 *
 * NOTE(review): the grid uses truncating division, so when width or height
 * is not a multiple of 16 the rightmost/bottom pixels are not processed.
 * Rounding the grid up is only safe once both kernels bounds-check their
 * pixel coordinates.
 */
void setupRaytracer(int pbo_in, int pbo_out, int width, int height)
{
    int* out_data;
    dim3 block(16, 16, 1);
    dim3 grid(width / block.x, height / block.y, 1);

    CUDA_SAFE_CALL(cudaGLMapBufferObject( (void**)&out_data, pbo_out));

    // An image smaller than one block yields a zero-sized grid, which is an
    // invalid launch configuration; skip the launches in that case.
    if (grid.x > 0 && grid.y > 0)
    {
        // Both launches use the default stream, so shade starts only after
        // render has finished writing device_shadeRecs.
        render<<< grid, block>>>(device_shadeRecs);
        CUDA_SAFE_CALL(cudaGetLastError());  // launches do not report errors themselves

        shade<<< grid, block>>>(out_data, device_shadeRecs);
        CUDA_SAFE_CALL(cudaGetLastError());
    }

    // Errors raised while the kernels execute surface at this synchronisation.
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    CUDA_SAFE_CALL(cudaGLUnmapBufferObject(pbo_out));
}[/codebox]
I’m only getting about 80 fps with nothing being rendered! With my previous single kernel, also with nothing being rendered, I was getting in excess of 300 fps. Can anyone advise me what the problem is? Should I go back to using one kernel, or is there a way around the problem above? Profiler output:
read write overall instruction throughput
render 299 1.76252e+06 79.01 0 53.5976 53.5976 0.0332971
shade 299 468194 20.98 0 1.28264 1.28264 0.0187248