Very slow memory access / bottleneck? Using two kernels


I recently split my raytracing kernel into two separate ones and am using an intermediary array stored in global memory as the source data for the second kernel to shade. I’ve simplified the code below:


// Shading pass: one thread per pixel. Reads that pixel's ShadeRec from the
// intermediate buffer and writes one packed colour (via rgbToInt) to g_odata.
//
// g_odata           output buffer, indexed as y * vp.hres + x
// device_shadeRecs  per-pixel records, indexed the same way
//
// Expects a 2D launch. `vp` is declared elsewhere in the file; it is used
// here only for its hres / vres members (presumably the viewport
// resolution -- confirm against its declaration).
//
// NOTE(review): each thread copies a whole ShadeRec out of an array of
// structs, so neighbouring threads read addresses sizeof(ShadeRec) apart.
// That access pattern is likely to coalesce poorly; if the profiler shows
// this load dominating, consider reading only the fields that are needed
// or storing the records as a structure of arrays.
__global__ void shade(int* g_odata, ShadeRec* device_shadeRecs)
{
    RGBColour backgroundColour, L;

    backgroundColour.r = 0.2f; backgroundColour.g = 0.2f; backgroundColour.b = 0.2f;

    L.r = 0.0f; L.g = 0.0f; L.b = 0.0f;

    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;

    // Threads that fall outside the image must not touch either buffer.
    if (x >= vp.hres || y >= vp.vres)
        return;

    // Row-major pixel index, shared by the read and the write below.
    const int pixel = y*vp.hres + x;

    ShadeRec sr = device_shadeRecs[pixel];

    // Shading is currently disabled; every pixel is written as black (L).
    //if (sr.hitAnObject)
    //{
    //	if(sr.material.y == 1)
    //		L.r = 1;
    //	//else
    //	//	L = max_to_one(shade(sr, backgroundColour));
    //}
    //else
    //	L = backgroundColour;

    g_odata[pixel] = rgbToInt(L.r * 255, L.g * 255, L.b * 255);
}


// Ray-generation pass: one thread per pixel. Builds the primary ray for the
// pixel and stores it in that pixel's ShadeRec in the intermediate buffer.
//
// device_shadeRecs  per-pixel records, indexed as y * vp.hres + x
//
// Expects a 2D launch. `vp` and `camera` are declared elsewhere in the file.
//
// NOTE(review): only sr.ray is assigned before the whole ShadeRec is written
// out (the hitObjects call is commented out), so the other fields are stored
// uninitialised. Each thread also writes a full ShadeRec at a stride of
// sizeof(ShadeRec), which is likely to coalesce poorly.
__global__ void render(ShadeRec* device_shadeRecs)
{
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;

    // Threads that fall outside the image must not write to the buffer.
    if (x >= vp.hres || y >= vp.vres)
        return;

    Ray ray;
    ray.o = camera.eye;

    // Offset of the pixel centre from the centre of the image. Float
    // literals keep the arithmetic in single precision.
    float2 pp;
    pp.x = x - 0.5f * vp.hres + 0.5f;
    pp.y = y - 0.5f * vp.vres + 0.5f;

    ray.d = camera.getDirection(pp);

    ShadeRec sr;// = hitObjects(ray);
    sr.ray = ray;

    device_shadeRecs[y*vp.hres + x] = sr;
}



// Allocates the device buffer that holds one ShadeRec per pixel and stores
// the pointer in device_shadeRecs (declared elsewhere in the file).
//
// width, height  image dimensions in pixels
//
// NOTE(review): any buffer already held in device_shadeRecs is not freed
// here; if this can be called more than once (e.g. on resize), confirm the
// previous allocation is released by the caller.
void createShadeRecs(int width, int height)
{
    // Multiply in size_t so a large image cannot overflow int before the
    // byte count is formed.
    const size_t pixelCount = static_cast<size_t>(width) * static_cast<size_t>(height);

    CUDA_SAFE_CALL(cudaMalloc( (void**)&device_shadeRecs, sizeof(ShadeRec) * pixelCount));
}


// Runs one frame: maps the output pixel buffer object, launches the
// ray-generation kernel (render) and then the shading kernel (shade) over
// the same 16x16-thread grid.
//
// pbo_in         not used by this function
// pbo_out        GL buffer object id mapped as the shade kernel's output
// width, height  image dimensions in pixels
//
// NOTE(review): the mapped buffer is not unmapped in this function. Confirm
// that cudaGLUnmapBufferObject(pbo_out) is called elsewhere before OpenGL
// uses the buffer and before the next map.
void setupRaytracer(int pbo_in, int pbo_out, int width, int height)
{
    int* out_data = NULL;

    dim3 block(16, 16, 1);

    // Integer division: pixels beyond the last full 16x16 tile are not
    // covered, and a dimension smaller than 16 yields an empty grid.
    // Rounding up instead would require both kernels to bounds-check.
    dim3 grid(width / block.x, height / block.y, 1);

    CUDA_SAFE_CALL(cudaGLMapBufferObject( (void**)&out_data, pbo_out));

    render<<< grid, block>>>(device_shadeRecs);
    // A launch does not return an error itself; query it explicitly.
    CUDA_SAFE_CALL(cudaGetLastError());

    shade<<<  grid, block>>>(out_data, device_shadeRecs);
    CUDA_SAFE_CALL(cudaGetLastError());
}




I’m only getting about 80 fps with nothing being rendered! With my previous single kernel, also with nothing being rendered, I was getting in excess of 300 fps. Can anyone advise me what the problem is? Should I go back to using one kernel, or is there a way around the problem above? Profiler output:

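| Method | #Calls | GPU time (µs) | % GPU time | Read throughput | Write throughput | Overall throughput | Instruction throughput |
|--------|--------|---------------|------------|-----------------|------------------|--------------------|------------------------|
| render | 299 | 1.76252e+06 | 79.01 | 0 | 53.5976 | 53.5976 | 0.0332971 |
| shade | 299 | 468194 | 20.98 | 0 | 1.28264 | 1.28264 | 0.0187248 |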

Anyone? I’m getting very inconsistent memory throughput: just the other day memory was being read at 67 GB/s, and now it is down to 1 GB/s, and I’m sure I’ve made no changes whatsoever. I had this problem the other day and spent all day trying to figure out what was causing it. I gave up, and the next morning when I ran the same executable it was running at 30 fps, as opposed to the 15 fps of the previous day! Has anyone got any ideas as to what the hell is going on? I tried restarting the PC just now, but it had no effect. I am absolutely sure there have been no changes to my code either.