Hi, I want to coalesce my memory accesses. The scenario is the following: I have a big array of unsigned chars; the kernel has to read 3 consecutive values from that array, compute something, and write 3 consecutive floats into an output array.
My original approach was:
[codebox]// Converts a packed-BGR byte image to packed-YUV floats, one pixel per thread.
// NOTE(review): each thread touches 3 *consecutive* bytes/floats, so a warp's
// accesses are strided by 3 — this is NOT coalesced global-memory access.
// `size` is the total element count of `in` (and of `out`).
__global__ void kernel(uchar *in, float *out, long int size) {
    // Widen before multiplying: blockIdx.x * blockDim.x can overflow 32 bits
    // for very large images.
    long int idx = ((long int)blockIdx.x * blockDim.x + threadIdx.x) * 3;
    // The highest index read/written is idx + 2, so the correct guard is
    // idx + 2 < size.  (The original `idx + 3 < size` skipped the last pixel.)
    if (idx + 2 < size) {
        float r, g, b;
        float y, u, v;
        b = (float)in[idx + 0];
        g = (float)in[idx + 1];
        r = (float)in[idx + 2];
        // BT.601 RGB -> YUV (replaces the <calculations_using_bgr> placeholder).
        y = 0.299f * r + 0.587f * g + 0.114f * b;
        u = 0.492f * (b - y);
        v = 0.877f * (r - y);
        out[idx + 0] = y;
        out[idx + 1] = u;
        out[idx + 2] = v;
    }
}[/codebox]
and in the host code:
[codebox]
// memSize size is about 36,000,000
// NOTE(review): none of these return codes are checked — wrap each call in a
// CUDA_CHECK-style macro; an early failure here makes every later call fail.
cudaMalloc((void **) &d_in_uchar, memSize * sizeof(uchar)); //device memory - in data
cudaMemcpy(d_in_uchar, img->uchar_data, memSize * sizeof(uchar), cudaMemcpyHostToDevice); //copy image to the device
cudaMalloc((void **) &d_out_float, memSize * sizeof(float)); //output memory - out data
// NOTE(review): no cudaGetLastError() after the launch and no cudaFree()
// visible — presumably cleanup happens later; confirm there is no leak.
kernel<<< dimGrid, dimBlock >>>( d_in_uchar , d_out_float, memSize );
[/codebox]
It had pretty good performance — about 54 ms for the memSize indicated above.
Then i read about coalescing memory access. In CUDA Technical Vol I training there is an example with float3 access coalescing, almost identical to my case.
And so i did following:
[codebox]
// #BLOCK_VOL is 16^2 = 256
// Staged version: coalesced global loads/stores through shared memory.
// Each block stages BLOCK_VOL*3 consecutive bytes with unit-stride (coalesced)
// accesses, converts pixel-by-pixel in shared memory, then writes back the
// same way.  Requires blockDim.x == BLOCK_VOL.
__global__ void kernel(uchar *in, float *out, long int size) {
    // Byte offset of this block's tile (widened to avoid 32-bit overflow).
    long int block_offset = (long int)blockIdx.x * blockDim.x * 3;
    __shared__ float s_mem[BLOCK_VOL * 3];

    // Stage in: three unit-stride passes; guard each element individually so
    // the partial last block never reads out of bounds.  (The original only
    // guarded threadIdx.x*3, so the k*BLOCK_VOL loads could run past `size`.)
    for (int k = 0; k < 3; ++k) {
        long int g = block_offset + threadIdx.x + (long int)k * BLOCK_VOL;
        if (g < size)
            s_mem[threadIdx.x + k * BLOCK_VOL] = (float)in[g];
    }
    // Barrier OUTSIDE any divergent branch — a __syncthreads() inside the
    // original `if` is undefined behavior when the tail block diverges.
    __syncthreads();

    // Convert one complete pixel per thread, in place in shared memory.
    long int pix = block_offset + (long int)threadIdx.x * 3;
    if (pix + 2 < size) {
        float b = s_mem[threadIdx.x * 3 + 0];
        float g = s_mem[threadIdx.x * 3 + 1];
        float r = s_mem[threadIdx.x * 3 + 2];
        // BT.601 RGB -> YUV (replaces the <calculations> placeholder).
        s_mem[threadIdx.x * 3 + 0] = 0.299f * r + 0.587f * g + 0.114f * b; // y
        s_mem[threadIdx.x * 3 + 1] = 0.492f * (b - s_mem[threadIdx.x * 3 + 0]); // u
        s_mem[threadIdx.x * 3 + 2] = 0.877f * (r - s_mem[threadIdx.x * 3 + 0]); // v
    }
    __syncthreads();

    // Stage out: same coalesced pattern, same per-element bounds guard.
    // (Tail bytes beyond the last complete pixel write back the staged input
    // values unconverted — same elements the original left untouched.)
    for (int k = 0; k < 3; ++k) {
        long int g = block_offset + threadIdx.x + (long int)k * BLOCK_VOL;
        if (g < size)
            out[g] = s_mem[threadIdx.x + k * BLOCK_VOL];
    }
}[/codebox]
The execution in host code is the same. But the code runs the SAME (or a little bit slower), time-wise, as before. The kernel execution is actually slower :(, but somehow the malloc+memcpy takes less time :P.
What the hell am i doing wrong?
Or is the access in the first case already coalesced? But that's impossible, right?
Help!