Hi!
I’m new to CUDA programming and just wanted to try something, but I’m having problems with non-coalesced memory accesses. I want to write a game that is a bit like a mix of Scrabble and Mahjong.
I’ve got a grid of chars and want to do some computation on it. I select one char, compare it with the eight surrounding chars, and then write a resulting char into a new grid.
Here’s an example:
--ooo-- --a--- x: char to compare
--oxo-- ------ o: comparison values
--ooo-- ------ a: output
That’s the kernel code I’ve got so far:
[codebox]#include "function.h"

// Stencil kernel: for each char of the source grid, inspect the 8
// surrounding chars and write one result char into the target grid.
//
// Launch layout: 2D grid of (BLOCK_SIZE_X, BLOCK_SIZE_Y) blocks; each
// thread handles CHARS_PER_THREAD consecutive chars of one row.
// Neighbouring blocks overlap by a 1-char halo on every side so that
// interior threads always see a complete 3x3 neighbourhood.
//
// source/target: pitched device allocations (cudaMallocPitch)
// spitch/tpitch: row pitch in bytes of source/target
// sh/th:         height (rows) of source/target
__global__ void function(unsigned char* source,
                         unsigned char* target,
                         size_t spitch,
                         size_t tpitch,
                         unsigned short sh,
                         unsigned short th)
{
    unsigned long sp, tp;
    // one result per char handled by this thread; sized by the same
    // constant as the loops below so it can never overflow (the old
    // fixed out[4] overflowed whenever CHARS_PER_THREAD > 4)
    unsigned char out[CHARS_PER_THREAD];
    __shared__ unsigned char sector[BLOCK_SIZE_Y][CHARS_PER_THREAD * BLOCK_SIZE_X];
    unsigned char col = threadIdx.x;
    unsigned char row = threadIdx.y;
    // Each thread owns a contiguous run of CHARS_PER_THREAD chars.
    // NOTE(review): this layout is why the profiler reports
    // non-coalesced accesses - adjacent threads of a warp read global
    // addresses CHARS_PER_THREAD bytes apart instead of adjacent bytes.
    // For coalesced traffic, let adjacent threads load adjacent bytes,
    // e.g. sector[row][i * blockDim.x + threadIdx.x] =
    //      source[y * spitch + blockStart + i * blockDim.x + threadIdx.x];
    col *= CHARS_PER_THREAD;
    // -2 because of the 1-char halo overlap between neighbouring blocks
    unsigned long x = blockIdx.x * (blockDim.x * CHARS_PER_THREAD - 2) + col;
    unsigned long y = blockIdx.y * (blockDim.y - 2) + row;
    sp = y*spitch + x;
    // Load a sector of the grid into shared memory, CHARS_PER_THREAD
    // chars per thread; skip loads that would fall outside the source
    // allocation. NOTE(review): skipped positions leave sector[][]
    // uninitialized - the compute phase may read garbage there for
    // edge blocks; confirm the border handling is intended.
    if(y < sh)
    {
#pragma unroll
        for(unsigned char i=0; i < CHARS_PER_THREAD; i++)
            if((x + i) < spitch)
                sector[row][col + i] = source[sp + i];
    }
    // Barrier reached by ALL threads of the block (the load guard above
    // does not enclose it), so no deadlock.
    __syncthreads();
    // Compute the output values. Only threads whose 3x3 neighbourhood
    // lies completely inside the sector may read it: without these
    // guards, row+1 indexes past sector[][] for the last thread row,
    // and col+1+i past the last column - an out-of-bounds shared read.
    // The guards mirror the write-back guards below, so every value
    // that gets stored has been computed.
    if(row < (BLOCK_SIZE_Y - 2))
    {
#pragma unroll
        for(unsigned char i=0; i < CHARS_PER_THREAD; i++)
        {
            out[i] = 0;
            if((col + i) < (CHARS_PER_THREAD * BLOCK_SIZE_X - 2))
            {
                unsigned char middle = sector[row+1][col+1 + i];
                // here goes my algorithm (comparisons and computations of output)
            }
        }
    }
    // Write the results to target, CHARS_PER_THREAD chars per thread;
    // halo threads and out-of-range positions are excluded.
    if(y < th && row < (BLOCK_SIZE_Y - 2))
    {
        tp = y*tpitch + x;
#pragma unroll
        for(unsigned char i=0; i < CHARS_PER_THREAD; i++)
            if((x + i) < tpitch && (col + i) < (CHARS_PER_THREAD * BLOCK_SIZE_X - 2))
                target[tp + i] = out[i];
    }
}[/codebox]
Code snippet of the allocation and the kernel invocation:
[codebox]// allocate pitched device memory so every row starts on an aligned boundary
cutilSafeCall( cudaMallocPitch( (void **)&d_so, (size_t*)&d_spitch, sw, sh ) );
cutilSafeCall( cudaMallocPitch( (void **)&d_to, (size_t*)&d_tpitch, tw, th ) );
// copy host memory to device (slf = host row pitch in bytes)
cutilSafeCall( cudaMemcpy2D(d_so, d_spitch, so, slf, sw, sh,
cudaMemcpyHostToDevice) );
// Launch configuration. Blocks overlap by the 2-char/2-row halo, so the
// effective tile is (CHARS_PER_THREAD*dimBlock.x - 2) x (dimBlock.y - 2);
// round the grid up so the whole target is covered. Use CHARS_PER_THREAD
// here - the SAME constant the kernel tiles by - so grid size and kernel
// tile size cannot silently drift apart (the old code used PX_PER_THREAD).
dimBlock = dim3(BLOCK_SIZE_X, BLOCK_SIZE_Y);
dimGrid = dim3( (tw + (CHARS_PER_THREAD*dimBlock.x-2) - 1) / (CHARS_PER_THREAD*dimBlock.x-2),
(th + (dimBlock.y-2) - 1) / (dimBlock.y-2));
function<<<dimGrid, dimBlock>>>(d_so, d_to, d_spitch, d_tpitch, sh, th);
// catch launch-configuration errors immediately ...
cutilCheckMsg("Kernel execution failed");
// ... and check execution errors when the kernel has actually finished
// (the sync's return value was previously ignored)
cutilSafeCall( cudaThreadSynchronize() );[/codebox]
If I run the program in the CUDA Visual Profiler, I get many non-coalesced loads and stores. I’ve read the Reference Manual, the Programming Guide and the Best Practices Guide,
but I still don’t get it. I’m using a 9800 GTX+, Windows XP 32-bit and CUDA 2.3. Could anybody help me?