Hi, I have a host 2D data with 100 * 100 dimension. I would like to tile this data into 5x5 sub 2D blocks and do the following on each block:

- sum all the values of that block. ( Sum of values of 5*5=25 cell )
- Set the all cell values to 0 in that 5x5 block
- Put the summed value in the center of that block

Here is my code but I dont have concrete idea to fill the kernal code with the optimal running functions ( i.e. sure pSubBlock[ i * j ] is not correct and what should be written. Also is there a better single function to sum up all the block values instead of two loops ):

**global** void Sum_Kernel(unsigned char* pSubBlock)

{

// find the sum of values in 5x5 block

int p_Sum = 0;

for (int i = 0; i < 5; i++)

for (int j = 0; j < 5; j++)

p_Sum += pSubBlock[ i * j ];

// Set 0 to the rest of the block cells

for (int i = 0; i < 5; i++)

for (int j = 0; j < 5; j++)

pSubBlock[ i * j ] = 0;

// Put the sum in the center of the block ( for 5x5 block, 2,2 is the center )

pSubBlock[ 2 * 2 ] = p_Sum;

}

CUT_DEVICE_INIT(0, 0);

unsigned char* p_HostMem = new unsigned char[ 100 * 100 ];

unsigned char* p_DevMem;

CUDA_SAFE_CALL( cudaMalloc((void**)&p_DevMem, 100 * 100 * sizeof(unsigned char)) );

// … here fill the p_HostMem with random 0s and 1s

CUDA_SAFE_CALL(cudaMemcpy(p_DevMem, p_HostMem, 100 * 100 * sizeof(unsigned char), cudaMemcpyHostToDevice));

Sum_Kernel<<<5, 5>>>( p_DevMem );

CUDA_SAFE_CALL(cudaMemcpy(p_HostMem, p_DevMem, 100 * 100 * sizeof(unsigned char), cudaMemcpyDeviceToHost));

CUDA_SAFE_CALL( cudaFree(p_DevMem) );

// … use p_HostMem

delete p_HostMem;

CUT_EXIT(0, 0);