Hi, I have a host 2D data with 100 * 100 dimension. I would like to tile this data into 5x5 sub 2D blocks and do the following on each block:
- sum all the values of that block. ( Sum of values of 5*5=25 cell )
- Set the all cell values to 0 in that 5x5 block
- Put the summed value in the center of that block
Here is my code but I dont have concrete idea to fill the kernal code with the optimal running functions ( i.e. sure pSubBlock[ i * j ] is not correct and what should be written. Also is there a better single function to sum up all the block values instead of two loops ):
global void Sum_Kernel(unsigned char* pSubBlock)
{
// find the sum of values in 5x5 block
int p_Sum = 0;
for (int i = 0; i < 5; i++)
for (int j = 0; j < 5; j++)
p_Sum += pSubBlock[ i * j ];
// Set 0 to the rest of the block cells
for (int i = 0; i < 5; i++)
for (int j = 0; j < 5; j++)
pSubBlock[ i * j ] = 0;
// Put the sum in the center of the block ( for 5x5 block, 2,2 is the center )
pSubBlock[ 2 * 2 ] = p_Sum;
}
CUT_DEVICE_INIT(0, 0);
unsigned char* p_HostMem = new unsigned char[ 100 * 100 ];
unsigned char* p_DevMem;
CUDA_SAFE_CALL( cudaMalloc((void**)&p_DevMem, 100 * 100 * sizeof(unsigned char)) );
// … here fill the p_HostMem with random 0s and 1s
CUDA_SAFE_CALL(cudaMemcpy(p_DevMem, p_HostMem, 100 * 100 * sizeof(unsigned char), cudaMemcpyHostToDevice));
Sum_Kernel<<<5, 5>>>( p_DevMem );
CUDA_SAFE_CALL(cudaMemcpy(p_HostMem, p_DevMem, 100 * 100 * sizeof(unsigned char), cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL( cudaFree(p_DevMem) );
// … use p_HostMem
delete p_HostMem;
CUT_EXIT(0, 0);