I wrote a CUDA C program for averaging the elements in a matrix. In the program below, I’m giving a 16x16 matrix and I created a grid of 8x8 blocks. The elements of the matrix is distributed to 64 blocks and each block have 2 threads along x direction, which adds elements in y direction.

Here, first thread adds elements mat1[0,0] and mat1[1,0] and stored to the shared memory sum_thread[0]. Likewise, second thread adds elements mat1[0,1] and mat1[1,1] and stored to the shared memory sum_thread[1]. Then the sum_thread[0], sum_thread[1] of every blocks are added together to get the total sum. Then it is divided by total number of elements in host side.

One problem is with the indexing of blocks and threads. Printing value in same memory location gives two results. Check the output sum_thread[64] = 7.000000 and sum_thread_2[64] = 0.000000. The prgram is giving false value from sum_thread_2[64] onwards.

PROGRAM:

#include <stdio.h>

global void avg(float *mat_ptr, float *mat_avg, int width, int height)
{
shared float sum_block ;

``````int b_x = blockDim.x* blockIdx.x;
int b_y = blockDim.y* blockIdx.y;
int t_x = b_x + threadIdx.x;
int t_y = b_y + threadIdx.y;
int shared_mem_index   = (height/8)*t_y*width + t_x;
int shared_block_index = (height/8)*b_y*width + b_x;

if (shared_mem_index < (width*height))
for(int i = 0; i < (height/8); i++)
{
}

if(shared_block_index < (width*height))
for(int j=0; j < (width/2) ; j++)
{
}

{
printf("sum_block (%d) = %f\n",shared_block_index, sum_block);
}

{
}
``````

}

int main()
{
const int width = 16;
const int height = 16;

``````float mat1[height][width] ={{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
{3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};

float cpu_avg;

float *cuda_mat;
float *cuda_avg;

//Allocate device memory
cudaMalloc(&cuda_avg,sizeof(float));
cudaMalloc(&cuda_mat,sizeof(mat1));

// Copy result from host memory to device memory
cudaMemcpy(cuda_mat,mat1,sizeof(mat1),cudaMemcpyHostToDevice);

printf("Threads per block = %d\n", width/8);
dim3 blocksPerGrid(8,8,1);

// Invoke kernel
avg<<<blocksPerGrid, threadsPerBlock, width/8 >>>(cuda_mat, cuda_avg, width, height);

// Copy result from device memory to host memory
cudaMemcpy(&cpu_avg, cuda_avg, sizeof(float), cudaMemcpyDeviceToHost);

printf("Sum = %f\n", cpu_avg);
printf("Average = %f\n", cpu_avg/(width*height));

// Free device memory
cudaFree(cuda_mat);
cudaFree(cuda_avg);

return 0;
``````

}

OUTPUT:

sum_block (38) = 10.000000
sum_block (74) = 0.000000
sum_block (66) = 0.000000
sum_block (72) = 0.000000
sum_block (44) = 10.000000
sum_block (108) = 0.000000
sum_block (42) = 10.000000
sum_block (104) = 0.000000
sum_block (34) = 10.000000
sum_block (106) = 0.000000
sum_block (70) = 0.000000
sum_block (98) = 0.000000
sum_block (40) = 10.000000
sum_block (78) = 0.000000
sum_block (138) = 0.000000
sum_block (136) = 0.000000
sum_block (140) = 0.000000
sum_block (68) = 0.000000
sum_block (36) = 10.000000
sum_block (64) = 0.000000
sum_block (76) = 0.000000
sum_block (110) = 0.000000
sum_block (46) = 10.000000 <<= Issue: Woking till sum_block (46)
sum_block (130) = 0.000000
sum_block (100) = 0.000000
sum_block (96) = 0.000000
sum_block (142) = 0.000000
sum_block (102) = 0.000000
sum_block (168) = 0.000000
sum_block (132) = 0.000000
sum_block (32) = 10.000000
sum_block (172) = 0.000000
sum_block (134) = 0.000000
sum_block (162) = 0.000000
sum_block (128) = 0.000000
sum_block (160) = 0.000000
sum_block (170) = 0.000000
sum_block (174) = 0.000000
sum_block (166) = 0.000000
sum_block (200) = 0.000000
sum_block (164) = 0.000000
sum_block (192) = 0.000000
sum_block (6) = 6.000000
sum_block (204) = 0.000000
sum_block (202) = 0.000000
sum_block (194) = 0.000000
sum_block (198) = 0.000000
sum_block (206) = 0.000000
sum_block (196) = 0.000000
sum_block (12) = 6.000000
sum_block (4) = 6.000000
sum_block (14) = 6.000000
sum_block (232) = 0.000000
sum_block (224) = 0.000000
sum_block (226) = 0.000000
sum_block (8) = 6.000000
sum_block (234) = 0.000000
sum_block (2) = 6.000000
sum_block (236) = 0.000000
sum_block (0) = 6.000000
sum_block (10) = 6.000000
sum_block (238) = 0.000000
sum_block (228) = 0.000000
sum_block (230) = 0.000000
Sum = 0.000000
Average = 0.000000