Issue with addition of shared memory and thread indexing

I wrote a CUDA C program for averaging the elements in a matrix. In the program below, I’m giving a 16x16 matrix and I created a grid of 8x8 blocks. The elements of the matrix is distributed to 64 blocks and each block have 2 threads along x direction, which adds elements in y direction.

Here, first thread adds elements mat1[0,0] and mat1[1,0] and stored to the shared memory sum_thread[0]. Likewise, second thread adds elements mat1[0,1] and mat1[1,1] and stored to the shared memory sum_thread[1]. Then the sum_thread[0], sum_thread[1] of every blocks are added together to get the total sum. Then it is divided by total number of elements in host side.

One problem is with the indexing of blocks and threads. Printing value in same memory location gives two results. Check the output sum_thread[64] = 7.000000 and sum_thread_2[64] = 0.000000. The prgram is giving false value from sum_thread_2[64] onwards.

Second problem is with addition of these shared memories, ie sum_block[0] = sum_thread[0] + sum_thread[1]

Please help me to fix this issue? If I’m wrong at any concepts, please correct me too.

PROGRAM:

#include <stdio.h>

global void avg(float *mat_ptr, float *mat_avg, int width, int height)
{
extern shared float sum_thread;
shared float sum_block ;

int b_x = blockDim.x* blockIdx.x;
int b_y = blockDim.y* blockIdx.y;
int t_x = b_x + threadIdx.x;
int t_y = b_y + threadIdx.y;
int shared_mem_index   = (height/8)*t_y*width + t_x;
int shared_block_index = (height/8)*b_y*width + b_x;


if (shared_mem_index < (width*height))
    for(int i = 0; i < (height/8); i++)
    {   
        sum_thread[shared_mem_index] += mat_ptr[shared_mem_index + i*width];  
    }    

printf("sum_thread[%d] = %f\n",shared_mem_index , sum_thread[shared_mem_index]);

printf("sum_thread_2[%d] = %f\n",shared_block_index , sum_thread[shared_block_index]);


if(shared_block_index < (width*height))
    for(int j=0; j < (width/2) ; j++)
    {
        sum_block += sum_thread[shared_block_index + j];
    }

if(threadIdx.x ==0 && threadIdx.y ==0)
{
    printf("sum_block (%d) = %f\n",shared_block_index, sum_block);
}

if(threadIdx.x ==0 && threadIdx.y ==0)
{
    atomicAdd(mat_avg, sum_block);
}

}

int main()
{
const int width = 16;
const int height = 16;

float mat1[height][width] ={{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
                            {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
                            {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
                            {4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4},
                            {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
                            {6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6},
                            {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
                            {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
                            {9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9},
                            {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
                            {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
                            {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
                            {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3},
                            {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
                            {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5},
                            {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};

float cpu_avg;  

float *cuda_mat;
float *cuda_avg;

//Allocate device memory
cudaMalloc(&cuda_avg,sizeof(float));
cudaMalloc(&cuda_mat,sizeof(mat1));

// Copy result from host memory to device memory
cudaMemcpy(cuda_mat,mat1,sizeof(mat1),cudaMemcpyHostToDevice);

dim3 threadsPerBlock(width/8,1,1);
printf("Threads per block = %d\n", width/8);
dim3 blocksPerGrid(8,8,1);

// Invoke kernel
avg<<<blocksPerGrid, threadsPerBlock, width/8 >>>(cuda_mat, cuda_avg, width, height);

// Copy result from device memory to host memory
cudaMemcpy(&cpu_avg, cuda_avg, sizeof(float), cudaMemcpyDeviceToHost);

printf("Sum = %f\n", cpu_avg);
printf("Average = %f\n", cpu_avg/(width*height));

// Free device memory
cudaFree(cuda_mat);
cudaFree(cuda_avg);


return 0;

}

OUTPUT:

Threads per block = 2
sum_thread[2] = 3.000000
sum_thread[3] = 3.000000
sum_thread[42] = 5.000000
sum_thread[43] = 5.000000
sum_thread[104] = 2.000000
sum_thread[105] = 2.000000
sum_thread[74] = 7.000000
sum_thread[75] = 7.000000
sum_thread[38] = 5.000000
sum_thread[39] = 5.000000
sum_thread[8] = 3.000000
sum_thread[9] = 3.000000
sum_thread[40] = 5.000000
sum_thread[41] = 5.000000
sum_thread[6] = 3.000000
sum_thread[7] = 3.000000
sum_thread[10] = 3.000000
sum_thread[11] = 3.000000
sum_thread[34] = 5.000000
sum_thread[35] = 5.000000
sum_thread[70] = 7.000000
sum_thread[71] = 7.000000
sum_thread[14] = 3.000000
sum_thread[15] = 3.000000
sum_thread[66] = 7.000000
sum_thread[67] = 7.000000
sum_thread[106] = 2.000000
sum_thread[107] = 2.000000
sum_thread[136] = 10.000000
sum_thread[137] = 10.000000
sum_thread[12] = 3.000000
sum_thread[13] = 3.000000
sum_thread[138] = 10.000000
sum_thread[139] = 10.000000
sum_thread[98] = 2.000000
sum_thread[99] = 2.000000
sum_thread[36] = 5.000000
sum_thread[37] = 5.000000
sum_thread[72] = 7.000000
sum_thread[73] = 7.000000
sum_thread[0] = 3.000000
sum_thread[1] = 3.000000
sum_thread[110] = 2.000000
sum_thread[111] = 2.000000
sum_thread[4] = 3.000000
sum_thread[5] = 3.000000
sum_thread[168] = 3.000000
sum_thread[169] = 3.000000
sum_thread[78] = 7.000000
sum_thread[79] = 7.000000
sum_thread[44] = 5.000000
sum_thread[45] = 5.000000
sum_thread[64] = 7.000000 <<= Issue: sum_thread[64] and sum_thread_2[64] value differs
sum_thread[65] = 7.000000
sum_thread[46] = 5.000000
sum_thread[47] = 5.000000
sum_thread[32] = 5.000000
sum_thread[33] = 5.000000
sum_thread[108] = 2.000000
sum_thread[109] = 2.000000
sum_thread[132] = 10.000000
sum_thread[133] = 10.000000
sum_thread[100] = 2.000000
sum_thread[101] = 2.000000
sum_thread[96] = 2.000000
sum_thread[97] = 2.000000
sum_thread[140] = 10.000000
sum_thread[141] = 10.000000
sum_thread[68] = 7.000000
sum_thread[69] = 7.000000
sum_thread[170] = 3.000000
sum_thread[171] = 3.000000
sum_thread[76] = 7.000000
sum_thread[77] = 7.000000
sum_thread[160] = 3.000000
sum_thread[161] = 3.000000
sum_thread[130] = 10.000000
sum_thread[131] = 10.000000
sum_thread[174] = 3.000000
sum_thread[175] = 3.000000
sum_thread[134] = 10.000000
sum_thread[135] = 10.000000
sum_thread[142] = 10.000000
sum_thread[143] = 10.000000
sum_thread[166] = 3.000000
sum_thread[167] = 3.000000
sum_thread[162] = 3.000000
sum_thread[163] = 3.000000
sum_thread[102] = 2.000000
sum_thread[103] = 2.000000
sum_thread[128] = 10.000000
sum_thread[129] = 10.000000
sum_thread[200] = 4.000000
sum_thread[201] = 4.000000
sum_thread[172] = 3.000000
sum_thread[173] = 3.000000
sum_thread[164] = 3.000000
sum_thread[165] = 3.000000
sum_thread[204] = 4.000000
sum_thread[205] = 4.000000
sum_thread[198] = 4.000000
sum_thread[199] = 4.000000
sum_thread[206] = 4.000000
sum_thread[207] = 4.000000
sum_thread[202] = 4.000000
sum_thread[203] = 4.000000
sum_thread[192] = 4.000000
sum_thread[193] = 4.000000
sum_thread[196] = 4.000000
sum_thread[197] = 4.000000
sum_thread[230] = 6.000000
sum_thread[231] = 6.000000
sum_thread[236] = 6.000000
sum_thread[237] = 6.000000
sum_thread[194] = 4.000000
sum_thread[195] = 4.000000
sum_thread[232] = 6.000000
sum_thread[233] = 6.000000
sum_thread[234] = 6.000000
sum_thread[235] = 6.000000
sum_thread[228] = 6.000000
sum_thread[229] = 6.000000
sum_thread[238] = 6.000000
sum_thread[239] = 6.000000
sum_thread[226] = 6.000000
sum_thread[227] = 6.000000
sum_thread[224] = 6.000000
sum_thread[225] = 6.000000
sum_thread_2[6] = 3.000000
sum_thread_2[6] = 3.000000
sum_thread_2[34] = 5.000000
sum_thread_2[34] = 5.000000
sum_thread_2[42] = 5.000000
sum_thread_2[42] = 5.000000
sum_thread_2[38] = 5.000000
sum_thread_2[38] = 5.000000
sum_thread_2[66] = 0.000000
sum_thread_2[66] = 0.000000
sum_thread_2[40] = 5.000000
sum_thread_2[40] = 5.000000
sum_thread_2[74] = 0.000000
sum_thread_2[74] = 0.000000
sum_thread_2[104] = 0.000000
sum_thread_2[104] = 0.000000
sum_thread_2[70] = 0.000000
sum_thread_2[70] = 0.000000
sum_thread_2[12] = 3.000000
sum_thread_2[12] = 3.000000
sum_thread_2[14] = 3.000000
sum_thread_2[14] = 3.000000
sum_thread_2[98] = 0.000000
sum_thread_2[98] = 0.000000
sum_thread_2[72] = 0.000000
sum_thread_2[72] = 0.000000
sum_thread_2[44] = 5.000000
sum_thread_2[44] = 5.000000
sum_thread_2[78] = 0.000000
sum_thread_2[78] = 0.000000
sum_thread_2[106] = 0.000000
sum_thread_2[106] = 0.000000
sum_thread_2[136] = 0.000000
sum_thread_2[136] = 0.000000
sum_thread_2[138] = 0.000000
sum_thread_2[138] = 0.000000
sum_thread_2[4] = 3.000000
sum_thread_2[4] = 3.000000
sum_thread_2[108] = 0.000000
sum_thread_2[108] = 0.000000
sum_thread_2[36] = 5.000000
sum_thread_2[36] = 5.000000
sum_thread_2[68] = 0.000000
sum_thread_2[68] = 0.000000
sum_thread_2[110] = 0.000000
sum_thread_2[110] = 0.000000
sum_thread_2[64] = 0.000000 <<= Issue: sum_thread_2[64] and sum_thread[64] value differs.
sum_thread_2[64] = 0.000000
sum_thread_2[96] = 0.000000
sum_thread_2[96] = 0.000000
sum_thread_2[46] = 5.000000 <<= Working till sum_thread_2[46]. It is similar to sum_thread[46], only indexing variable changed.
sum_thread_2[46] = 5.000000
sum_thread_2[140] = 0.000000
sum_thread_2[140] = 0.000000
sum_thread_2[76] = 0.000000
sum_thread_2[76] = 0.000000
sum_thread_2[32] = 5.000000
sum_thread_2[32] = 5.000000
sum_thread_2[142] = 0.000000
sum_thread_2[142] = 0.000000
sum_thread_2[100] = 0.000000
sum_thread_2[100] = 0.000000
sum_thread_2[130] = 0.000000
sum_thread_2[130] = 0.000000
sum_thread_2[160] = 0.000000
sum_thread_2[160] = 0.000000
sum_thread_2[132] = 0.000000
sum_thread_2[132] = 0.000000
sum_thread_2[168] = 0.000000
sum_thread_2[168] = 0.000000
sum_thread_2[102] = 0.000000
sum_thread_2[102] = 0.000000
sum_thread_2[134] = 0.000000
sum_thread_2[134] = 0.000000
sum_thread_2[170] = 0.000000
sum_thread_2[170] = 0.000000
sum_thread_2[128] = 0.000000
sum_thread_2[128] = 0.000000
sum_thread_2[172] = 0.000000
sum_thread_2[172] = 0.000000
sum_thread_2[162] = 0.000000
sum_thread_2[162] = 0.000000
sum_thread_2[174] = 0.000000
sum_thread_2[174] = 0.000000
sum_thread_2[166] = 0.000000
sum_thread_2[166] = 0.000000
sum_thread_2[200] = 0.000000
sum_thread_2[200] = 0.000000
sum_thread_2[164] = 0.000000
sum_thread_2[164] = 0.000000
sum_thread_2[192] = 0.000000
sum_thread_2[192] = 0.000000
sum_thread_2[202] = 0.000000
sum_thread_2[202] = 0.000000
sum_thread_2[194] = 0.000000
sum_thread_2[194] = 0.000000
sum_thread_2[198] = 0.000000
sum_thread_2[198] = 0.000000
sum_thread_2[204] = 0.000000
sum_thread_2[204] = 0.000000
sum_thread_2[8] = 3.000000
sum_thread_2[8] = 3.000000
sum_thread_2[206] = 0.000000
sum_thread_2[206] = 0.000000
sum_thread_2[196] = 0.000000
sum_thread_2[196] = 0.000000
sum_thread_2[2] = 3.000000
sum_thread_2[2] = 3.000000
sum_thread_2[10] = 3.000000
sum_thread_2[10] = 3.000000
sum_thread_2[232] = 0.000000
sum_thread_2[232] = 0.000000
sum_thread_2[224] = 0.000000
sum_thread_2[224] = 0.000000
sum_thread_2[0] = 3.000000
sum_thread_2[0] = 3.000000
sum_thread_2[226] = 0.000000
sum_thread_2[226] = 0.000000
sum_thread_2[234] = 0.000000
sum_thread_2[234] = 0.000000
sum_thread_2[236] = 0.000000
sum_thread_2[236] = 0.000000
sum_thread_2[238] = 0.000000
sum_thread_2[238] = 0.000000
sum_thread_2[228] = 0.000000
sum_thread_2[228] = 0.000000
sum_thread_2[230] = 0.000000
sum_thread_2[230] = 0.000000
sum_block (38) = 10.000000
sum_block (74) = 0.000000
sum_block (66) = 0.000000
sum_block (72) = 0.000000
sum_block (44) = 10.000000
sum_block (108) = 0.000000
sum_block (42) = 10.000000
sum_block (104) = 0.000000
sum_block (34) = 10.000000
sum_block (106) = 0.000000
sum_block (70) = 0.000000
sum_block (98) = 0.000000
sum_block (40) = 10.000000
sum_block (78) = 0.000000
sum_block (138) = 0.000000
sum_block (136) = 0.000000
sum_block (140) = 0.000000
sum_block (68) = 0.000000
sum_block (36) = 10.000000
sum_block (64) = 0.000000
sum_block (76) = 0.000000
sum_block (110) = 0.000000
sum_block (46) = 10.000000 <<= Issue: Woking till sum_block (46)
sum_block (130) = 0.000000
sum_block (100) = 0.000000
sum_block (96) = 0.000000
sum_block (142) = 0.000000
sum_block (102) = 0.000000
sum_block (168) = 0.000000
sum_block (132) = 0.000000
sum_block (32) = 10.000000
sum_block (172) = 0.000000
sum_block (134) = 0.000000
sum_block (162) = 0.000000
sum_block (128) = 0.000000
sum_block (160) = 0.000000
sum_block (170) = 0.000000
sum_block (174) = 0.000000
sum_block (166) = 0.000000
sum_block (200) = 0.000000
sum_block (164) = 0.000000
sum_block (192) = 0.000000
sum_block (6) = 6.000000
sum_block (204) = 0.000000
sum_block (202) = 0.000000
sum_block (194) = 0.000000
sum_block (198) = 0.000000
sum_block (206) = 0.000000
sum_block (196) = 0.000000
sum_block (12) = 6.000000
sum_block (4) = 6.000000
sum_block (14) = 6.000000
sum_block (232) = 0.000000
sum_block (224) = 0.000000
sum_block (226) = 0.000000
sum_block (8) = 6.000000
sum_block (234) = 0.000000
sum_block (2) = 6.000000
sum_block (236) = 0.000000
sum_block (0) = 6.000000
sum_block (10) = 6.000000
sum_block (238) = 0.000000
sum_block (228) = 0.000000
sum_block (230) = 0.000000
Sum = 0.000000
Average = 0.000000