Help DotVectors Cuda

Hello, I've tried to write my own dot-product (DotVectors) code in CUDA, but
I always get “0” as the result. This is the output, followed by my code.
Thanks in advance for helping me out.
TAB__A[ 1 , 4 , 9 , 8 , 2 , 5 , 1 , 1 , 5 , 7 ,]
TAB__B[ 7 , 0 , 4 , 8 , 4 , 5 , 7 , 1 , 2 , 6 ,]
We Calculate : 1 * 7 + 4 * 0 + 9 * 4 + 8 * 8 + 2 * 4 + 5 * 5 + 1 * 7 + 1 * 1 +
5 * 2 + 7 * 6
THE SUM IS : 0

#include <stdio.h>
#include <conio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <windows.h>

#define SIZE 10
#define min(a,b)    (a<b?a:b)

const int threadperBlock = 256;
const int blockperGrid = min(32 , (SIZE+threadperBlock-1)/threadperBlock);

// Kernel intended to leave one partial dot-product sum per block in c[blockIdx.x]:
// each thread accumulates a[tid] * b[tid] for tid < SIZE, stores its total in the
// shared array cache, and the block then combines the cache entries.
__global__ void  DotVector(int *a , int *b , int *c){
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	__shared__ int cache[threadperBlock];
	  // NOTE(review): this step is 0 whenever blockIdx.x == 0, so in block 0 tid is
	  // never advanced and the loop below cannot end for threads with tid < SIZE.
	  int outline  =  blockIdx.x * gridDim.x;
	  int cacheIdx = threadIdx.x;
	  int temp = 0;
	  while(tid<SIZE){

		  temp += a[tid] * b[tid];
		  tid += outline ; 
		  		  	  }

	  // Publish this thread's total, then wait until every thread has written its slot.
	  cache[cacheIdx] = temp;
	  __syncthreads();

	  // i starts at half the block size and is meant to be halved on each pass.
	  int i = blockDim.x/2;

	  while(i!=0){
		  if(cacheIdx<i){
			  // NOTE(review): adds the neighbouring slot (cacheIdx + 1), not the slot i entries away.
			  cache[cacheIdx] += cache[cacheIdx + 1 ];
			  // NOTE(review): the barrier and the halving of i are reached only by threads with
			  // cacheIdx < i; threads outside the branch never change i.
			  __syncthreads();

			  i/=2;

		  }

		  // Thread 0 stores cache[0] to c on every pass of the loop.
		  if(cacheIdx == 0 ){
			  c[blockIdx.x] = cache[0];
		  }

	  }

}

// Host driver: fills two SIZE-element vectors with random digits, runs DotVector on
// the device, adds up the per-block results copied back into c, and prints the
// inputs, the expanded sum expression and the total.
int main(){

   int Sum = 0 ; 
   int *a , *b, *c ; 
   int *d_a, *d_b, *d_c ;
 
   // Host buffers: a and b hold SIZE ints, c holds blockperGrid ints.
   a=(int *)malloc(SIZE*sizeof(int));
   b=(int *)malloc(SIZE*sizeof(int));
   c=(int *)malloc(blockperGrid*sizeof(int));

// Device buffers with the same element counts as their host counterparts.
cudaMalloc(&d_a,SIZE*sizeof(int));
   cudaMalloc(&d_b,SIZE*sizeof(int));
   cudaMalloc(&d_c,blockperGrid*sizeof(int));

   for (int i = 0; i < SIZE; i++) {

     a[i] = rand() %10;
     b[i] = rand() %10;

     // NOTE(review): c was allocated with blockperGrid elements, but this loop
     // indexes it up to SIZE - 1.
     c[i] =0;
     
   }

   cudaMemcpy(d_a ,a , SIZE*sizeof(int) ,cudaMemcpyHostToDevice);
   cudaMemcpy(d_b ,b , SIZE*sizeof(int) ,cudaMemcpyHostToDevice);
   cudaMemcpy(d_c ,c , blockperGrid*sizeof(int) ,cudaMemcpyHostToDevice);

// Launch blockperGrid blocks of threadperBlock threads.
DotVector<<<blockperGrid,threadperBlock>>>(d_a,d_b,d_c);

// Copy the per-block results back; no return code is inspected here or above.
cudaMemcpy(c,d_c,blockperGrid*sizeof(int),cudaMemcpyDeviceToHost);

// Print each per-block value and accumulate the total.
for(int i = 0 ; i<blockperGrid; i++){
	   printf(" %d \n" , c[i]);
	   Sum += c[i];
	   
   }
   printf("TAB__A[ ");
   for(int i = 0 ; i<SIZE; i++){

	   printf(" %d ," , a[i]);
	   
   }
   printf("] \n" );

   printf("TAB__B[ ");
      for(int i = 0 ; i<SIZE; i++){

	   printf(" %d ," , b[i]);
	   
   }
   printf("] \n" );

   // Print the products term by term; the last term is printed without a trailing '+'.
   printf("We Calculate : \t" );
      for(int i = 0 ; i<SIZE; i++){
		  if(i == SIZE-1){
			  printf("%d * %d " ,a[i], b[i]);
		  }else{
	   printf("%d * %d + " ,a[i], b[i]);
		  }
   }
	  printf("\n" );

   printf("THE SUM IS : %d \n",Sum);

   free(a);
   free(b);
   free(c);

   cudaFree(d_a);
   cudaFree(d_b);
   cudaFree(d_c);

}

There are several problems with this code:

int outline  =  blockIdx.x * gridDim.x;

You are trying to create a grid-stride loop here. However the proper formulation is:

int outline  =  blockDim.x * gridDim.x;

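Your original formulation will result in a value of 0 for outline for block 0 (since block 0 has blockIdx.x of 0), and would result in an infinite loop in your first while-loop. The reason you do not get a hang is that you are on Windows and the WDDM TDR watchdog is kicking in, terminating your kernel. Because the kernel is terminated before it completes, the later copy back to the host fails and c keeps the zeros it was initialized with, which is why the printed sum is 0.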

while(i!=0){
		  if(cacheIdx<i){
			  cache[cacheIdx] += cache[cacheIdx + 1 ];
			  __syncthreads();

			  i/=2;

		  }

		  if(cacheIdx == 0 ){
			  c[blockIdx.x] = cache[0];
		  }

We have another infinite loop here. Since i is only reduced for threads that satisfy the if-condition, the other threads will spin in this loop forever. Coupled with that, you are using __syncthreads() in conditional code, which is illegal unless every thread in the block takes the same branch. Next, the sweep pattern (cacheIdx + 1) is not correct: it accumulates adjacent elements. You want to accumulate elements from the upper half into the lower half (cacheIdx + i). Finally, although it is not strictly incorrect, it’s not necessary to write the value to the c array on each loop iteration. We only need to write it once, at the end of the kernel. Therefore the above code should be rewritten as follows:

while(i!=0){
		  if(cacheIdx<i){
			  cache[cacheIdx] += cache[cacheIdx + i ];}
	          __syncthreads();

		  i/=2;

		  }

	  if(cacheIdx == 0 ){
			  c[blockIdx.x] = cache[0];}
In the host code, you have a problem here:
for (int i = 0; i < SIZE; i++) {

     a[i] = rand() %10;
     b[i] = rand() %10;

     c[i] =0;
     
   }

You are initializing c[i] for every i < SIZE, but you have not allocated space for c according to SIZE (it only holds blockperGrid elements). In this particular example, this code is writing past the end of c and corrupting the heap, since c was allocated with malloc. To fix this, we should have a separate loop to initialize c, according to its size:

for (int i = 0; i < SIZE; i++) {

     a[i] = rand() %10;
     b[i] = rand() %10;
     
   }
   for (int i = 0; i < blockperGrid; i++){
  
     c[i] =0;
   }

Here is my modified code, it seems to work better:

$ cat t1339.cu
#include <stdio.h>
#include <stdlib.h>

// Number of elements in each input vector.
#define SIZE 10
// Smaller of two values. Arguments and the whole expansion are parenthesized so
// that operands containing low-precedence operators (e.g. `x & y`) still compare
// and select correctly. Each argument may be evaluated twice.
#define min(a,b)    (((a)<(b))?(a):(b))

// Threads per block; also the length of the kernel's shared cache array.
const int threadperBlock = 256;
// Enough blocks to cover SIZE elements (ceiling division), capped at 32.
const int blockperGrid = min(32 , (SIZE+threadperBlock-1)/threadperBlock);

/*
 * Computes per-block partial sums of the dot product of a and b.
 *
 * a, b : device arrays of at least SIZE ints (read only).
 * c    : device array with one int per block; c[blockIdx.x] receives the sum of
 *        a[k] * b[k] over the elements handled by that block.
 *
 * Launch requirements: 1-D grid and 1-D block, blockDim.x <= threadperBlock
 * (the shared cache has threadperBlock entries), and blockDim.x must be a power
 * of two because the reduction halves the active range on every pass.
 * The host adds up c[0 .. gridDim.x - 1] to obtain the final dot product.
 */
__global__ void  DotVector(int *a , int *b , int *c){
    __shared__ int cache[threadperBlock];

    int tid      = threadIdx.x + blockIdx.x * blockDim.x;
    int outline  = blockDim.x * gridDim.x;   // total number of threads in the grid
    int cacheIdx = threadIdx.x;

    // Grid-stride loop: each thread accumulates every outline-th product.
    int temp = 0;
    while (tid < SIZE) {
        temp += a[tid] * b[tid];
        tid += outline;
    }

    // Publish the per-thread total and wait until the whole block has done so.
    cache[cacheIdx] = temp;
    __syncthreads();

    // Tree reduction: fold the upper half of the active range onto the lower half.
    // The barrier is outside the `if` so that every thread in the block reaches it.
    for (int i = blockDim.x / 2; i != 0; i /= 2) {
        if (cacheIdx < i) {
            cache[cacheIdx] += cache[cacheIdx + i];
        }
        __syncthreads();
    }

    // Write the block's result exactly once, after the reduction has finished.
    // Doing it here (not inside the loop) also covers blockDim.x == 1, where the
    // reduction loop does not execute at all.
    if (cacheIdx == 0) {
        c[blockIdx.x] = cache[0];
    }
}

// Stops the program with a diagnostic if a CUDA runtime call did not succeed.
// `what` names the call that produced `status`.
static void checkCuda(cudaError_t status, const char *what){
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver: fills two SIZE-element vectors with random digits, runs DotVector
// on the device, adds up the per-block partial sums and prints the inputs, the
// expanded expression and the total. Every allocation and CUDA call is checked,
// so a failed launch or copy is reported instead of silently printing 0.
int main(){

   int Sum = 0 ;
   int *a , *b, *c ;
   int *d_a, *d_b, *d_c ;

   // Host buffers: a and b hold SIZE ints, c holds one int per block.
   a=(int *)malloc(SIZE*sizeof(int));
   b=(int *)malloc(SIZE*sizeof(int));
   c=(int *)malloc(blockperGrid*sizeof(int));
   if (a == NULL || b == NULL || c == NULL) {
      fprintf(stderr, "host allocation failed\n");
      return EXIT_FAILURE;
   }

   checkCuda(cudaMalloc(&d_a,SIZE*sizeof(int)), "cudaMalloc(d_a)");
   checkCuda(cudaMalloc(&d_b,SIZE*sizeof(int)), "cudaMalloc(d_b)");
   checkCuda(cudaMalloc(&d_c,blockperGrid*sizeof(int)), "cudaMalloc(d_c)");

   for (int i = 0; i < SIZE; i++) {
     a[i] = rand() %10;
     b[i] = rand() %10;
   }
   // c has blockperGrid elements, so it gets its own loop bound.
   for (int i = 0; i < blockperGrid; i++) {
     c[i] = 0;
   }

   checkCuda(cudaMemcpy(d_a ,a , SIZE*sizeof(int) ,cudaMemcpyHostToDevice), "cudaMemcpy(d_a)");
   checkCuda(cudaMemcpy(d_b ,b , SIZE*sizeof(int) ,cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");
   checkCuda(cudaMemcpy(d_c ,c , blockperGrid*sizeof(int) ,cudaMemcpyHostToDevice), "cudaMemcpy(d_c)");

   DotVector<<<blockperGrid,threadperBlock>>>(d_a,d_b,d_c);
   // A kernel launch returns nothing; configuration errors surface here.
   checkCuda(cudaGetLastError(), "DotVector launch");

   // This blocking copy waits for the kernel, so execution errors surface here.
   checkCuda(cudaMemcpy(c,d_c,blockperGrid*sizeof(int),cudaMemcpyDeviceToHost), "cudaMemcpy(c)");

   // Print each per-block partial sum and accumulate the total.
   for(int i = 0 ; i<blockperGrid; i++){
      printf(" %d \n" , c[i]);
      Sum += c[i];
   }

   printf("TAB__A[ ");
   for(int i = 0 ; i<SIZE; i++){
      printf(" %d ," , a[i]);
   }
   printf("] \n" );

   printf("TAB__B[ ");
   for(int i = 0 ; i<SIZE; i++){
      printf(" %d ," , b[i]);
   }
   printf("] \n" );

   // Print the products term by term; the last term has no trailing '+'.
   printf("We Calculate : \t" );
   for(int i = 0 ; i<SIZE; i++){
      if(i == SIZE-1){
         printf("%d * %d " ,a[i], b[i]);
      }else{
         printf("%d * %d + " ,a[i], b[i]);
      }
   }
   printf("\n" );

   printf("THE SUM IS : %d \n",Sum);

   free(a);
   free(b);
   free(c);

   checkCuda(cudaFree(d_a), "cudaFree(d_a)");
   checkCuda(cudaFree(d_b), "cudaFree(d_b)");
   checkCuda(cudaFree(d_c), "cudaFree(d_c)");

   return 0;
}
$ nvcc t1339.cu -o t1339
$ cuda-memcheck ./t1339
========= CUDA-MEMCHECK
 133
TAB__A[  3 , 7 , 3 , 6 , 9 , 2 , 0 , 3 , 0 , 2 ,]
TAB__B[  6 , 5 , 5 , 2 , 1 , 7 , 9 , 6 , 6 , 6 ,]
We Calculate :  3 * 6 + 7 * 5 + 3 * 5 + 6 * 2 + 9 * 1 + 2 * 7 + 0 * 9 + 3 * 6 + 0 * 6 + 2 * 6
THE SUM IS : 133
========= ERROR SUMMARY: 0 errors
$

I don't know how to thank you, Mr. txbob,
but I wish you the best; you really helped me.
Have a nice day, sir!!