# Help DotVectors Cuda

Hello, I have tried to write my own code for the dot product of two vectors in CUDA,
but I always get “0” as the result. Below are the output I get and my code.
Thanks in advance for helping me out.
TAB__A[ 1 , 4 , 9 , 8 , 2 , 5 , 1 , 1 , 5 , 7 ,]
TAB__B[ 7 , 0 , 4 , 8 , 4 , 5 , 7 , 1 , 2 , 6 ,]
We Calculate : 1 * 7 + 4 * 0 + 9 * 4 + 8 * 8 + 2 * 4 + 5 * 5 + 1 * 7 + 1 * 1 +
5 * 2 + 7 * 6
THE SUM IS : 0

``````#include <stdio.h>
#include <conio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <windows.h>

#define SIZE 10
#define min(a,b)    (a<b?a:b)

// Kernel that multiplies a[] and b[] element-wise, sums the products per thread,
// combines the per-thread sums through cache[], and has the thread with
// cacheIdx == 0 store cache[0] into c[blockIdx.x].
// NOTE(review): `cache` and `cacheIdx` are used below but are not declared
// anywhere in this listing.
__global__ void  DotVector(int *a , int *b , int *c){
// Index of this thread across the whole 1-D grid.
int tid = threadIdx.x + blockIdx.x * blockDim.x;
// Step added to tid on every pass of the loop below.
// NOTE(review): blockIdx.x is 0 in block 0, so the step is 0 there and tid
// never changes for those threads.
int outline  =  blockIdx.x * gridDim.x;
// Running sum of a[tid] * b[tid] for this thread.
int temp = 0;
while(tid<SIZE){

temp += a[tid] * b[tid];
tid += outline ;
}

// Store this thread's running sum in its cache slot.
cache[cacheIdx] = temp;

// Starts at half the block size and is halved inside the loop below.
int i = blockDim.x/2;

while(i!=0){
if(cacheIdx<i){
// NOTE(review): adds the adjacent slot (cacheIdx + 1), not the slot i positions away.
cache[cacheIdx] += cache[cacheIdx + 1 ];

// NOTE(review): i is halved only inside this if, so a thread with
// cacheIdx >= i never sees i change.
i/=2;

}

// Runs on every pass of the loop: the thread with cacheIdx == 0 stores cache[0]
// as this block's entry in c.
if(cacheIdx == 0 ){
c[blockIdx.x] = cache[0];
}

}

}

// Host program: fills a[] and b[] with random digits, copies them to the device,
// copies c[] back, sums its entries and prints the vectors and the total.
// NOTE(review): `blockperGrid` is used throughout but is not defined anywhere
// in this listing.
int main(){

int Sum = 0 ;
// Host buffers.
int *a , *b, *c ;
// Device buffers.
int *d_a, *d_b, *d_c ;

a=(int *)malloc(SIZE*sizeof(int));
b=(int *)malloc(SIZE*sizeof(int));
// c holds blockperGrid ints, not SIZE ints.
c=(int *)malloc(blockperGrid*sizeof(int));

cudaMalloc(&d_a,SIZE*sizeof(int));
cudaMalloc(&d_b,SIZE*sizeof(int));
cudaMalloc(&d_c,blockperGrid*sizeof(int));

for (int i = 0; i < SIZE; i++) {

// Random values in the range 0..9.
a[i] = rand() %10;
b[i] = rand() %10;

// NOTE(review): i runs up to SIZE, but c was allocated with blockperGrid
// elements; this writes out of bounds whenever blockperGrid < SIZE.
c[i] =0;

}

cudaMemcpy(d_a ,a , SIZE*sizeof(int) ,cudaMemcpyHostToDevice);
cudaMemcpy(d_b ,b , SIZE*sizeof(int) ,cudaMemcpyHostToDevice);
cudaMemcpy(d_c ,c , blockperGrid*sizeof(int) ,cudaMemcpyHostToDevice);

// NOTE(review): no kernel launch appears in this listing between the copies to
// the device and the copy back, so as shown c would come back unchanged.
cudaMemcpy(c,d_c,blockperGrid*sizeof(int),cudaMemcpyDeviceToHost);

// Print each entry of c and add it to the total.
for(int i = 0 ; i<blockperGrid; i++){
printf(" %d \n" , c[i]);
Sum += c[i];

}
// Print the contents of a.
printf("TAB__A[ ");
for(int i = 0 ; i<SIZE; i++){

printf(" %d ," , a[i]);

}
printf("] \n" );

// Print the contents of b.
printf("TAB__B[ ");
for(int i = 0 ; i<SIZE; i++){

printf(" %d ," , b[i]);

}
printf("] \n" );

// Print the expression being evaluated; the last term gets no trailing '+'.
printf("We Calculate : \t" );
for(int i = 0 ; i<SIZE; i++){
if(i == SIZE-1){
printf("%d * %d " ,a[i], b[i]);
}else{
printf("%d * %d + " ,a[i], b[i]);
}
}
printf("\n" );

printf("THE SUM IS : %d \n",Sum);

// Release host buffers.
free(a);
free(b);
free(c);

// Release device buffers.
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

}
``````

There are several problems with this code:

``````int outline  =  blockIdx.x * gridDim.x;
``````

You are trying to create a grid-stride loop here. However the proper formulation is:

``````int outline  =  blockDim.x * gridDim.x;
``````

Your original formulation gives outline a value of 0 in block 0 (since block 0 has a blockIdx.x of 0), so tid never advances there and every thread in that block with tid < SIZE loops forever in your first while-loop. The reason you do not see a hang is that you are on Windows and the WDDM TDR watchdog is kicking in and terminating your kernel.

``````while(i!=0){
if(cacheIdx<i){
cache[cacheIdx] += cache[cacheIdx + 1 ];

i/=2;

}

if(cacheIdx == 0 ){
c[blockIdx.x] = cache[0];
}
``````

We have another infinite loop here. Since i is only reduced for threads that satisfy the if-condition, the other threads will spin in this loop forever. Coupled with that, you are using __syncthreads() in conditional code, which is illegal unless the condition evaluates identically for every thread in the block: all threads of the block must reach the same barrier. Next, the sweep pattern (cacheIdx + 1) is not correct. This is accumulating adjacent elements. You want to accumulate elements in the upper half into the lower half (cacheIdx + i). Finally, although it is not strictly broken, it’s not necessary to write the value to the c array on each loop iteration. We only need to write it once, at the end of the kernel. Therefore the above code should be re-written as follows:

``````while(i!=0){
if(cacheIdx<i){
cache[cacheIdx] += cache[cacheIdx + i ];}
// Barrier sits outside the if so that every thread in the block reaches it;
// the next step must not read cache[] until all additions of this step are done.
__syncthreads();

i/=2;

}

if(cacheIdx == 0 ){
c[blockIdx.x] = cache[0];}
``````
1. In host code, you have a problem here:
``````for (int i = 0; i < SIZE; i++) {

a[i] = rand() %10;
b[i] = rand() %10;

c[i] =0;

}
``````

You are initializing c[i] up to i < SIZE, but you have not allocated space for c according to SIZE: c was allocated with malloc to hold only blockperGrid elements. In this particular example, this code is writing past the end of c, and corrupting the heap. To fix this, we should have a separate loop to initialize c, according to its size:

``````for (int i = 0; i < SIZE; i++) {

a[i] = rand() %10;
b[i] = rand() %10;

}
for (int i = 0; i < blockperGrid; i++){

c[i] =0;
}
``````

Here is my modified code, it seems to work better:

``````\$ cat t1339.cu
#include <stdio.h>
#include <stdlib.h>

#define SIZE 10
// Arguments are parenthesised so that expressions can be passed safely.
#define min(a,b)    ((a)<(b)?(a):(b))

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Threads per block. Must be a power of two, because the in-block reduction
// halves the active range on every step.
const int threadsPerBlock = 32;
// Enough blocks to cover SIZE elements, capped at 32.
const int blockperGrid = min(32, (SIZE + threadsPerBlock - 1) / threadsPerBlock);

/*
 * Computes one partial dot-product sum per block.
 *
 * Launch layout: 1-D grid of 1-D blocks with blockDim.x == threadsPerBlock.
 * Shared memory: threadsPerBlock ints (static).
 *
 * a, b : device arrays of SIZE ints (inputs)
 * c    : device array of gridDim.x ints; c[blockIdx.x] receives the sum of the
 *        products handled by that block. The host adds these partial sums.
 */
__global__ void DotVector(int *a, int *b, int *c) {
    __shared__ int cache[threadsPerBlock];

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIdx = threadIdx.x;
    // Total number of threads in the grid: the stride of the loop below.
    int outline = blockDim.x * gridDim.x;

    // Each thread accumulates the products of the elements it owns.
    int temp = 0;
    while (tid < SIZE) {
        temp += a[tid] * b[tid];
        tid += outline;
    }

    cache[cacheIdx] = temp;
    // All per-thread sums must be in shared memory before any thread reads them.
    __syncthreads();

    // Tree reduction: fold the upper half of the active range onto the lower half.
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIdx < i) {
            cache[cacheIdx] += cache[cacheIdx + i];
        }
        // Outside the if so that every thread in the block reaches the barrier.
        __syncthreads();
        i /= 2;
    }

    // Write the block's result once, after the reduction has finished.
    if (cacheIdx == 0) {
        c[blockIdx.x] = cache[0];
    }
}

/*
 * Fills two SIZE-element vectors with random digits, computes their dot product
 * on the GPU (per-block partial sums added on the host) and prints the vectors,
 * the expression and the result.
 */
int main() {
    int Sum = 0;
    int *a, *b, *c;
    int *d_a, *d_b, *d_c;

    a = (int *)malloc(SIZE * sizeof(int));
    b = (int *)malloc(SIZE * sizeof(int));
    c = (int *)malloc(blockperGrid * sizeof(int));
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    CUDA_CHECK(cudaMalloc(&d_a, SIZE * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_b, SIZE * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_c, blockperGrid * sizeof(int)));

    for (int i = 0; i < SIZE; i++) {
        a[i] = rand() % 10;
        b[i] = rand() % 10;
    }
    // c has blockperGrid elements, so it gets its own loop.
    for (int i = 0; i < blockperGrid; i++) {
        c[i] = 0;
    }

    CUDA_CHECK(cudaMemcpy(d_a, a, SIZE * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, SIZE * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_c, c, blockperGrid * sizeof(int), cudaMemcpyHostToDevice));

    DotVector<<<blockperGrid, threadsPerBlock>>>(d_a, d_b, d_c);
    // A launch does not return an error itself; query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    // The blocking copy also waits for the kernel to finish.
    CUDA_CHECK(cudaMemcpy(c, d_c, blockperGrid * sizeof(int), cudaMemcpyDeviceToHost));

    // Add up the per-block partial sums.
    for (int i = 0; i < blockperGrid; i++) {
        printf(" %d \n", c[i]);
        Sum += c[i];
    }

    printf("TAB__A[ ");
    for (int i = 0; i < SIZE; i++) {
        printf(" %d ,", a[i]);
    }
    printf("] \n");

    printf("TAB__B[ ");
    for (int i = 0; i < SIZE; i++) {
        printf(" %d ,", b[i]);
    }
    printf("] \n");

    printf("We Calculate : \t");
    for (int i = 0; i < SIZE; i++) {
        if (i == SIZE - 1) {
            printf("%d * %d ", a[i], b[i]);
        } else {
            printf("%d * %d + ", a[i], b[i]);
        }
    }
    printf("\n");

    printf("THE SUM IS : %d \n", Sum);

    free(a);
    free(b);
    free(c);

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    return 0;
}
\$ nvcc t1339.cu -o t1339
\$ cuda-memcheck ./t1339
========= CUDA-MEMCHECK
133
TAB__A[  3 , 7 , 3 , 6 , 9 , 2 , 0 , 3 , 0 , 2 ,]
TAB__B[  6 , 5 , 5 , 2 , 1 , 7 , 9 , 6 , 6 , 6 ,]
We Calculate :  3 * 6 + 7 * 5 + 3 * 5 + 6 * 2 + 9 * 1 + 2 * 7 + 0 * 9 + 3 * 6 + 0 * 6 + 2 * 6
THE SUM IS : 133
========= ERROR SUMMARY: 0 errors
\$
``````

I don't know how to thank you, Mister txbob,
but I wish you the best. You really helped me.
Have a nice day, sir!!