Parallelize function which will count all vectors with sum equal of vector elements and elements not

I want to parallelize a function in CUDA C which will count all vectors with sum equal of vector elements and elements not bigger than k. For example if the number of vector elements n is 5, sum=10 and k=3 than, the number of vectors who satisfy this condition is 101. I’ve already made this function in CUDA C but the problem is when the number of blocks and threads are bigger than 1. I know that the problem is in cycles and I should change it but I don’t know from where to start. When I am calling the function with blocks and threads equal to one than the function is execute serial and everything is good but in this case the function is not parallelized.

The source code of the program is:

#include <stdio.h>
#include<stdlib.h>
#include<assert.h>
#include<cuda.h>

//function that count number of vectors
__device__ void count(int *vector, int *total, int n, int s)
{
   int i,sum=0;
   for(i=blockIdx.x*blockDim.x+threadIdx.x;i<n;i+=blockDim.x*gridDim.x)
   { 
     
     sum+=vector[i];
	 __syncthreads();
   }
   if(sum==s)
   {
     
     total[0]=total[0]+1;
   }
}
 
//main function
__global__ void computeVectors(int *vector, int n, int kk, int s, int *total)
{
   int k=0;
   int j,i,next;
   
   while(1)
   {
     //this is the problem, in for cycle
     for(j=blockIdx.x*blockDim.x+threadIdx.x; j<=kk; j+=blockDim.x*gridDim.x)
     {
       vector[k]=j;
       count(vector, total, n, s);
	   __syncthreads();
     }
     for(i=blockIdx.x*blockDim.x+threadIdx.x; i<n; i+=blockDim.x*gridDim.x)
     {
	   
       if(vector[i]<kk)
	      break;
     }	 
     next=i;
     vector[next]++;
     for(i=blockIdx.x*blockDim.x+threadIdx.x; i<sledno; i+=blockDim.x*gridDim.x)
     {
       vector[i]=0;
	   __syncthreads();
     }
     k=0;
     if(next>=n)
	    break;
   }
}
 
int main(){
 
    cudaError_t err = cudaSuccess;
	
    int n,k,sum;
    int counter=0;
	
    printf("Enter the length of vector n=");
    scanf("%d",&n);
    printf("Enter the max value of vector elements k=");
    scanf("%d",&k);
    printf("Enter the sum of vector elements sum=");
    scanf("%d",&sum);
 
   //initial vector with length n
     int *vec_h, *vec_d;
     size_t sizevec=n*sizeof(int);
     vec_h=(int *)malloc(sizevec);
     cudaMalloc((void **) &vec_d, sizevec);
	
	for(counter=0; counter<n; counter++)
		{
			vec_h[counter]=0;
		}
	cudaMemcpy(vec_d, vec_h, sizevec, cudaMemcpyHostToDevice);
	
    int *total_h, *total_d;
    size_t size=1*sizeof(int);
    total_h=(int *)malloc(size);
    cudaMalloc((void **) &total_d, size);
    total_h[0]=0;
    cudaMemcpy(total_d, total_h, size, cudaMemcpyHostToDevice);
	 
   //calling of main function
    computeVectors<<<1, 1>>>(vec_d, n, k, sum, total_d);
 
    cudaThreadSynchronize(); 
  
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error: %s!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    cudaMemcpy(total_h, total_d, size, cudaMemcpyDeviceToHost);
    printf("Number of vectors that satisfy condition is %d\n", total_h[0]);
	
	
    free(vec_h); 
    cudaFree(vec_d);
	
    free(total_h); 
    cudaFree(total_d);
 
    return 0;
}

Hello,

I think one of the problems is at this line:
total[0]=total[0]+1;
maybe you need to use atomic operations like atomicAdd(&total,1);

also keep in mind that this:
__syncthreads();
only syncs inside a block. Also e careful with the sync command, you must be sure that all threads in the block execute it, otherwise it crashes.
But first start with 1 block and more threads. At least in the count function I suggest you get rid of the sync command and use the atomic add for the counter.

//function that count number of vectors
__device__ void count(int *vector, int *total, int n, int s)
{
   int i,sum=0;
   for(i=blockIdx.x*blockDim.x+threadIdx.x;i<n;i+=blockDim.x*gridDim.x)
   { 
     
     sum+=vector[i];
   }
   if(sum==s)
   {     
     atomicAdd( & total[0],1);
   }
}