Parallelize function which will count all vectors with sum equal of vector elements and elements not

zlristovski · October 18, 2013, 7:04pm

I want to parallelize a function in CUDA C which will count all vectors with sum equal of vector elements and elements not bigger than k. For example if the number of vector elements n is 5, sum=10 and k=3 than, the number of vectors who satisfy this condition is 101. I’ve already made this function in CUDA C but the problem is when the number of blocks and threads are bigger than 1. I know that the problem is in cycles and I should change it but I don’t know from where to start. When I am calling the function with blocks and threads equal to one than the function is execute serial and everything is good but in this case the function is not parallelized.

The source code of the program is:

#include <stdio.h>
#include<stdlib.h>
#include<assert.h>
#include<cuda.h>

//function that count number of vectors
__device__ void count(int *vector, int *total, int n, int s)
{
   int i,sum=0;
   for(i=blockIdx.x*blockDim.x+threadIdx.x;i<n;i+=blockDim.x*gridDim.x)
   { 
     
     sum+=vector[i];
	 __syncthreads();
   }
   if(sum==s)
   {
     
     total[0]=total[0]+1;
   }
}
 
//main function
__global__ void computeVectors(int *vector, int n, int kk, int s, int *total)
{
   int k=0;
   int j,i,next;
   
   while(1)
   {
     //this is the problem, in for cycle
     for(j=blockIdx.x*blockDim.x+threadIdx.x; j<=kk; j+=blockDim.x*gridDim.x)
     {
       vector[k]=j;
       count(vector, total, n, s);
	   __syncthreads();
     }
     for(i=blockIdx.x*blockDim.x+threadIdx.x; i<n; i+=blockDim.x*gridDim.x)
     {
	   
       if(vector[i]<kk)
	      break;
     }	 
     next=i;
     vector[next]++;
     for(i=blockIdx.x*blockDim.x+threadIdx.x; i<sledno; i+=blockDim.x*gridDim.x)
     {
       vector[i]=0;
	   __syncthreads();
     }
     k=0;
     if(next>=n)
	    break;
   }
}
 
int main(){
 
    cudaError_t err = cudaSuccess;
	
    int n,k,sum;
    int counter=0;
	
    printf("Enter the length of vector n=");
    scanf("%d",&n);
    printf("Enter the max value of vector elements k=");
    scanf("%d",&k);
    printf("Enter the sum of vector elements sum=");
    scanf("%d",&sum);
 
   //initial vector with length n
     int *vec_h, *vec_d;
     size_t sizevec=n*sizeof(int);
     vec_h=(int *)malloc(sizevec);
     cudaMalloc((void **) &vec_d, sizevec);
	
	for(counter=0; counter<n; counter++)
		{
			vec_h[counter]=0;
		}
	cudaMemcpy(vec_d, vec_h, sizevec, cudaMemcpyHostToDevice);
	
    int *total_h, *total_d;
    size_t size=1*sizeof(int);
    total_h=(int *)malloc(size);
    cudaMalloc((void **) &total_d, size);
    total_h[0]=0;
    cudaMemcpy(total_d, total_h, size, cudaMemcpyHostToDevice);
	 
   //calling of main function
    computeVectors<<<1, 1>>>(vec_d, n, k, sum, total_d);
 
    cudaThreadSynchronize(); 
  
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error: %s!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    cudaMemcpy(total_h, total_d, size, cudaMemcpyDeviceToHost);
    printf("Number of vectors that satisfy condition is %d\n", total_h[0]);
	
	
    free(vec_h); 
    cudaFree(vec_d);
	
    free(total_h); 
    cudaFree(total_d);
 
    return 0;
}

pasoleatis · October 19, 2013, 11:00am

Hello,

I think one of the problems is at this line:
total[0]=total[0]+1;
maybe you need to use atomic operations like atomicAdd(&total,1);

also keep in mind that this:
__syncthreads();
only syncs inside a block. Also e careful with the sync command, you must be sure that all threads in the block execute it, otherwise it crashes.
But first start with 1 block and more threads. At least in the count function I suggest you get rid of the sync command and use the atomic add for the counter.

//function that count number of vectors
__device__ void count(int *vector, int *total, int n, int s)
{
   int i,sum=0;
   for(i=blockIdx.x*blockDim.x+threadIdx.x;i<n;i+=blockDim.x*gridDim.x)
   { 
     
     sum+=vector[i];
   }
   if(sum==s)
   {     
     atomicAdd( & total[0],1);
   }
}

Topic		Replies	Views
Sum more of 33.553.920 numbers (max numbers of thread on Tesla S2050) CUDA Programming and Performance	2	5882	December 15, 2011
How to count the element in block CUDA Programming and Performance	5	1288	July 8, 2011
Problem with Vectors add Can't compute sum of two vectors CUDA Programming and Performance	4	1662	March 16, 2009
finding sum CUDA Programming and Performance	1	2520	November 18, 2007
CUDA programming - Help CUDA Programming and Performance	0	2807	January 29, 2009
Help! Sum of vectors CUDA Programming and Performance	7	918	June 16, 2011
sequential sum within a kernel. CUDA Programming and Performance	23	5075	September 8, 2008
Need help with summing results from different blocks CUDA Programming and Performance	3	2566	May 10, 2010
Summing array elements using kernel Access frome the whole block grid CUDA Programming and Performance	3	879	July 16, 2010
Sum of a subvector CUDA Programming and Performance	7	1896	June 17, 2009

Parallelize function which will count all vectors with sum equal of vector elements and elements not

Related topics