Finding max in array

Hi, I am beginner coder in CUDA. I’d like to find max in random vector. The program sometimes show good result but sometimes wrong. I read that I should use a reduction but how? Any simples examples? It’s my code:

#include <cuda.h>
#include <stdio.h>
#include <time.h>

#define SIZE 5

__global__ void max(int *a , int *c)
{
int i = threadIdx.x;

*c = a[0];

        if(a[i] > *c)
                {
                *c = a[i];
                }

}

int main()
{
int i;
srand(time(NULL));

int a;
int c;

int *dev_a, *dev_c;

cudaMalloc((void **) &dev_a, SIZE*sizeof(int));
cudaMalloc((void **) &dev_c, SIZE*sizeof(int));

for( i = 0 ; i < SIZE ; i++)
{
a[i] = rand()% 20 + 1;
}
for( i = 0 ; i < SIZE ; i++)
{
printf("%d ",a[i]);
}

cudaMemcpy(dev_a , a, SIZE*sizeof(int),cudaMemcpyHostToDevice);
max<<<1,SIZE>>>(dev_a,dev_c);
cudaMemcpy(&c, dev_c, SIZE*sizeof(int),cudaMemcpyDeviceToHost);

printf("
 max =  %d 
",c);

cudaFree(dev_a);
cudaFree(dev_c);
printf("
");

return 0;
}

Thanks in advance.

Your kernel has a race condition, as every thread will be evaluating the conditional simultaneously. If you get the correct answer, it’s by luck.

You’re correct in that you should use a reduction, but they can be difficult to implement correctly. If you just want to get it working as part of a larger project, then I’d advise using Thrust’s max_element function; documentation is at http://docs.thrust.googlecode.com/hg/group__extrema.html#gabc0550e56644df298a6b7a5824f1a166

If you’re in it for educational purposes, then I’d advise looking at the whitepaper for “CUDA Parallel Reduction” – you can find it at docs.nvidia.com/cuda/cuda-samples/index.html. It gives a very good overview of reductions and how to implement them in CUDA

Here is a link to some fast reduction codes which have been used for both reduction sum and finding min and max:

https://devtalk.nvidia.com/default/topic/474029/cuda-programming-and-performance/fastest-approach-to-implementing-min-max-using-cuda/

I added a reduction for calculate the sum, but the program return wrong sum of array. Only for SIZE of array equal two return good result. I don’t know what is it. Help:(

#include <cuda.h>
#include <stdio.h>
#include <time.h>

#define SIZE 3

__global__ void max(int *a , int *c )
{
extern __shared__ int sdata[];

unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;

sdata[tid] = a[i];

__syncthreads();

///////////////////////////
for(unsigned int s=1; s<blockDim.x; s*=2)
{
int index = 2 * s * tid;
if(index < blockDim.x)
{
sdata[index] += sdata[index + s];
}
//////////////////////////////
__syncthreads();
}
if(tid == 0) c[blockIdx.x] = sdata[0];

}
//////////////////////////
int main()
{
int i;
srand(time(NULL));

int *a;
a = (int*)malloc(SIZE * sizeof(int));
int c;

int *dev_a, *dev_c;

cudaMalloc((void **) &dev_a, SIZE*sizeof(int));
cudaMalloc((void **) &dev_c, SIZE*sizeof(int));

for( i = 0 ; i < SIZE ; i++)
{
a[i] = rand()% 20 + 1;
}
for( i = 0 ; i < SIZE ; i++)
{
printf("%d ",a[i]);
}

cudaMemcpy(dev_a , a, SIZE*sizeof(int),cudaMemcpyHostToDevice);
max<<<1,SIZE>>>(dev_a,dev_c);
cudaMemcpy(&c, dev_c, SIZE*sizeof(int),cudaMemcpyDeviceToHost);

printf("
sum =  %d 
",c);

cudaFree(dev_a);
cudaFree(dev_c);

printf("
");

return 0;
}

refresh

Maybe index + s can be out of blocksize, try to debug your code and check index errors.

An alternative to your code:

__global__ void max(int *a , int *c )
{
extern __shared__ int sdata[];

unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;

sdata[tid] = a[i]; 

__syncthreads();
for(unsigned int s=blockDim.x/2; s>=1; s=s/2)
{
if(tid< s)
{
sdata[tid] += sdata[tid + s];
}
//////////////////////////////
__syncthreads();
}
if(tid == 0) c[blockIdx.x] = sdata[0];
}

the same, for SIZE > 8 return wrong number of sum, for searching min and max it’s the same not find max,min only for SIZE <= 8 return good result

__global__ void max(int *a, int *c)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = a[i];

__syncthreads();

for(unsigned int s=blockDim.x/2 ; s > 0 ; s>>=1)
{
if(tid < s && sdata[tid] < sdata[tid + s])
{
sdata[tid] = sdata[tid + s];
}
__syncthreads();
}

if(tid == 0 ) c[blockIdx.x] = sdata[0];
}

Hi,

This is Cuda and you will probably want something that will work on far larger arrays at some time.
The problem can be divided into 3 stages

  1. each of N threads finds a thread_maximum from SIZE/N elements of the array
  2. in each block find a block_maximum from the thread_maximum’s (for threads in that block)
  3. find an overall_maximum from the block_maximum’s

Assuming that the SIZE is about 10 million and for a start try 256 blocks and 256 threads per block, so N = 256*256. ( can test different numbers once its working)
SIZE/N in this case would be about 160

So now in stage 1) each thread is finding a thread_maximum from about 160 values, this can be done just using simple variable ( this will be in a register and as its only visible to the thread no synchthreads is needed )
You wont need to copy the values from a into a shared array just into a local variable.

Stage 2 is to find a block_maximum from the 256 threads in the block, to do this each thread needs to save its thread_maximum at end of stage 1 into a shared array, do a synchthreads, then find the block_maximum from this array.

This block_maximum needs to be written to your global array c and only once they have all been written to c ( i.e. all blocks finished) can the overall_maximum be found.
However there is no guarantee about the order that blocks run in, the only way I know of, that is guaranteed to ensure that all blocks have finished, is to wait until the entire kernel finishes and exits.
So to find the overall_maximum from the block_maximums in c use a second kernel call to the same kernel but using only 1 block and c as the input array, d as the output, d[0] should have final result.

In stage 1 you can easily make sure that reads are contiguous i.e. adjacent threads in a block read adjacent cells of a

Also pass in the number of elements in input array as a third parameter to the kernel.

Hello,

I can not understand why your code does not work, but the code I posted works for me and you just have to replace the sum operation with min (max) operation. Maybe try to separate this if in two ifs:

__global__ void max(int *a , int *c )
    {
    extern __shared__ int sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;

    sdata[tid] = a[i];

    __syncthreads();
    for(unsigned int s=blockDim.x/2; s>=1; s=s/2)
    {
    if(tid< s)
    {
    if(s[tid]>sdata[tid + s])
    {sdata[tid] = sdata[tid + s];}
    }
    //////////////////////////////
    __syncthreads();
    }
    if(tid == 0) c[blockIdx.x] = sdata[0];
    }

This should return the min of the data on one block without problems.

Thanks for help but it’s still the same, for size >= 10 min return 0, and for max return sometimes good result sometimes wrong, I don’t know why it’s not working the same is to calculate the sum of array. I do it by ssh because I don’t have NVIDIA graphic card.

Post more of your code.

The whole code:

#include <cuda.h>
#include <stdio.h>
#include <time.h>

#define SIZE 30
__global__ void min(int *a, int *d)
{
extern __shared__ int sdata[];

unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

sdata[tid] = a[i];

__syncthreads();
for(unsigned int s=blockDim.x/2 ; s >= 1 ; s=s/2)
{
if(tid < s)
{
if(sdata[tid] > sdata[tid + s])
{
sdata[tid] = sdata[tid + s];
}
}
__syncthreads();
}
if(tid == 0 ) d[blockIdx.x] = sdata[0];
}
////////////////////////////////////////////
__global__ void max(int *a, int *c)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = a[i];

__syncthreads();

for(unsigned int s=blockDim.x/2 ; s > 0 ; s>>=1)
{
if(tid < s && sdata[tid] < sdata[tid + s])
{
sdata[tid] = sdata[tid + s];
}
__syncthreads();
}

if(tid == 0 ) c[blockIdx.x] = sdata[0];
}
/////////////////////////////////////////

int main()
{
int i;
srand(time(NULL));

int *a;
a = (int*)malloc(SIZE * sizeof(int));
int d;
//int c;

int *dev_a, *dev_c,*dev_d;

cudaMalloc((void **) &dev_a, SIZE*sizeof(int));
cudaMalloc((void **) &dev_c, SIZE*sizeof(int));
cudaMalloc((void **) &dev_d, SIZE*sizeof(int));

for( i = 0 ; i < SIZE ; i++)
{
a[i] = rand()% 100 + 1;

}
for( i = 0 ; i < SIZE ; i++)
{
printf("%d ",a[i]);

}
printf("
");

cudaMemcpy(dev_a , a, SIZE*sizeof(int),cudaMemcpyHostToDevice);
//max(dev_a,dev_c);
min(dev_a,dev_d);

//cudaMemcpy(&c, dev_c, SIZE*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(&d, dev_d, SIZE*sizeof(int),cudaMemcpyDeviceToHost);

//printf("
max  =  %d 
",c);
printf("
min  =  %d 
",d);

cudaFree(dev_a);
//cudaFree(dev_c);
cudaFree(dev_d);

printf("
");

return 0;
}

Hello,

I can see some several problems:

– the shared memory can be declared in 2 ways static and dynamic. In this code the shared memory is defined external, which means that you need to indicate the memory in the kernel code
– variable d you defined as int d; , but you atr trying to copy the dev_d array which has more variables.

I modified you code for the case of array with 512 elements and it will work also for any power of 2 smaller than 512. for nblocks > 1 you get more values returned in the d array which you need to do it on cpu or gpu dpeending on size of nblocks. Here is my code:

#include < cuda.h >
#include < stdio.h >
#include < time.h >

#define tbp 512
#define nblocks 1
__global__ void kernel_min(int *a, int *d)
{
__shared__ int sdata[tbp]; //"static" shared memory

unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

sdata[tid] = a[i];

__syncthreads();
for(unsigned int s=tbp/2 ; s >= 1 ; s=s/2)
{
if(tid < s)
{
if(sdata[tid] > sdata[tid + s])
{
sdata[tid] = sdata[tid + s];
}
}
__syncthreads();
}
if(tid == 0 ) 
{
d[blockIdx.x] = sdata[0];
}
}

int main()
{
int i;
const int N=tbp*nblocks;
srand(time(NULL));

int *a;
a = (int*)malloc(N * sizeof(int));
int *d;
d = (int*)malloc(nblocks * sizeof(int));

int *dev_a, *dev_d;

cudaMalloc((void **) &dev_a, N*sizeof(int));
cudaMalloc((void **) &dev_d, nblocks*sizeof(int));
int mmm=100;
for( i = 0 ; i < N ; i++)
{
a[i] = rand()% 100 + 5;
//printf("%d 
",a[i]);
if(mmm>a[i]) mmm=a[i];

}
printf("");
printf("");
printf("");
printf("");
cudaMemcpy(dev_a , a, N*sizeof(int),cudaMemcpyHostToDevice);

kernel_min < < < nblocks,tbp > > >(dev_a,dev_d);

cudaMemcpy(d, dev_d, nblocks*sizeof(int),cudaMemcpyDeviceToHost);

printf("cpu min %d, gpu_min = %d
",mmm,d[0]);

cudaFree(dev_a);
cudaFree(dev_d);

printf("");

return 0;
}

Hello,

I need to search in an array with cuda in a parallel way, how can i do it?

Please help.

Hello,

I need to search in an array with cuda in a parallel way, how can i do it?

Please help.