Hi, I am a beginner coder in CUDA. I'd like to find the maximum in a random vector. The program sometimes shows the right result and sometimes a wrong one. I read that I should use a reduction, but how? Are there any simple examples? Here is my code:
#include <cuda.h>
#include <stdio.h>
#include <time.h>
#define SIZE 5
/*
 * Write the maximum of a[0..blockDim.x-1] into *c.
 *
 * Expected launch: max<<<1, SIZE>>>(dev_a, dev_c) -- one block, one
 * thread per element.
 *
 * The original version had a data race: every thread wrote *c
 * unconditionally, so the final value depended on scheduling order
 * (hence "sometimes good, sometimes wrong").  Here thread 0 seeds *c,
 * all threads synchronize so the seed is visible, and then each thread
 * folds its element in with atomicMax, whose read-modify-write is
 * serialized by the hardware.
 */
__global__ void max(int *a , int *c)
{
    int i = threadIdx.x;
    if (i == 0)
    {
        *c = a[0];      /* seed before anyone updates *c */
    }
    __syncthreads();    /* whole block must see the seeded value first */
    atomicMax(c, a[i]); /* race-free concurrent max */
}
/*
 * Host driver: fill a SIZE-element array with random values in [1,20],
 * find the maximum on the GPU, and print it.
 *
 * Fixes vs. the original:
 *  - `int a;` is now `int a[SIZE];` -- the code indexes a[i], so it
 *    must be an array (the original did not compile);
 *  - dev_c holds a single result, so it is allocated and copied back
 *    as sizeof(int); copying SIZE ints into `int c` overran the stack;
 *  - the printf format strings are proper "\n"-terminated literals.
 */
int main()
{
    int i;
    srand(time(NULL));
    int a[SIZE];                                 /* host input array */
    int c;                                       /* host copy of the result */
    int *dev_a, *dev_c;
    cudaMalloc((void **) &dev_a, SIZE*sizeof(int));
    cudaMalloc((void **) &dev_c, sizeof(int));   /* kernel writes ONE int */
    for( i = 0 ; i < SIZE ; i++)
    {
        a[i] = rand()% 20 + 1;
    }
    for( i = 0 ; i < SIZE ; i++)
    {
        printf("%d ",a[i]);
    }
    cudaMemcpy(dev_a , a, SIZE*sizeof(int),cudaMemcpyHostToDevice);
    max<<<1,SIZE>>>(dev_a,dev_c);
    /* copy back exactly one int; SIZE*sizeof(int) into &c is a buffer
       overrun on the host stack */
    cudaMemcpy(&c, dev_c, sizeof(int),cudaMemcpyDeviceToHost);
    printf("\nmax = %d\n",c);
    cudaFree(dev_a);
    cudaFree(dev_c);
    printf("\n");
    return 0;
}
If you’re in it for educational purposes, then I’d advise looking at the whitepaper for “CUDA Parallel Reduction” – you can find it at CUDA Samples :: CUDA Toolkit Documentation. It gives a very good overview of reductions and how to implement them in CUDA
I added a reduction to calculate the sum, but the program returns the wrong sum of the array. It only returns the right result when the array SIZE equals two. I don't know what the problem is. Help :(
#include <cuda.h>
#include <stdio.h>
#include <time.h>
#define SIZE 3
/*
 * Per-block sum reduction: c[blockIdx.x] = sum of this block's slice of a.
 * Expected launch: max<<<1, SIZE>>>(dev_a, dev_c), blockDim.x <= SIZE.
 *
 * Two fixes vs. the original:
 *  - the shared buffer is statically sized: the launch passed no third
 *    <<<...>>> parameter, so the `extern __shared__` declaration gave
 *    the kernel ZERO bytes of shared memory (undefined behavior);
 *  - the partner index is bounds-checked, so the interleaved-addressing
 *    loop also works when blockDim.x is not a power of two -- with
 *    SIZE=3 the original read sdata[index + s] past the valid data,
 *    which is exactly why only SIZE=2 produced the right sum.
 */
__global__ void max(int *a , int *c )
{
    __shared__ int sdata[SIZE];   /* static: one slot per thread */
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = a[i];
    __syncthreads();              /* buffer fully written before any read */
    for(unsigned int s=1; s<blockDim.x; s*=2)
    {
        unsigned int index = 2 * s * tid;
        /* guard BOTH operands: without `index + s < blockDim.x` a
           non-power-of-two block sums garbage beyond the valid data */
        if(index < blockDim.x && index + s < blockDim.x)
        {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();          /* outside the if: all threads must reach it */
    }
    if(tid == 0) c[blockIdx.x] = sdata[0];
}
//////////////////////////
/*
 * Host driver: fill a SIZE-element array with random values in [1,20],
 * sum it on the GPU with a one-block reduction, and print the sum.
 *
 * Fixes vs. the original:
 *  - the kernel launch now passes SIZE*sizeof(int) as the third
 *    <<<...>>> parameter -- required when the kernel declares its
 *    shared buffer `extern __shared__` (harmless if it is static);
 *  - only one int is allocated for / copied into `c`; the original
 *    copied SIZE ints into a single int, corrupting the host stack;
 *  - the malloc'd buffer is freed and the printf strings are proper
 *    "\n"-terminated literals.
 */
int main()
{
    int i;
    srand(time(NULL));
    int *a;
    a = (int*)malloc(SIZE * sizeof(int));
    int c;                                       /* single reduction result */
    int *dev_a, *dev_c;
    cudaMalloc((void **) &dev_a, SIZE*sizeof(int));
    cudaMalloc((void **) &dev_c, sizeof(int));   /* one block -> one partial */
    for( i = 0 ; i < SIZE ; i++)
    {
        a[i] = rand()% 20 + 1;
    }
    for( i = 0 ; i < SIZE ; i++)
    {
        printf("%d ",a[i]);
    }
    cudaMemcpy(dev_a , a, SIZE*sizeof(int),cudaMemcpyHostToDevice);
    /* third launch parameter = dynamic shared-memory bytes for the
       kernel's extern __shared__ buffer */
    max<<<1,SIZE,SIZE*sizeof(int)>>>(dev_a,dev_c);
    /* copy back exactly one int, not SIZE of them */
    cudaMemcpy(&c, dev_c, sizeof(int),cudaMemcpyDeviceToHost);
    printf("\nsum = %d\n",c);
    cudaFree(dev_a);
    cudaFree(dev_c);
    free(a);                                     /* was leaked before */
    printf("\n");
    return 0;
}
This is Cuda and you will probably want something that will work on far larger arrays at some time.
The problem can be divided into 3 stages
each of N threads finds a thread_maximum from SIZE/N elements of the array
in each block find a block_maximum from the thread_maximum’s (for threads in that block)
find an overall_maximum from the block_maximum’s
Assuming that the SIZE is about 10 million and for a start try 256 blocks and 256 threads per block, so N = 256*256. ( can test different numbers once its working)
SIZE/N in this case would be about 160
So now in stage 1 each thread finds a thread_maximum from about 160 values. This can be done using a simple local variable (it will live in a register, and since it is visible only to that thread, no `__syncthreads()` is needed).
You won't need to copy the values from a into a shared array, just into a local variable.
Stage 2 is to find a block_maximum from the 256 threads in the block. To do this, each thread saves its thread_maximum at the end of stage 1 into a shared array, does a `__syncthreads()`, and then the block_maximum is found from this array.
This block_maximum needs to be written to your global array c and only once they have all been written to c ( i.e. all blocks finished) can the overall_maximum be found.
However there is no guarantee about the order that blocks run in, the only way I know of, that is guaranteed to ensure that all blocks have finished, is to wait until the entire kernel finishes and exits.
So to find the overall_maximum from the block_maximums in c use a second kernel call to the same kernel but using only 1 block and c as the input array, d as the output, d[0] should have final result.
In stage 1 you can easily make sure that reads are contiguous i.e. adjacent threads in a block read adjacent cells of a
Also pass in the number of elements in input array as a third parameter to the kernel.
I cannot understand why your code does not work, but the code I posted works for me, and you just have to replace the sum operation with a min (or max) operation. Maybe try separating this if into two ifs:
/*
 * Per-block minimum via the sequential-addressing (halving) reduction.
 * Writes c[blockIdx.x] = min of this block's slice of a.
 *
 * Fixes a typo in the original: `s[tid]` indexed the loop counter `s`
 * as if it were a pointer -- it must be `sdata[tid]`; the posted code
 * did not even compile.
 *
 * Preconditions (NOTE(review): confirm at the call site):
 *  - blockDim.x is a power of two (the halving loop pairs every slot);
 *  - the launch passes blockDim.x * sizeof(int) as the third <<<...>>>
 *    parameter, because sdata is declared `extern __shared__`.
 */
__global__ void max(int *a , int *c )
{
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = a[i];
    __syncthreads();
    for(unsigned int s=blockDim.x/2; s>=1; s=s/2)
    {
        if(tid < s)
        {
            if(sdata[tid] > sdata[tid + s])   /* keep the smaller value: min */
            {
                sdata[tid] = sdata[tid + s];
            }
        }
        __syncthreads();   /* outside the divergent if: whole block reaches it */
    }
    if(tid == 0) c[blockIdx.x] = sdata[0];
}
This should return the min of the data on one block without problems.
Thanks for the help, but it's still the same: for SIZE >= 10 the min returns 0, and the max sometimes returns the right result and sometimes a wrong one. I don't know why it's not working when the same approach works for calculating the sum of the array. I run it over ssh because I don't have an NVIDIA graphics card.
– shared memory can be declared in 2 ways, static and dynamic. In this code the shared memory is declared extern, which means you need to pass its size in the kernel launch.
– you defined the variable d as `int d;`, but you are trying to copy the dev_d array, which holds more than one value.
I modified your code for the case of an array with 512 elements, and it will also work for any power of 2 smaller than 512. For nblocks > 1 you get more values returned in the d array, which you then need to reduce on the CPU or GPU depending on the size of nblocks. Here is my code:
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define tbp 512
#define nblocks 1
/*
 * Per-block minimum: each block writes the smallest of its tbp input
 * values to d[blockIdx.x].
 *
 * Uses a statically sized shared buffer (tbp ints) and the classic
 * halving (sequential-addressing) tree reduction.  Requires tbp to be
 * a power of two and a launch of exactly tbp threads per block.
 */
__global__ void kernel_min(int *a, int *d)
{
    __shared__ int smin[tbp];                  /* one slot per thread */
    unsigned int tid = threadIdx.x;
    unsigned int gid = blockIdx.x * blockDim.x + tid;

    smin[tid] = a[gid];                        /* stage this thread's input */
    __syncthreads();                           /* buffer complete before use */

    /* halve the active range each pass; survivors keep the smaller value */
    for (unsigned int stride = tbp / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride && smin[tid + stride] < smin[tid])
            smin[tid] = smin[tid + stride];
        __syncthreads();                       /* barrier for the whole block */
    }

    if (tid == 0)
        d[blockIdx.x] = smin[0];               /* publish the block minimum */
}
/*
 * Host driver: fill N = tbp*nblocks ints with random values in [5,104],
 * compute the minimum on the GPU with kernel_min, and compare against
 * a CPU reference minimum.
 *
 * Fixes vs. the original:
 *  - the launch uses valid `<<< >>>` syntax (the posted `< < <` was
 *    forum formatting and does not compile);
 *  - the CPU reference is seeded from a[0] instead of the constant 100;
 *    values go up to 104 (rand()%100 + 5), so seeding with 100 could
 *    report a wrong reference minimum;
 *  - the printf format string is a proper "\n"-terminated literal;
 *  - the malloc'd buffers are freed.
 */
int main()
{
    int i;
    const int N=tbp*nblocks;
    srand(time(NULL));
    int *a;
    a = (int*)malloc(N * sizeof(int));
    int *d;
    d = (int*)malloc(nblocks * sizeof(int));
    int *dev_a, *dev_d;
    cudaMalloc((void **) &dev_a, N*sizeof(int));
    cudaMalloc((void **) &dev_d, nblocks*sizeof(int));
    int mmm = 0;                               /* CPU reference minimum */
    for( i = 0 ; i < N ; i++)
    {
        a[i] = rand()% 100 + 5;                /* values in [5, 104] */
        if(i == 0 || a[i] < mmm) mmm = a[i];   /* seed from a[0], then fold */
    }
    cudaMemcpy(dev_a , a, N*sizeof(int),cudaMemcpyHostToDevice);
    kernel_min<<<nblocks,tbp>>>(dev_a,dev_d);  /* static shared mem: no 3rd param */
    cudaMemcpy(d, dev_d, nblocks*sizeof(int),cudaMemcpyDeviceToHost);
    /* NOTE: for nblocks > 1, d[0] is only block 0's minimum; reduce d[]
       on the CPU (or with a second kernel call) for the overall minimum */
    printf("cpu min %d, gpu_min = %d\n",mmm,d[0]);
    cudaFree(dev_a);
    cudaFree(dev_d);
    free(a);                                   /* were leaked before */
    free(d);
    return 0;
}