Finding minimum among multiple threads

Hello guys!

I was wondering how to find the global minimum in a kernel when each thread has its own value. Let us look at a rather simplified example

float min = 0;
float MAD = 0;

int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;

if(idx < n && idy < m)
{
    for(float g=0; g<N; g++)
    {
        float fval = tex2D (tex, idx+(g-2)+0.5f, idy+(g-2)+0.5f);
        MAD += fval; 
    }
    min=MAD;
    pos[0]=idx;
    pos[1]=idy;
}

And the kernel is launched like this

dim3 dimBlock( 16,16 );
dim3 dimGrid;
dimGrid.x = (n + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (m + dimBlock.y - 1)/dimBlock.y;

kernel <<< dimGrid,dimBlock >>> (...);

The way I see it, every thread will now have its own min value. In order to find the global minimum, the program needs to compare the min values across all the threads. The way I would like to achieve this is to use shared memory within each block and then use an atomic function to find the minimum among the blocks. I’ve seen this topic, how to find the min value - CUDA Programming and Performance - NVIDIA Developer Forums, but I need some more help. Can anyone show me how to save each value to shared memory, so I can use the code proposed in the link above?

Best regards
Sondre

I think that example you’re looking at is already complete.

He writes the final result to global memory with

atomicMin(&global_min, array[0]);

which seems appropriate. Also, he collects the result into the 0th index of every block, hence the

for (int i = blockIdx.x/2...)

as i counts down in his loop. Or at least, it should. His syntax seems a little off.
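The canonical form of that loop (a sketch of the usual pattern, with smem standing in for whatever shared array the block uses, not his exact code) would be:

for (int i = blockDim.x / 2; i > 0; i >>= 1)
{
    if (threadIdx.x < i)
        smem[threadIdx.x] = fminf(smem[threadIdx.x], smem[threadIdx.x + i]);
    __syncthreads();
}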

I’m not sure how atomicMin works but have you tried doing it the way he does yet?

I’ve tried a bit, but didn’t manage to get it working. In the example he writes “store results to a shared array”. In my code, I don’t write any result to a shared array. I modified my code in an attempt to get it working

    __shared__ float min [10000];
    float MAD=0;

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;

    if(idx < n && idy < m)
    {
        for(float g=0; g<N; g++)
        {
            float fval = tex2D (tex, idx+(g-2)+0.5f, idy+(g-2)+0.5f);
            MAD += fval;
        }
        min[idx]=MAD;
        pos[0]=idx;
        pos[1]=idy;
    }

    __syncthreads();

    for(int i = blockDim.x / 2; i; i >>= 1)
    {
        if(threadIdx.x < i && min[threadIdx.x + i] < min[threadIdx.x])
            min[threadIdx.x] = min[threadIdx.x + i];

        __syncthreads();
    }

    if(threadIdx.x == 0)
    {
        atomicMin(&global_min, (int)min[0]);
    }

When I try to compile, I get the following error: “error : no instance of overloaded function “atomicMin” matches the argument list”

Is your global_min an int? I could only get atomicMin to work when its arguments were (int*, int).
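For what it’s worth, atomicMin() only has integer overloads (int, unsigned int, and on newer hardware unsigned long long). If you ever need to keep the values as floats, the usual workaround is an atomicCAS() loop. A sketch (the function name is mine, untested):

__device__ float atomicMinFloat(float *addr, float value)
{
    int *addr_as_int = (int *)addr;
    int old = *addr_as_int, assumed;
    // retry until no other thread has changed the value underneath us
    do {
        assumed = old;
        old = atomicCAS(addr_as_int, assumed,
                        __float_as_int(fminf(value, __int_as_float(assumed))));
    } while (assumed != old);
    return __int_as_float(old);
}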

I changed both the min and global_min to int, and it now compiles :) But it still produces the wrong result. I initialized global_min to 1000000 and I know that the MAD value will be less. Nevertheless, atomicMin returns 1000000

Maybe it would be easier for you to see the program I’m working on

I’m currently working on porting a TERCOM algorithm from using only 1 thread to using multiple threads. Briefly explained, the TERCOM algorithm receives 5 measurements and the heading, and compares these measurements to a prestored map. The algorithm chooses the best match, i.e. the lowest Mean Absolute Difference (MAD), and returns the position.

The code is working perfectly with one thread and for-loops, but when I try to use multiple threads and blocks it returns the wrong answer.

Here is the code using for loops

__global__ void kernel (int m, int n, int h, int N, float *f, float heading, float *measurements) 
{
    //Without threads
    float pos[2]={0};
    float theta=heading*(PI/180);
    float MAD=0;

    // Calculate how much to move in x and y direction
    float offset_x = h*cos(theta);
    float offset_y = -h*sin(theta); 

    float min=100000; //Some High value

    //Calculate Mean Absolute Difference
    for(float row=0;row<m;row++)
    {
        for(float col=0;col<n;col++)
        {
            for(float g=0; g<N; g++)
            {
                f[(int)g] = tex2D (tex, col+(g-2)*offset_x+0.5f, row+(g-2)*offset_y+0.5f);
                MAD += abs(measurements[(int)g]-f[(int)g]);
            }
            if(MAD<min) 
            {
                min=MAD;
                pos[0]=col;
                pos[1]=row;
            }
            MAD=0;                  //Reset MAD
        }
    }

    f[0]=min;
    f[1]=pos[0];
    f[2]=pos[1];
}

And this is my attempt to use multiple threads

__global__ void kernel (int m, int n, int h, int N, float *f, float heading, float *measurements) 
{
    __shared__ int min [10000];
    int global_min=1000000;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    float pos[2]={0};
    float theta=heading*(PI/180);
    float MAD=0;

    // Calculate how much to move in x and y direction
    float offset_x = h*cos(theta);
    float offset_y = -h*sin(theta); 

    if(idx < n && idy < m)
    {
        for(float g=0; g<N; g++)
        {
            f[(int)g] = tex2D (tex, idx+(g-2)*offset_x+0.5f, idy+(g-2)*offset_y+0.5f);
            MAD += abs(measurements[(int)g]-f[(int)g]); 
        }

        min[idx]=MAD;
        pos[0]=idx;
        pos[1]=idy;    
    }
    f[1]=pos[0];
    f[2]=pos[1];

    __syncthreads();

    for(int i = blockDim.x / 2; i; i >>= 1)
    {
        if(threadIdx.x < i && min[threadIdx.x + i] < min[threadIdx.x]) 
            min[threadIdx.x] = min[threadIdx.x + i];
        __syncthreads();
    }

    if(threadIdx.x == 0)
    {
        atomicMin(&global_min, min[0]);
    }

    f[0]=global_min;
}

No matter what I do, it will always return the value I initialized (int global_min=1000000;)

I have code that works but it’s subject to race conditions so Idk… I’d have to work on this a lot more but I’ma go visit my gf for the weekend so hopefully someone will come in and help you. Otherwise, don’t give up!

Okay, so I don’t actually know how the atomicMin() or atomicMax() functions work but I think I know the basic algorithm that you can try to implement yourself should all else fail.

So, I don’t think there’s any way to do it with just one kernel call. Or at least, not on architectures that don’t support recursive kernel launches, which is anything less than sm_35.

You need to find a way to get the minimum value per block, using the threadIdx.x for the location of the value within that block. Store the values in a smaller array and then search again, storing the results in a smaller array still.

For example, say we had a 16-element array. The quickest way to do this would probably be to break it up into 8 blocks of two threads each. Find the smaller of each pair. Now we have an 8-element array. Break this into 4 blocks of two threads each. Now we have 4 values left. Break this into 2 blocks of 2, find the smaller values, and now all we’re left with is 2 values, so it’s easy to find the minimum.

I think you can re-use the same addresses too, because to make this work I think you have to constantly write back and forth between the host and the device. That means this probably isn’t particularly good code and its implementation won’t be breaking any speed records, but I can’t imagine a global sort working with any different logic.

No matter what, you just want to break it down so that you’re comparing two elements at a time only. So you do your min sort, get the number of elements divided by 2 returned, rebuild your arrays and then make a fresh kernel call.
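In code, the idea would look something like this (a rough sketch, names are mine, and it assumes the element count is a power of two):

__global__ void pairwise_min(const float *in, float *out, int n_out)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n_out)
        out[i] = fminf(in[2*i], in[2*i + 1]);   // keep the smaller of each pair
}

// On the host, each launch halves the array until one value remains:
// for (int n_out = n / 2; n_out >= 1; n_out /= 2)
// {
//     pairwise_min<<<(n_out + 255) / 256, 256>>>(d_in, d_out, n_out);
//     std::swap(d_in, d_out);   // the output of this pass feeds the next
// }
// d_in[0] now holds the minimum.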

It’s so lame that GPU computing doesn’t really support recursion yet… But alas…

Thank you very much for the reply, MutantJohn. The method you described makes sense, so I will try to implement it :) Even though it is not the fastest code, do you think it would be significantly faster than the single-thread implementation?

I’ve been googling for dear life and tried to implement many different solutions, but nothing works. In most of the examples I’ve seen, the values are already stored in an array.

In my case each thread has its own value, and I haven’t been able to access a specific thread. The only thing I’ve managed is to print out all the values.

So my question for you:
How can I store the value in each of the individual threads in one array?

The variable “global_min” should be an argument of your kernel, not a variable within your kernel. You should allocate it with cudaMalloc() and set its default value with cudaMemset().

This should work:
__global__ void kernel (int m, int n, int h, int N, float *f, float heading, float *measurements, int *global_min)
{

atomicMin(global_min, min[0]);
}
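And on the host side, roughly (a sketch, error checking omitted):

int *global_min;
cudaMalloc(&global_min, sizeof(int));

// cudaMemset() sets bytes, so 0x7f yields 0x7f7f7f7f, a very large positive int;
// alternatively cudaMemcpy an explicit starting value such as 1000000
cudaMemset(global_min, 0x7f, sizeof(int));

kernel<<<dimGrid, dimBlock>>>(m, n, h, N, f, heading, measurements, global_min);

int result;
cudaMemcpy(&result, global_min, sizeof(int), cudaMemcpyDeviceToHost);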

This only stores the value of the minimum, do you also want to store the minimum’s location?

You can also take a look at the CUDA parallel reduction example: http://docs.nvidia.com/cuda/cuda-samples/index.html#cuda-parallel-reduction. This problem is very similar to yours, but instead of finding the sum, you want to find the minimum.
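Adapted to a minimum, the per-block part of that sample would look roughly like this (untested sketch, names are mine):

#include <cfloat>   // FLT_MAX

__global__ void min_reduce(const float *in, float *out, int n)
{
    extern __shared__ float s[];          // sized at launch as blockDim.x * sizeof(float)
    int tid = threadIdx.x;
    int i   = blockIdx.x * blockDim.x + tid;

    s[tid] = (i < n) ? in[i] : FLT_MAX;   // pad out-of-range threads with a huge value
    __syncthreads();

    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride)
            s[tid] = fminf(s[tid], s[tid + stride]);
        __syncthreads();
    }

    if (tid == 0)
        out[blockIdx.x] = s[0];           // one partial minimum per block
}

You then reduce the per-block array out the same way, or finish it off with a single atomicMin.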

If you want to find the location of the minimum, as well as the value, you can do this also using Thrust:

#include <thrust/extrema.h>
thrust::device_ptr<float> dev_ptr(measurements); // measurements is allocated using cudaMalloc() and has 'size' elements
thrust::device_ptr<float> min_pos = thrust::min_element(dev_ptr, dev_ptr + size);
unsigned abs_pos = min_pos - dev_ptr;
float min_val=-1;
cudaMemcpy(&min_val, measurements+abs_pos, sizeof(float), cudaMemcpyDeviceToHost);

Thank you for your reply.

What I don’t understand is how to get all the values from each thread into the min array. Surely it’s not possible to find the minimum when I don’t have access to the values?

Basically you will be doing a reduction/scan to collect a local min per thread block, then repeat the process to get the overall min.

This is a basic concept of parallel programming; Google “CUDA reduction” for examples.

You could also just use thrust::min_element(), which is implemented the same way; Gert-Jan gave you a correct example.
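To connect that to your kernel: each thread writes its own MAD into shared memory at its index within the block (not the global idx), the block reduces, and thread 0 does the atomicMin. A sketch for your 16x16 blocks, untested; the values load is a stand-in for wherever your MAD computation goes:

__global__ void block_min(const int *values, int n, int m, int *global_min)
{
    __shared__ int smad[256];                           // one slot per thread (16x16 block)
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    int tid = threadIdx.y * blockDim.x + threadIdx.x;   // flatten the 2D block

    int myMAD = 1000000;                                // sentinel for out-of-range threads
    if (idx < n && idy < m)
        myMAD = values[idy * n + idx];                  // stand-in for your MAD computation

    smad[tid] = myMAD;
    __syncthreads();

    for (int s = (blockDim.x * blockDim.y) / 2; s > 0; s >>= 1)
    {
        if (tid < s)
            smad[tid] = min(smad[tid], smad[tid + s]);
        __syncthreads();
    }

    if (tid == 0)
        atomicMin(global_min, smad[0]);                 // combine the per-block minima
}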

The code Gert-Jan proposed did work for finding the minimum of the measurements. But the measurements are already stored in an array. None of the examples I’ve found use the Thrust library to find the minimum of a value calculated in the kernel using multiple threads. I tried to pass the MAD variable as a pointer to the kernel like this

__global__ void kernel (int m, int n, int h, int N, int *f, float heading, float *measurements, int *global_min, float *dev_MAD) 
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;

    float theta=heading*(PI/180);
    float fval = 0;

    // Calculate how much to move in x and y direction
    float offset_x = h*cos(theta);
    float offset_y = -h*sin(theta); 

    //Calculate Mean Absolute Difference
    if(idx < n && idy < m)
    {
        for(float g=0; g<N; g++)
        {
            float fval = tex2D (tex, idx+(g-2)*offset_x+0.5f, idy+(g-2)*offset_y+0.5f);
            *dev_MAD += abs(measurements[(int)g]-fval); 
        }
    }
    cuPrintf("%.2f \n", *dev_MAD);
}

And then used the Thrust library on the host

kernel <<< dimGrid,dimBlock >>> (m, n, h, N, dev_results, heading, dev_measurements, global_min, dev_MAD);
	
thrust::device_ptr<float> dev_ptr(dev_MAD); 
thrust::device_ptr<float> min_pos = thrust::min_element(dev_ptr, dev_ptr + n*m);
int abs_pos = min_pos - dev_ptr;
float min_val=min_pos[0];
	
cudaMemcpy(&min_val, dev_MAD+abs_pos, sizeof(float), cudaMemcpyDeviceToHost);

// Print out the result
printf("Min=%.2f pos=%d\n",min_val,abs_pos);

The output I get is: Min=-207521258711807190000000000000000000000.00 pos=0

Can anyone see what I’m doing wrong?

The atomicMin function worked when I changed the kernel to

__global__ void kernel (int m, int n, int h, int N, int *f, float heading, float *measurements, int *global_min) 
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;

    float MAD=0;
    float pos[2];
    float theta=heading*(PI/180);
    float fval = 0;

    // Calculate how much to move in x and y direction
    float offset_x = h*cos(theta);
    float offset_y = -h*sin(theta); 
	
    //Calculate Mean Absolute Difference
    if(idx < n && idy < m)
    {
        for(float g=0; g<N; g++)
        {
            float fval = tex2D (tex, idx+(g-2)*offset_x+0.5f, idy+(g-2)*offset_y+0.5f);
            MAD += abs(measurements[(int)g]-fval); 
        }
        cuPrintf("%.2f \n",MAD);

        // only threads inside the map should compete for the minimum
        atomicMin(global_min, (int)MAD);
        pos[0]=idx;
        pos[1]=idy;
    }

    f[0]=*global_min;
    f[1]=pos[0];
    f[2]=pos[1];
}

Is there any way to make the atomicMin function return the location of the min value? It would make it much easier
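One trick for that, on hardware that supports 64-bit atomicMin (sm_35 and up): pack the value into the high 32 bits and the position into the low 32 bits of an unsigned long long, and atomicMin the packed key, so the winning value carries its location with it. Untested sketch, names are mine, and it assumes the values are non-negative ints:

__global__ void min_with_pos(const int *mad, int n, unsigned long long *best)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        // high bits: value, low bits: index -> keys order by value first
        unsigned long long key =
            ((unsigned long long)mad[i] << 32) | (unsigned int)i;
        atomicMin(best, key);
    }
}

// Afterwards, copy *best back to the host and unpack:
//   value    = (int)(best_host >> 32);
//   position = (int)(best_host & 0xffffffffULL);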