CUDA Median of Array Operation

Hi, I am trying to get the median of an array. When I input this array (3 2 5 8 9 4 1 7 6), it returns "0".
I run the code from the terminal like this: ./main input.txt 9
where 9 is the number of entries.
Where am I going wrong? Could you help me?

CODE:

#include <iostream>
#include <fstream>
#include <cstdlib>

#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)



inline void __checkCudaErrors(cudaError err, const char *file, const int line) {
        if(cudaSuccess != err) {
                std::cout << file << "(" << line << ") : CUDA Runtime API error " << (int) err << ": " << cudaGetErrorString(err) << std::endl;
                exit(3);
        }
}

inline void __getLastCudaError(const char *errorMsg, const char *file, const int line) {
        cudaError_t err = cudaGetLastError();
        if(cudaSuccess != err) {
                std::cout << file << "(" << line << ") : getLastCudaError() CUDA error : " << errorMsg << " : (" << (int) err << ") " << cudaGetErrorString(err) << std::endl;
                exit(3);
        }
}

__device__ inline void swapGpu(int &a, int &b) {
        int dum = a;
        a = b;
        b = dum;
}

__global__ void gpuMedOdd(int *entries, int *med, int numEntries) {
        extern __shared__ int sdata[];

        int tid = threadIdx.x;
        int i = blockIdx.x * (blockDim.x * 3) + threadIdx.x;

        if(i + 2 * blockDim.x < numEntries) {
                int list[3];
                list[0] = entries[i], list[1] = entries[i + blockDim.x], list[2] = entries[i + 2 * blockDim.x];
                if(list[1] < list[0])
                        swapGpu(list[1], list[0]);
                if(list[2] < list[0])
                        swapGpu(list[2], list[0]);
                if(list[2] < list[1])
                        swapGpu(list[2], list[1]);

                sdata[tid] = list[1];
        }

        __syncthreads();

        for(int s = blockDim.x / 3; s > 0; s /= 3) {
                if(tid < s && tid + 2 * s < blockDim.x) {
                        int list[3];
                        list[0] = sdata[tid], list[1] = sdata[tid + s], list[2] = sdata[tid + 2 * s];
                        if(list[1] < list[0])
                                swapGpu(list[1], list[0]);
                        if(list[2] < list[0])
                                swapGpu(list[2], list[0]);
                        if(list[2] < list[1])
                                swapGpu(list[2], list[1]);

                        sdata[tid] = list[1];
                }

                __syncthreads();
        }

        *med = sdata[0];
}

int main(int argc, char *argv[]) {
        if(argc != 3) {
                std::cout << "ERROR: Incorrect number of input arguments" << std::endl;
                std::cout << "Proper usage: " << argv[0] << " fileName numEntries" << std::endl;
                exit(1);
        }

        std::ifstream inp(argv[1]);

        if(!inp.is_open()) {
                std::cout << "ERROR: File I/O error" << std::endl;
                std::cout << "Could not find file " << argv[1] << std::endl;
                exit(2);
        }

        int numEntries = atoi(argv[2]), i = 0;

        int *entries = new int[numEntries];

        while(inp >> entries[i] && i < numEntries)
                i++;

        if(i < numEntries) {
                std::cout << "ERROR: File I/O error" << std::endl;
                std::cout << "Command-line input suggested " << numEntries << " entries, but only found " << i << " entries" << std::endl;
                exit(2);
        }

        if(inp >> i) {
                std::cout << "ERROR: File I/O error" << std::endl;
                std::cout << "Command-line input suggested " << numEntries << " entries, but file contains more entries" << std::endl;
                exit(2);
        }

        int *d_entries;

        checkCudaErrors(cudaMalloc(&d_entries, sizeof(int) * numEntries));
        checkCudaErrors(cudaMemcpy(d_entries, entries, sizeof(int) * numEntries, cudaMemcpyHostToDevice));

        if(numEntries % 2) {
            std::cout << " if " << std::endl;
                int med, *d_med;
                checkCudaErrors(cudaMalloc(&d_med, sizeof(int)));
                gpuMedOdd<<<9, numEntries / 9, numEntries / 9 * sizeof(int)>>>(d_entries, d_med, numEntries);
                getLastCudaError("kernel launch failure");

                checkCudaErrors(cudaMemcpy(&med, d_med, sizeof(int), cudaMemcpyDeviceToHost));

                std::cout << "The median value is: " << med << std::endl;
        }
        else { 
            std::cout << " Else " << std::endl;
                int *d_med, med;
                cudaMalloc(&d_med, sizeof(int));
                gpuMedOdd<<<9, numEntries / 9, numEntries / 9 * sizeof(int)>>>(d_entries, d_med, numEntries);
                getLastCudaError("kernel launch failure");

                checkCudaErrors(cudaMemcpy(&med, d_med, sizeof(int), cudaMemcpyDeviceToHost));

                std::cout << "The median value is: " << med << std::endl;
        }
        exit(0);
}

Perhaps you should explain the method or strategy you are trying to implement. Right now I see that you are launching 9 blocks of 1 thread each, and each block is writing to the output location *med. I don’t see how that could possibly be correct.

Median in my experience usually involves a sorting step. Is this some kind of sort or parallel sort you are trying to implement?

There are various topics on this board discussing median filtering, which may be of interest, and NPP has median filtering functions.
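In the meantime, as a sanity check, the expected answer is easy to get on the host with a partial sort. This is only a reference to compare the kernel against (for your sample input it should return 5), not the GPU solution:

#include <algorithm>
#include <vector>

// Host-side reference median for an odd-length array: nth_element places the
// middle element in its sorted position without fully sorting the data.
int medianReference(std::vector<int> v) {
    std::nth_element(v.begin(), v.begin() + v.size() / 2, v.end());
    return v[v.size() / 2];
}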

Btw, if you have a small and fixed-size array (e.g. < 30 elements or so), you should consider ‘sorting networks’ for calculating the median. You can generate code for it (for a specific array size N) automatically - see website http://pages.ripco.net/~jgamble/nw.html
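Just to illustrate the shape of that kind of code (the generator on that site gives you a minimal network for your exact N; the sketch below uses a simple, non-minimal odd-even transposition network instead, so take it only as an example of the branch-free compare-exchange style):

// Compare-exchange: the only primitive a sorting network is built from.
__device__ __forceinline__ void cmpSwap(int &a, int &b) {
    int lo = min(a, b), hi = max(a, b);
    a = lo;
    b = hi;
}

// Odd-even transposition network for a small, fixed N: N rounds of fixed,
// data-independent compare-exchanges, easy to unroll completely per thread.
template <int N>
__device__ void sortNetwork(int (&v)[N]) {
    #pragma unroll
    for (int round = 0; round < N; ++round) {
        #pragma unroll
        for (int i = round & 1; i + 1 < N; i += 2)
            cmpSwap(v[i], v[i + 1]);
    }
}
// After sortNetwork(v), the median of an odd-sized array is v[N / 2].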

Here’s an example of median filtering with a few slightly different approaches, one of which uses the sorting network definition lifted from the link provided by HannesF99:

https://devtalk.nvidia.com/default/topic/797876/cuda-programming-and-performance/median-filter-time-dimension-for-images-bad-performance-no-memory-coalescence/post/4401300/#4401300

This is fairly trivial because the median filter (and thus the associated sort) is handled per-thread.
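Roughly, the per-thread pattern looks like this. A minimal sketch only (the kernel and parameter names are made up): each thread takes one short row of a matrix, sorts it entirely in its own local array, and writes that row’s median, so there is no shared memory and no inter-thread reduction at all:

// One thread computes the median of one row of a rowCount x rowLen matrix.
// Assumes rowLen is odd and small (here at most 32 elements per row).
__global__ void rowMedianKernel(const int *data, int *rowMedian,
                                int rowCount, int rowLen) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= rowCount) return;

    int v[32];                                   // per-thread working copy
    for (int j = 0; j < rowLen; ++j)
        v[j] = data[row * rowLen + j];

    for (int j = 1; j < rowLen; ++j) {           // insertion sort in-thread
        int key = v[j], k = j - 1;
        while (k >= 0 && v[k] > key) { v[k + 1] = v[k]; --k; }
        v[k + 1] = key;
    }

    rowMedian[row] = v[rowLen / 2];              // middle element of sorted row
}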

Sorry for nitpicking, but for completeness I’d like to point out that finding the median requires fewer comparisons than a full sorting network. How many exactly is an open research problem for n > 20, as I recall. For n=9 you may want to look at this question on Stackoverflow:

https://stackoverflow.com/questions/45453537/optimal-9-element-sorting-network-that-reduces-to-an-optimal-median-of-9-network
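For reference, the widely used 19-exchange median-of-9 network (after Paeth; also given in Devillard’s “Fast median search” note) is an example of exactly that: it needs fewer compare-exchanges than a full 9-element sorting network, and only the middle element is guaranteed to end up in its sorted position:

// Compare-exchange: after the call, a <= b.
__device__ __forceinline__ void cmpSwap(int &a, int &b) {
    int lo = min(a, b), hi = max(a, b);
    a = lo;
    b = hi;
}

// 19-exchange median-of-9 (Paeth / Devillard). Overwrites p; only p[4] is
// guaranteed to hold its final value, and that value is the median.
__device__ int median9(int p[9]) {
    cmpSwap(p[1], p[2]); cmpSwap(p[4], p[5]); cmpSwap(p[7], p[8]);
    cmpSwap(p[0], p[1]); cmpSwap(p[3], p[4]); cmpSwap(p[6], p[7]);
    cmpSwap(p[1], p[2]); cmpSwap(p[4], p[5]); cmpSwap(p[7], p[8]);
    cmpSwap(p[0], p[3]); cmpSwap(p[5], p[8]); cmpSwap(p[4], p[7]);
    cmpSwap(p[3], p[6]); cmpSwap(p[1], p[4]); cmpSwap(p[2], p[5]);
    cmpSwap(p[4], p[7]); cmpSwap(p[4], p[2]); cmpSwap(p[6], p[4]);
    cmpSwap(p[4], p[2]);
    return p[4];
}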

Thanks for your replies. First, I want to explain my strategy. I want to take the median of each sorted row of a two-channel optical flow matrix. I started with basic 1D arrays because I am not experienced with CUDA programming. Also, whether the sorting is parallel or not is not important to me. I thought that if I sorted the array inside a reduction, I could find the median too.

I think a median filter is different from this strategy. Could this implementation work for 2-channel arrays? And I cannot solve this kernel problem. What can I do differently?

What application case do you have? Video stabilization?
Note in NPP there are quite a few median functions, see https://docs.nvidia.com/cuda/npp/group__image__filter__median.html

Yes, video stabilization, but median filtering does not work for this application.
Thanks for your contributions.