I’ve been playing a bit more with the code and added 2 modified approaches. The first 3 implementations are essentially the same as above, but I am using random data. My image validation is comparing image pairs for identity between the methods. The added 2 implementations modify the sorting method from a bubble sort to a rank-order sort (time 4) and a brute-force sorting networks based approach (time 5). The added two implementations both extensively use macros to assist with code generation. Here’s a fully worked example:

```
$ cat t650.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#define IMW 1024
#define IMH 1024
#define BX 32
#define BY 8
#define nTPB (BX*BY)
#define CHECK_VAL 10
#define G(A,B) (array[(A*nTPB)+(threadIdx.y*BX)+threadIdx.x]>array[(B*nTPB)+(threadIdx.y*BX)+threadIdx.x])
#define E(A,B) (array[(A*nTPB)+(threadIdx.y*BX)+threadIdx.x]>=array[(B*nTPB)+(threadIdx.y*BX)+threadIdx.x])
#define SWAP(A,B) if (array[(A*nTPB)+(threadIdx.y*BX)+threadIdx.x]<array[(B*nTPB)+(threadIdx.y*BX)+threadIdx.x])\
{ unsigned char tmp = array[(A*nTPB)+(threadIdx.y*BX)+threadIdx.x]; \
array[(A*nTPB)+(threadIdx.y*BX)+threadIdx.x] = array[(B*nTPB)+(threadIdx.y*BX)+threadIdx.x]; \
array[(B*nTPB)+(threadIdx.y*BX)+threadIdx.x] = tmp;}
// IMN is assumed to be odd, for convenience of median filter
#define IMN 21
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
unsigned long long dtime_usec(unsigned long long prev){
#define USECPSEC 1000000ULL
timeval tv1;
gettimeofday(&tv1,0);
return ((tv1.tv_sec * USECPSEC)+tv1.tv_usec) - prev;
}
int validate(unsigned char *data1, unsigned char *data2){
int pass = 1;
for (int i = 0; i < IMH; i++)
for (int j = 0; j < IMW; j++)
if (data1[i*IMW+j] != data2[i*IMW+j])
if (pass) { printf("image error: (%d,%d): %d:%d\n", i,j,data1[i*IMW+j], data2[i*IMW+j]); pass = 0;}
return pass;
}
// template parameter is expected to be either unsigned char or uchar4 only
template <typename T>
__global__ void filter( unsigned char* frames, unsigned char* result)
{
__shared__ unsigned char array[IMN*nTPB*sizeof(T)];
int x = threadIdx.x+blockDim.x*blockIdx.x;
int y = threadIdx.y+blockDim.y*blockIdx.y;
int num = x + y * (IMW/sizeof(T));
T *my_frames = reinterpret_cast<T *>(frames);
T *my_array = reinterpret_cast<T *>(array);
if ((x < IMW/sizeof(T)) && (y < IMH)){
for (int i = 0; i < IMN; i++){
my_array[(i*nTPB)+(threadIdx.y*BX)+threadIdx.x] = my_frames[(i * (IMW*IMH/sizeof(T))) + ( num)];
}
for (int bpp = 0; bpp < (sizeof(T)); bpp++)
for (int j = 0; j < ((IMN/2)+1); j++){
int num2 = j;
for (int k = j + 1; k < IMN; k++){
if (array[(k*nTPB*sizeof(T))+(threadIdx.y*BX*sizeof(T))+threadIdx.x*sizeof(T)+bpp] < array[(num2*nTPB*sizeof(T))+(threadIdx.y*BX*sizeof(T))+threadIdx.x*sizeof(T)+bpp]){
num2 = k;
}
}
unsigned char b = array[(j*nTPB*sizeof(T))+(threadIdx.y*BX*sizeof(T))+threadIdx.x*sizeof(T)+bpp];
array[(j*nTPB*sizeof(T))+(threadIdx.y*BX*sizeof(T))+threadIdx.x*sizeof(T)+bpp] = array[(num2*nTPB*sizeof(T))+(threadIdx.y*BX*sizeof(T))+threadIdx.x*sizeof(T)+bpp];
array[(num2*nTPB*sizeof(T))+(threadIdx.y*BX*sizeof(T))+threadIdx.x*sizeof(T)+bpp] = b;
}
T *my_result = reinterpret_cast<T *>(result);
my_result[(num)] = my_array[((IMN/2)*nTPB) + (threadIdx.y*BX)+threadIdx.x];
}
}
__global__ void filter2( unsigned char* frames, unsigned char* result)
{
__shared__ unsigned char array[IMN*nTPB];
int x = threadIdx.x+blockDim.x*blockIdx.x;
int y = threadIdx.y+blockDim.y*blockIdx.y;
int num = x + y * (IMW);
if ((x < IMW) && (y < IMH)){
for (int i = 0; i < IMN; i++)
array[(i*nTPB)+(threadIdx.y*BX)+threadIdx.x] = frames[(i * (IMW*IMH)) + ( num)];
int ot;
if (10 == G(0,1) +G(0,2) +G(0,3) +G(0,4) +G(0,5) +G(0,6) +G(0,7) +G(0,8) +G(0,9) +G(0,10) +G(0,11) +G(0,12) +G(0,13) +G(0,14) +G(0,15) +G(0,16) +G(0,17) +G(0,18) +G(0,19) +G(0,20))
ot = 0;
else if (10 == E(1,0) +G(1,2) +G(1,3) +G(1,4) +G(1,5) +G(1,6) +G(1,7) +G(1,8) +G(1,9) +G(1,10) +G(1,11) +G(1,12) +G(1,13) +G(1,14) +G(1,15) +G(1,16) +G(1,17) +G(1,18) +G(1,19) +G(1,20))
ot = 1;
else if (10 == E(2,0) +E(2,1) +G(2,3) +G(2,4) +G(2,5) +G(2,6) +G(2,7) +G(2,8) +G(2,9) +G(2,10) +G(2,11) +G(2,12) +G(2,13) +G(2,14) +G(2,15) +G(2,16) +G(2,17) +G(2,18) +G(2,19) +G(2,20))
ot = 2;
else if (10 == E(3,0) +E(3,1) +E(3,2) +G(3,4) +G(3,5) +G(3,6) +G(3,7) +G(3,8) +G(3,9) +G(3,10) +G(3,11) +G(3,12) +G(3,13) +G(3,14) +G(3,15) +G(3,16) +G(3,17) +G(3,18) +G(3,19) +G(3,20))
ot = 3;
else if (10 == E(4,0) +E(4,1) +E(4,2) +E(4,3) +G(4,5) +G(4,6) +G(4,7) +G(4,8) +G(4,9) +G(4,10) +G(4,11) +G(4,12) +G(4,13) +G(4,14) +G(4,15) +G(4,16) +G(4,17) +G(4,18) +G(4,19) +G(4,20))
ot = 4;
else if (10 == E(5,0) +E(5,1) +E(5,2) +E(5,3) +E(5,4) +G(5,6) +G(5,7) +G(5,8) +G(5,9) +G(5,10) +G(5,11) +G(5,12) +G(5,13) +G(5,14) +G(5,15) +G(5,16) +G(5,17) +G(5,18) +G(5,19) +G(5,20))
ot = 5;
else if (10 == E(6,0) +E(6,1) +E(6,2) +E(6,3) +E(6,4) +E(6,5) +G(6,7) +G(6,8) +G(6,9) +G(6,10) +G(6,11) +G(6,12) +G(6,13) +G(6,14) +G(6,15) +G(6,16) +G(6,17) +G(6,18) +G(6,19) +G(6,20))
ot = 6;
else if (10 == E(7,0) +E(7,1) +E(7,2) +E(7,3) +E(7,4) +E(7,5) +E(7,6) +G(7,8) +G(7,9) +G(7,10) +G(7,11) +G(7,12) +G(7,13) +G(7,14) +G(7,15) +G(7,16) +G(7,17) +G(7,18) +G(7,19) +G(7,20))
ot = 7;
else if (10 == E(8,0) +E(8,1) +E(8,2) +E(8,3) +E(8,4) +E(8,5) +E(8,6) +E(8,7) +G(8,9) +G(8,10) +G(8,11) +G(8,12) +G(8,13) +G(8,14) +G(8,15) +G(8,16) +G(8,17) +G(8,18) +G(8,19) +G(8,20))
ot = 8;
else if (10 == E(9,0) +E(9,1) +E(9,2) +E(9,3) +E(9,4) +E(9,5) +E(9,6) +E(9,7) +E(9,8) +G(9,10) +G(9,11) +G(9,12) +G(9,13) +G(9,14) +G(9,15) +G(9,16) +G(9,17) +G(9,18) +G(9,19) +G(9,20))
ot = 9;
else if (10 == E(10,0)+E(10,1)+E(10,2)+E(10,3)+E(10,4)+E(10,5)+E(10,6)+E(10,7)+E(10,8)+E(10,9) +G(10,11)+G(10,12)+G(10,13)+G(10,14)+G(10,15)+G(10,16)+G(10,17)+G(10,18)+G(10,19)+G(10,20))
ot = 10;
else if (10 == E(11,0)+E(11,1)+E(11,2)+E(11,3)+E(11,4)+E(11,5)+E(11,6)+E(11,7)+E(11,8)+E(11,9) +E(11,10)+G(11,12)+G(11,13)+G(11,14)+G(11,15)+G(11,16)+G(11,17)+G(11,18)+G(11,19)+G(11,20))
ot = 11;
else if (10 == E(12,0)+E(12,1)+E(12,2)+E(12,3)+E(12,4)+E(12,5)+E(12,6)+E(12,7)+E(12,8)+E(12,9) +E(12,10)+E(12,11)+G(12,13)+G(12,14)+G(12,15)+G(12,16)+G(12,17)+G(12,18)+G(12,19)+G(12,20))
ot = 12;
else if (10 == E(13,0)+E(13,1)+E(13,2)+E(13,3)+E(13,4)+E(13,5)+E(13,6)+E(13,7)+E(13,8)+E(13,9) +E(13,10)+E(13,11)+E(13,12)+G(13,14)+G(13,15)+G(13,16)+G(13,17)+G(13,18)+G(13,19)+G(13,20))
ot = 13;
else if (10 == E(14,0)+E(14,1)+E(14,2)+E(14,3)+E(14,4)+E(14,5)+E(14,6)+E(14,7)+E(14,8)+E(14,9) +E(14,10)+E(14,11)+E(14,12)+E(14,13)+G(14,15)+G(14,16)+G(14,17)+G(14,18)+G(14,19)+G(14,20))
ot = 14;
else if (10 == E(15,0)+E(15,1)+E(15,2)+E(15,3)+E(15,4)+E(15,5)+E(15,6)+E(15,7)+E(15,8)+E(15,9) +E(15,10)+E(15,11)+E(15,12)+E(15,13)+E(15,14)+G(15,16)+G(15,17)+G(15,18)+G(15,19)+G(15,20))
ot = 15;
else if (10 == E(16,0)+E(16,1)+E(16,2)+E(16,3)+E(16,4)+E(16,5)+E(16,6)+E(16,7)+E(16,8)+E(16,9) +E(16,10)+E(16,11)+E(16,12)+E(16,13)+E(16,14)+E(16,15)+G(16,17)+G(16,18)+G(16,19)+G(16,20))
ot = 16;
else if (10 == E(17,0)+E(17,1)+E(17,2)+E(17,3)+E(17,4)+E(17,5)+E(17,6)+E(17,7)+E(17,8)+E(17,9) +E(17,10)+E(17,11)+E(17,12)+E(17,13)+E(17,14)+E(17,15)+E(17,16)+G(17,18)+G(17,19)+G(17,20))
ot = 17;
else if (10 == E(18,0)+E(18,1)+E(18,2)+E(18,3)+E(18,4)+E(18,5)+E(18,6)+E(18,7)+E(18,8)+E(18,9) +E(18,10)+E(18,11)+E(18,12)+E(18,13)+E(18,14)+E(18,15)+E(18,16)+E(18,17)+G(18,19)+G(18,20))
ot = 18;
else if (10 == E(19,0)+E(19,1)+E(19,2)+E(19,3)+E(19,4)+E(19,5)+E(19,6)+E(19,7)+E(19,8)+E(19,9) +E(19,10)+E(19,11)+E(19,12)+E(19,13)+E(19,14)+E(19,15)+E(19,16)+E(19,17)+E(19,18)+G(19,20))
ot = 19;
else
ot = 20;
result[(num)] = array[(ot*nTPB) + (threadIdx.y*BX)+threadIdx.x];
}
}
__global__ void filter3( unsigned char* frames, unsigned char* result)
{
__shared__ unsigned char array[IMN*nTPB];
int x = threadIdx.x+blockDim.x*blockIdx.x;
int y = threadIdx.y+blockDim.y*blockIdx.y;
int num = x + y * (IMW);
if ((x < IMW) && (y < IMH)){
for (int i = 0; i < IMN; i++)
array[(i*nTPB)+(threadIdx.y*BX)+threadIdx.x] = frames[(i * (IMW*IMH)) + ( num)];
SWAP(0, 1);
SWAP(3, 4);
SWAP(2, 4);
SWAP(2, 3);
SWAP(0, 3);
SWAP(0, 2);
SWAP(1, 4);
SWAP(1, 3);
SWAP(1, 2);
SWAP(5, 6);
SWAP(8, 9);
SWAP(7, 9);
SWAP(7, 8);
SWAP(5, 8);
SWAP(5, 7);
SWAP(6, 9);
SWAP(6, 8);
SWAP(6, 7);
SWAP(0, 5);
SWAP(1, 6);
SWAP(1, 5);
SWAP(2, 7);
SWAP(3, 8);
SWAP(4, 9);
SWAP(4, 8);
SWAP(3, 7);
SWAP(4, 7);
SWAP(2, 5);
SWAP(3, 6);
SWAP(4, 6);
SWAP(3, 5);
SWAP(4, 5);
SWAP(10, 11);
SWAP(13, 14);
SWAP(12, 14);
SWAP(12, 13);
SWAP(10, 13);
SWAP(10, 12);
SWAP(11, 14);
SWAP(11, 13);
SWAP(11, 12);
SWAP(16, 17);
SWAP(15, 17);
SWAP(15, 16);
SWAP(19, 20);
SWAP(18, 20);
SWAP(18, 19);
SWAP(15, 18);
SWAP(16, 19);
SWAP(17, 20);
SWAP(17, 19);
SWAP(16, 18);
SWAP(17, 18);
SWAP(10, 16);
SWAP(10, 15);
SWAP(11, 17);
SWAP(11, 16);
SWAP(11, 15);
SWAP(12, 18);
SWAP(13, 19);
SWAP(14, 20);
SWAP(14, 19);
SWAP(13, 18);
SWAP(14, 18);
SWAP(12, 15);
SWAP(13, 16);
SWAP(14, 17);
SWAP(14, 16);
SWAP(13, 15);
SWAP(14, 15);
SWAP(0, 11);
SWAP(0, 10);
SWAP(1, 12);
SWAP(1, 11);
SWAP(1, 10);
SWAP(2, 13);
SWAP(3, 14);
SWAP(4, 15);
SWAP(4, 14);
SWAP(3, 13);
SWAP(4, 13);
SWAP(2, 10);
SWAP(3, 11);
SWAP(4, 12);
SWAP(4, 11);
SWAP(3, 10);
SWAP(4, 10);
SWAP(5, 16);
SWAP(6, 17);
SWAP(6, 16);
SWAP(7, 18);
SWAP(8, 19);
SWAP(9, 20);
SWAP(9, 19);
SWAP(8, 18);
SWAP(9, 18);
SWAP(7, 16);
SWAP(8, 17);
SWAP(9, 17);
SWAP(8, 16);
SWAP(9, 16);
SWAP(5, 11);
SWAP(5, 10);
SWAP(6, 12);
SWAP(6, 11);
SWAP(6, 10);
SWAP(7, 13);
SWAP(8, 14);
SWAP(9, 15);
SWAP(9, 14);
SWAP(8, 13);
SWAP(9, 13);
SWAP(7, 10);
SWAP(8, 11);
SWAP(9, 12);
SWAP(9, 11);
SWAP(8, 10);
SWAP(9, 10);
result[(num)] = array[(10*nTPB) + (threadIdx.y*BX)+threadIdx.x];
}
}
int main(){
unsigned char *din, *dout1, *dout2, *dout3, *dout4, *dout5;
unsigned char *hin, *hout1, *hout2, *hout3, *hout4, *hout5;
cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
hin = (unsigned char *)malloc(IMW*IMH*IMN*sizeof(unsigned char));
hout1 = (unsigned char *)malloc(IMW*IMH*sizeof(unsigned char));
hout2 = (unsigned char *)malloc(IMW*IMH*sizeof(unsigned char));
hout3 = (unsigned char *)malloc(IMW*IMH*sizeof(unsigned char));
hout4 = (unsigned char *)malloc(IMW*IMH*sizeof(unsigned char));
hout5 = (unsigned char *)malloc(IMW*IMH*sizeof(unsigned char));
cudaMalloc(&din, IMW*IMH*IMN*sizeof(unsigned char));
cudaMalloc(&dout1, IMW*IMH*sizeof(unsigned char));
cudaMalloc(&dout2, IMW*IMH*sizeof(unsigned char));
cudaMalloc(&dout3, IMW*IMH*sizeof(unsigned char));
cudaMalloc(&dout4, IMW*IMH*sizeof(unsigned char));
cudaMalloc(&dout5, IMW*IMH*sizeof(unsigned char));
for (int i = 0; i < IMN; i++)
for (int j = 0; j< IMH*IMW; j++)
//hin[i*IMH*IMW+j] = (i+11)%IMN;
hin[i*IMH*IMW+j] = rand()%255;
memset(hout1, 0, IMW*IMH*sizeof(unsigned char));
memset(hout2, 0, IMW*IMH*sizeof(unsigned char));
memset(hout3, 0, IMW*IMH*sizeof(unsigned char));
memset(hout4, 0, IMW*IMH*sizeof(unsigned char));
memset(hout5, 0, IMW*IMH*sizeof(unsigned char));
cudaMemcpy(din, hin, IMW*IMH*IMN*sizeof(unsigned char), cudaMemcpyHostToDevice);
cudaMemset(dout1, 0, IMW*IMH*sizeof(unsigned char));
cudaMemset(dout2, 0, IMW*IMH*sizeof(unsigned char));
cudaMemset(dout3, 0, IMW*IMH*sizeof(unsigned char));
cudaMemset(dout4, 0, IMW*IMH*sizeof(unsigned char));
cudaMemset(dout5, 0, IMW*IMH*sizeof(unsigned char));
cudaCheckErrors("setup");
unsigned long long my_time = dtime_usec(0);
filter<unsigned char><<<dim3(IMW,IMH), 1>>>(din, dout1);
cudaDeviceSynchronize();
cudaCheckErrors("kernel");
my_time = dtime_usec(my_time);
printf("time 1 = %fs\n", my_time/(float)USECPSEC);
cudaMemcpy(hout1, dout1, IMW*IMH*sizeof(unsigned char), cudaMemcpyDeviceToHost);
// if (!validate(hout1)) {printf("image 1 fail\n"); }
my_time = dtime_usec(0);
filter<unsigned char><<<dim3(IMW/BX,IMH/BY), dim3(BX,BY)>>>(din, dout2);
cudaDeviceSynchronize();
cudaCheckErrors("kernel");
my_time = dtime_usec(my_time);
printf("time 2 = %fs\n", my_time/(float)USECPSEC);
cudaMemcpy(hout2, dout2, IMW*IMH*sizeof(unsigned char), cudaMemcpyDeviceToHost);
if (!validate(hout1, hout2)) {printf("image 2 fail\n"); }
my_time = dtime_usec(0);
filter<uchar4><<<dim3(IMW/(4*BX),IMH/BY), dim3(BX,BY)>>>(din, dout3);
cudaDeviceSynchronize();
cudaCheckErrors("kernel");
my_time = dtime_usec(my_time);
printf("time 3 = %fs\n", my_time/(float)USECPSEC);
cudaMemcpy(hout3, dout3, IMW*IMH*sizeof(unsigned char), cudaMemcpyDeviceToHost);
if (!validate(hout1, hout3)) {printf("image 3 fail\n"); }
my_time = dtime_usec(0);
filter2<<<dim3(IMW/BX,IMH/BY), dim3(BX,BY)>>>(din, dout4);
cudaDeviceSynchronize();
cudaCheckErrors("kernel");
my_time = dtime_usec(my_time);
printf("time 4 = %fs\n", my_time/(float)USECPSEC);
cudaMemcpy(hout4, dout4, IMW*IMH*sizeof(unsigned char), cudaMemcpyDeviceToHost);
if (!validate(hout1, hout4)) {printf("image 4 fail\n"); }
my_time = dtime_usec(0);
filter3<<<dim3(IMW/BX,IMH/BY), dim3(BX,BY)>>>(din, dout5);
cudaDeviceSynchronize();
cudaCheckErrors("kernel");
my_time = dtime_usec(my_time);
printf("time 5 = %fs\n", my_time/(float)USECPSEC);
cudaMemcpy(hout5, dout5, IMW*IMH*sizeof(unsigned char), cudaMemcpyDeviceToHost);
if (!validate(hout1, hout5)) {printf("image 5 fail\n"); }
return 0;
}
$ nvcc -O3 -arch=sm_20 -o t650 t650.cu
$ ./t650
time 1 = 0.293081s
time 2 = 0.006445s
time 3 = 0.006166s
time 4 = 0.005773s
time 5 = 0.003592s
$
```

Speedup over 1 thread per block approach (#1):

2: 45x

3: 47x

4: 50x

5: 81x