#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include <cooperative_groups.h>
#include <iostream>
#include <cstring>
using namespace std;
#include <conio.h>
#include <cuda.h>
#include <time.h>
#define N 8
#define THREADS 4
#define BLOCKS 2
__global__ void mergeBlocks(int *a, int *temp, int sortedsize)
{
int id = blockIdx.x;
int index1 = id * 2 * sortedsize;
int endIndex1 = index1 + sortedsize;
int index2 = endIndex1;
int endIndex2 = index2 + sortedsize;
int targetIndex = id * 2 * sortedsize;
int done = 0;
while (!done) {
if ((index1 == endIndex1) && (index2 < endIndex2)) {
temp[targetIndex++] = a[index2++];
}
else if ((index2 = endIndex2) && (index1 < endIndex1)) {
temp[targetIndex++] = a[index1++];
}
else if (a[index1] < a[index2]) {
temp[targetIndex++] = a[index1++];
}
else {
temp[targetIndex++] = a[index2]++;
}
if ((index1 == endIndex1) && (index2 == endIndex2))
done = 1;
}
}
__global__ void sortBlocks(int *a)
{
int i = 2;
__shared__ int temp[THREADS];
while (i <= THREADS)
{
if ((threadIdx.x % i) == 0)
{
int index1 = threadIdx.x + (blockIdx.x * blockDim.x);
int endIndex1 = index1 + i / 2;
int index2 = endIndex1;
int endIndex2 = index2 + i / 2;
int targetIndex = threadIdx.x;
int done = 0;
while (!done)
{
if ((index1 == endIndex1) && (index2 < endIndex2))
temp[targetIndex++] = a[index2++];
else if ((index2 == endIndex2) && (index1 < endIndex1))
temp[targetIndex++] = a[index1++];
else if (a[index1] < a[index2])
temp[targetIndex++] = a[index1++];
else
temp[targetIndex++] = a[index2++];
if ((index1 == endIndex1) && (index2 == endIndex2))
done = 1;
}
}
__syncthreads();
a[threadIdx.x + (blockIdx.x * blockDim.x)] = temp[threadIdx.x];
__syncthreads();
i *= 2;
}
}
int main()
{
int a[N];
int *dev_a;
int *dev_temp;
cudaMalloc((void**)& dev_a, N * sizeof(int));
cudaMalloc((void**)& dev_temp, N * sizeof(int));
srand(time(NULL));
for (int i = 0; i < N; i++)
{
int num = rand() % 100;
a[i] = num;
printf("%d\n", a[i]);
}
//printf("\n");
cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
sortBlocks << < BLOCKS, THREADS >> > (dev_a);
cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);
int blocks = BLOCKS / 2;
int sortedsize = THREADS;
while (blocks > 0)
{
mergeBlocks << < blocks, 1 >> > (dev_a, dev_temp, sortedsize );
cudaMemcpy(dev_a, dev_temp, N * sizeof(int), cudaMemcpyDeviceToDevice);
blocks /= 2;
sortedsize *= 2;
}
cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_a);
cudaFree(dev_temp);
for (int i = 0; i < N; i++) {
printf("MergeSort result = %d \n", a[i]);
}
getchar();
return 0;
}
Threads within a block are sorted well. However, Sorting between blocks is not work!
forexample 7,2,5,1,9,8,6,0 (block2,thread4) ㅡ> result is 1,2,5,7(block1) 0,6,8,9(block2)
I think the ‘while’ in the mergeBlocks function seems to be not working well.
I would appreciate your attention to the problem.