In Merge sort , the sum of the blocks does not work

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include <cooperative_groups.h>
#include <iostream>
#include <cstring>
using namespace std;
#include <conio.h>
#include <cuda.h>
#include <time.h>

#define N 8
#define THREADS 4
#define BLOCKS 2

__global__ void mergeBlocks(int *a, int *temp, int sortedsize)
{

	

	int id = blockIdx.x;

	
	int index1 = id * 2 * sortedsize;
	int endIndex1 = index1 + sortedsize;
	int index2 = endIndex1;
	int endIndex2 = index2 + sortedsize;
	int targetIndex = id * 2 * sortedsize;
	int done = 0;

	

	while (!done) {

		if ((index1 == endIndex1) && (index2 < endIndex2)) {
			temp[targetIndex++] = a[index2++];
		}

		else if ((index2 = endIndex2) && (index1 < endIndex1)) {
			temp[targetIndex++] = a[index1++];
		}

		else if (a[index1] < a[index2]) {
			temp[targetIndex++] = a[index1++];
		}

		else {
			temp[targetIndex++] = a[index2]++;
		}

		if ((index1 == endIndex1) && (index2 == endIndex2))
			done = 1;

	}

}

__global__ void sortBlocks(int *a)
{
	int i = 2;
	__shared__ int temp[THREADS];

	while (i <= THREADS)
	{
		if ((threadIdx.x % i) == 0)
		{
			int index1 = threadIdx.x + (blockIdx.x * blockDim.x);
			int endIndex1 = index1 + i / 2;
			int index2 = endIndex1;
			int endIndex2 = index2 + i / 2;
			int targetIndex = threadIdx.x;
			int done = 0;

			while (!done)
			{
				if ((index1 == endIndex1) && (index2 < endIndex2))
					temp[targetIndex++] = a[index2++];

				else if ((index2 == endIndex2) && (index1 < endIndex1))
					temp[targetIndex++] = a[index1++];

				else if (a[index1] < a[index2])
					temp[targetIndex++] = a[index1++];

				else
					temp[targetIndex++] = a[index2++];

				if ((index1 == endIndex1) && (index2 == endIndex2))
					done = 1;
				
			}

		}
		__syncthreads();
	
		a[threadIdx.x + (blockIdx.x * blockDim.x)] = temp[threadIdx.x];
		__syncthreads();
		
		i *= 2;
	
	}
}

int main()
{
	int a[N];
	int *dev_a;
	int *dev_temp;

	cudaMalloc((void**)& dev_a, N * sizeof(int));
	cudaMalloc((void**)& dev_temp, N * sizeof(int));

	srand(time(NULL));
	for (int i = 0; i < N; i++)
	{
		int num = rand() % 100;
		a[i] = num;
		printf("%d\n", a[i]);
	}
	//printf("\n");

	cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
	
	sortBlocks << < BLOCKS, THREADS >> > (dev_a);

	cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);

	

	
	
	int blocks = BLOCKS / 2;
	int sortedsize = THREADS;

	
	
	while (blocks > 0)
	{
		mergeBlocks << < blocks, 1 >> > (dev_a, dev_temp, sortedsize );
		cudaMemcpy(dev_a, dev_temp, N * sizeof(int), cudaMemcpyDeviceToDevice);
		blocks /= 2;
		sortedsize *= 2;
	}

	cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);
	
	

	cudaFree(dev_a);
	cudaFree(dev_temp);

	for (int i = 0; i < N; i++) {
		printf("MergeSort result = %d \n", a[i]);
	}

	getchar();
	return 0;
}

Threads within a block are sorted well. However, Sorting between blocks is not work!

forexample 7,2,5,1,9,8,6,0 (block2,thread4) ㅡ> result is 1,2,5,7(block1) 0,6,8,9(block2)

I think the ‘while’ in the mergeBlocks function seems to be not working well.

I would appreciate your attention to the problem.

A given block is not aware of what the others are doing because the shared memory is local to the block.
When inter-block communication is needed, it is done with global memory.
Check the CUDA reduction code as it employs the intermediate results in global memory for the next kernel call.
Many problems require more than 1 call to the kernel or different kernels in succession.

in the future, when posting code, please use the code highlight feature.

select your code and press the </> button at the top of the edit window. I have fixed your posting here.

Thanks for the answers. It helped me a lot. I wish you luck in your life. Thanks for all the answers.