I have a question about using shared memory efficiently in a sort.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include <cooperative_groups.h>
#include <iostream>
#include <cstring>
using namespace std;
#include <conio.h>
#include <cuda.h>
#include <time.h>



#define N 32768
#define THREADS 512
#define BLOCKS 64


/*
 * Merges adjacent pairs of sorted runs from `a` into `temp`.
 *
 * Block b merges the run a[b*2*sortedsize, b*2*sortedsize + sortedsize) with
 * the run that immediately follows it (same length) and writes the merged
 * 2*sortedsize elements to the same index range of `temp`.
 *
 * Launch layout: one block per pair of runs. The merge itself is sequential,
 * so only thread 0 of each block does any work.
 *
 * Preconditions: both runs are already sorted ascending, and `a` and `temp`
 * each hold at least gridDim.x * 2 * sortedsize elements.
 *
 * a          : input array (device memory), read only by this kernel
 * temp       : output array (device memory), same layout as `a`
 * sortedsize : length of each sorted input run; values <= 0 make the kernel
 *              a no-op
 */
__global__ void mergeBlocks(int *a, int *temp, int sortedsize)
{
	// Extra threads in the block would only repeat the same stores, and an
	// empty run length leaves nothing to merge.
	if (threadIdx.x != 0 || sortedsize <= 0)
		return;

	int id = blockIdx.x;

	int left = id * 2 * sortedsize;
	const int leftEnd = left + sortedsize;
	int right = leftEnd;
	const int rightEnd = right + sortedsize;
	int out = left;

	// Both runs still have elements: take the smaller head. On a tie the
	// element from the right-hand run is taken.
	while (left < leftEnd && right < rightEnd)
		temp[out++] = (a[left] < a[right]) ? a[left++] : a[right++];

	// At most one of the two runs still has a tail; copy it through.
	while (left < leftEnd)
		temp[out++] = a[left++];

	while (right < rightEnd)
		temp[out++] = a[right++];
}








/*
 * Sorts each block-sized tile of `a` in place (ascending) with a bottom-up
 * merge sort that runs entirely in shared memory.
 *
 * Each thread loads one element of its block's tile into shared memory, the
 * merge passes ping-pong between two shared buffers, and each thread stores
 * one element of the sorted tile back. Global memory is therefore read once
 * and written once per element; the intermediate passes never touch it.
 *
 * Launch layout: 1D grid, 1D blocks; block b sorts
 * a[b*blockDim.x, (b+1)*blockDim.x).
 * Shared memory: 2 * THREADS ints, statically allocated.
 *
 * Preconditions: blockDim.x <= THREADS (otherwise the kernel does nothing
 * rather than overrun the shared buffers), and `a` holds at least
 * gridDim.x * blockDim.x elements.
 *
 * Note: the number of working threads halves on every pass (the final pass
 * is a single thread merging the whole tile), so the merge itself remains
 * largely sequential.
 */
__global__ void sortBlocks(int *a)
{
	// Two tiles: one holds the current runs, the other receives the merged runs.
	__shared__ int tile[2][THREADS];

	const int count = blockDim.x;

	// blockDim.x is the same for every thread, so the whole block leaves
	// together and no thread is left waiting at a barrier.
	if (count > THREADS)
		return;

	const int tid = threadIdx.x;
	const int gid = blockIdx.x * blockDim.x + tid;

	int cur = 0;

	// Single read of the tile from global memory.
	tile[cur][tid] = a[gid];
	__syncthreads();

	// `half` is the length of the sorted runs entering the pass.
	for (int half = 1; half < count; half *= 2)
	{
		const int width = 2 * half;
		const int *src = tile[cur];
		int *dst = tile[cur ^ 1];

		// width is a power of two, so the mask is equivalent to tid % width.
		if ((tid & (width - 1)) == 0)
		{
			int left = tid;
			// Run ends are clamped so a block size that is not a power of
			// two still merges only elements inside the tile.
			const int leftEnd = min(tid + half, count);
			int right = leftEnd;
			const int rightEnd = min(tid + width, count);
			int out = tid;

			// On a tie the element from the right-hand run is taken.
			while (left < leftEnd && right < rightEnd)
				dst[out++] = (src[left] < src[right]) ? src[left++] : src[right++];

			while (left < leftEnd)
				dst[out++] = src[left++];

			while (right < rightEnd)
				dst[out++] = src[right++];
		}

		// Every merged run must be complete before the next pass reads it.
		__syncthreads();

		cur ^= 1;
	}

	// Single write of the sorted tile back to global memory.
	a[gid] = tile[cur][tid];
}





// Aborts the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess)
	{
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/*
 * Fills an array of N random values in [0, 100), sorts it on the GPU in two
 * phases and prints the input, the sorted output and the combined kernel time.
 *
 * Phase 1: sortBlocks sorts BLOCKS tiles of THREADS elements each.
 * Phase 2: mergeBlocks repeatedly merges adjacent sorted runs, doubling the
 *          run length each pass, until a single run remains.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main()
{
	int a[N];
	int *dev_a = NULL;
	int *dev_temp = NULL;

	// After phase 1 the array is BLOCKS sorted runs of THREADS elements;
	// the first merge pass handles them in pairs.
	int blocks = BLOCKS / 2;
	int sortedsize = THREADS;

	checkCuda(cudaMalloc((void**)&dev_a, N * sizeof(int)), "cudaMalloc(dev_a)");
	checkCuda(cudaMalloc((void**)&dev_temp, N * sizeof(int)), "cudaMalloc(dev_temp)");

	srand((unsigned int)time(NULL));
	for (int i = 0; i < N; i++)
	{
		int num = rand() % 100;
		a[i] = num;
		printf("%d\n", a[i]);
	}

	checkCuda(cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice),
	          "cudaMemcpy(host -> dev_a)");

	// ------------------ timing begins ------------------

	cudaEvent_t start, stop;
	checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
	checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

	// Timed section 1: per-block sort.
	checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

	sortBlocks<<<BLOCKS, THREADS>>>(dev_a);
	// A launch does not return a status; a bad configuration shows up here.
	checkCuda(cudaGetLastError(), "sortBlocks launch");

	checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
	checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop) after sortBlocks");

	float elapsedTime1 = 0.0f;
	checkCuda(cudaEventElapsedTime(&elapsedTime1, start, stop), "cudaEventElapsedTime (sort)");

	// Timed section 2: merge passes.
	checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

	while (blocks > 0)
	{
		mergeBlocks<<<blocks, 1>>>(dev_a, dev_temp, sortedsize);
		checkCuda(cudaGetLastError(), "mergeBlocks launch");

		// The merged runs become the input of the next pass.
		checkCuda(cudaMemcpy(dev_a, dev_temp, N * sizeof(int), cudaMemcpyDeviceToDevice),
		          "cudaMemcpy(dev_temp -> dev_a)");

		blocks /= 2;
		sortedsize *= 2;
	}

	checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
	checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop) after mergeBlocks");

	float elapsedTime2 = 0.0f;
	checkCuda(cudaEventElapsedTime(&elapsedTime2, start, stop), "cudaEventElapsedTime (merge)");

	checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
	checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

	// ------------------ timing ends ------------------

	checkCuda(cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost),
	          "cudaMemcpy(dev_a -> host)");

	checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
	checkCuda(cudaFree(dev_temp), "cudaFree(dev_temp)");

	for (int i = 0; i < N; i++) {
		printf("MergeSort result = %d \n", a[i]);
	}

	printf("Time to generate: %3.1f ms\n", elapsedTime1 + elapsedTime2);

	// Keep the console window open until a key is pressed.
	getchar();
	return 0;
}

Hello! I’m a student studying CUDA.

I see no difference in run time between using and not using shared memory in the `__global__` sortBlocks kernel I wrote.

What are the advantages of shared memory, and how can I use it efficiently in my code?

Thank you for your reply. Have a nice day!

You’re reading each item once, and writing each item once. Shared memory won’t help there.

Shared is for:

  • data reuse scenarios (read once, use it several times)
  • data communication scenarios (thread 0 reads a value, then thread 1 uses that value)

Thank you for your answer.

Unlike with an addition (+) operation, is it difficult to see the benefit of shared memory in a sort?

Thank you for your answer.
I don’t understand what you mean. Could you explain in more detail, perhaps with an example?