I want to know how to efficiently sort using shared memory.

11friends1119 · October 11, 2019, 11:54am

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include <cooperative_groups.h>
#include <iostream>
#include <cstring>
using namespace std;
#include <conio.h>
#include <cuda.h>
#include <time.h>



#define N 32768
#define THREADS 512
#define BLOCKS 64


__global__ void mergeBlocks(int *a, int *temp, int sortedsize)
{



	int id = blockIdx.x;


	int index1 = id * 2 * sortedsize;
	int endIndex1 = index1 + sortedsize;
	int index2 = endIndex1;
	int endIndex2 = index2 + sortedsize;
	int targetIndex = id * 2 * sortedsize;
	int done = 0;



	while (!done) {

		if ((index1 == endIndex1) && (index2 < endIndex2)) 
			temp[targetIndex++] = a[index2++];
		

		else if ((index2 == endIndex2) && (index1 < endIndex1)) 
			temp[targetIndex++] = a[index1++];
		

		else if (a[index1] < a[index2]) 
			temp[targetIndex++] = a[index1++];
		

		else 
			temp[targetIndex++] = a[index2++];
		

		if ((index1 == endIndex1) && (index2 == endIndex2))
			done = 1;

	}

}








__global__ void sortBlocks(int *a)
{
	int i = 2;
	__shared__ int temp[THREADS];

	while (i <= THREADS)
	{
		if ((threadIdx.x % i) == 0)
		{
			int index1 = threadIdx.x + (blockIdx.x * blockDim.x);
			int endIndex1 = index1 + i / 2;
			int index2 = endIndex1;
			int endIndex2 = index2 + i / 2;
			int targetIndex = threadIdx.x;
			int done = 0;

			while (!done)
			{
				if ((index1 == endIndex1) && (index2 < endIndex2))
					temp[targetIndex++] = a[index2++];

				else if ((index2 == endIndex2) && (index1 < endIndex1))
					temp[targetIndex++] = a[index1++];

				else if (a[index1] < a[index2])
					temp[targetIndex++] = a[index1++];

				else
					temp[targetIndex++] = a[index2++];

				if ((index1 == endIndex1) && (index2 == endIndex2))
					done = 1;

			}

		}
		__syncthreads();

		a[threadIdx.x + (blockIdx.x * blockDim.x)] = temp[threadIdx.x];
		__syncthreads();

		i *= 2;

	}
}





int main()
{
	int a[N];
	int *dev_a;
	int *dev_temp;



	int blocks = BLOCKS / 2;
	int sortedsize = THREADS;


	cudaMalloc((void**)& dev_a, N * sizeof(int));
	cudaMalloc((void**)& dev_temp, N * sizeof(int));


	srand(time(NULL));
	for (int i = 0; i < N; i++)
	{
		int num = rand() % 100;
		a[i] = num;
		printf("%d\n", a[i]);
	}
	//printf("\n");

	cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);


	// ------------------시간측정시작-----------------------------------------------

	//시간측정
	cudaEvent_t start, stop;

	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	//시간측정시작1
	cudaEventRecord(start, 0);



	sortBlocks << < BLOCKS, THREADS >> > (dev_a);


	//시간측정종료1
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	float elapsedTime1;
	cudaEventElapsedTime(&elapsedTime1, start, stop);





	cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);



	//시간측정시작2
	cudaEventRecord(start, 0);


	while (blocks > 0)
	{
		mergeBlocks << < blocks, 1 >> > (dev_a, dev_temp, sortedsize);
		cudaMemcpy(dev_a, dev_temp, N * sizeof(int), cudaMemcpyDeviceToDevice);
		blocks /= 2;
		sortedsize *= 2;
	}


	//시간측정종료2
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	float elapsedTime2;
	cudaEventElapsedTime(&elapsedTime2, start, stop);

	cudaEventDestroy(start);
	cudaEventDestroy(stop);


	//----------------------시간측정종료----------------------------------------------


	cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);


	cudaFree(dev_a);
	cudaFree(dev_temp);


	for (int i = 0; i < N; i++) {
		printf("MergeSort result = %d \n", a[i]);
	}

	printf("Time to generate: %3.1f ms\n", elapsedTime1 + elapsedTime2);


	getchar();
	return 0;
}

Hello! I’m a student studying CUDA.

There is no difference in time between using and not using shared memory in the global sortBlocks function i used.

What are the advantages of shared memory and how do I use it to efficiently use shared memory in my coding?

I asked the same question before, but I’ve been asked to use shared memory multiple times,

but I don’t know exactly what that means. Can you give me a concrete example?

Thanks for reading. Have a nice day!

Topic		Replies	Views
I want to ask a shared memory efficiency question in Sort. CUDA Programming and Performance	3	527	October 11, 2019
In Merge sort , the sum of the blocks does not work CUDA Programming and Performance	3	544	February 22, 2019
problem in timing of GPU work CUDA Programming and Performance	5	814	September 11, 2015
How to perform CUDA programming to calculate and process the correct planning CUDA Programming and Performance	3	522	April 28, 2018
How to modify the Cuda output CUDA Programming and Performance	0	447	May 21, 2018
execution time on CPU and GPU CUDA Programming and Performance	6	1134	February 26, 2015
Problems when using shared memory CUDA Programming and Performance cuda	1	320	May 15, 2024
cuda integer operations and simt for sorting CUDA Programming and Performance	7	8876	July 25, 2009
Why cannot run this program CUDA Programming and Performance	7	955	May 11, 2018
copy from 1D array to shared memory matrix in cuda CUDA Programming and Performance	7	2061	June 9, 2015

I want to know how to efficiently sort using shared memory.

Related topics