#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include <cooperative_groups.h>
#include <iostream>
#include <cstring>
using namespace std;
#include <conio.h>
#include <cuda.h>
#include <time.h>
#define N 32768
#define THREADS 512
#define BLOCKS 64
__global__ void mergeBlocks(int *a, int *temp, int sortedsize)
{
int id = blockIdx.x;
int index1 = id * 2 * sortedsize;
int endIndex1 = index1 + sortedsize;
int index2 = endIndex1;
int endIndex2 = index2 + sortedsize;
int targetIndex = id * 2 * sortedsize;
int done = 0;
while (!done) {
if ((index1 == endIndex1) && (index2 < endIndex2))
temp[targetIndex++] = a[index2++];
else if ((index2 == endIndex2) && (index1 < endIndex1))
temp[targetIndex++] = a[index1++];
else if (a[index1] < a[index2])
temp[targetIndex++] = a[index1++];
else
temp[targetIndex++] = a[index2++];
if ((index1 == endIndex1) && (index2 == endIndex2))
done = 1;
}
}
__global__ void sortBlocks(int *a)
{
int i = 2;
__shared__ int temp[THREADS];
while (i <= THREADS)
{
if ((threadIdx.x % i) == 0)
{
int index1 = threadIdx.x + (blockIdx.x * blockDim.x);
int endIndex1 = index1 + i / 2;
int index2 = endIndex1;
int endIndex2 = index2 + i / 2;
int targetIndex = threadIdx.x;
int done = 0;
while (!done)
{
if ((index1 == endIndex1) && (index2 < endIndex2))
temp[targetIndex++] = a[index2++];
else if ((index2 == endIndex2) && (index1 < endIndex1))
temp[targetIndex++] = a[index1++];
else if (a[index1] < a[index2])
temp[targetIndex++] = a[index1++];
else
temp[targetIndex++] = a[index2++];
if ((index1 == endIndex1) && (index2 == endIndex2))
done = 1;
}
}
__syncthreads();
a[threadIdx.x + (blockIdx.x * blockDim.x)] = temp[threadIdx.x];
__syncthreads();
i *= 2;
}
}
int main()
{
int a[N];
int *dev_a;
int *dev_temp;
int blocks = BLOCKS / 2;
int sortedsize = THREADS;
cudaMalloc((void**)& dev_a, N * sizeof(int));
cudaMalloc((void**)& dev_temp, N * sizeof(int));
srand(time(NULL));
for (int i = 0; i < N; i++)
{
int num = rand() % 100;
a[i] = num;
printf("%d\n", a[i]);
}
//printf("\n");
cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
// ------------------시간측정시작-----------------------------------------------
//시간측정
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//시간측정시작1
cudaEventRecord(start, 0);
sortBlocks << < BLOCKS, THREADS >> > (dev_a);
//시간측정종료1
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime1;
cudaEventElapsedTime(&elapsedTime1, start, stop);
cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);
//시간측정시작2
cudaEventRecord(start, 0);
while (blocks > 0)
{
mergeBlocks << < blocks, 1 >> > (dev_a, dev_temp, sortedsize);
cudaMemcpy(dev_a, dev_temp, N * sizeof(int), cudaMemcpyDeviceToDevice);
blocks /= 2;
sortedsize *= 2;
}
//시간측정종료2
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime2;
cudaEventElapsedTime(&elapsedTime2, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
//----------------------시간측정종료----------------------------------------------
cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_a);
cudaFree(dev_temp);
for (int i = 0; i < N; i++) {
printf("MergeSort result = %d \n", a[i]);
}
printf("Time to generate: %3.1f ms\n", elapsedTime1 + elapsedTime2);
getchar();
return 0;
}
Hello! I’m a student studying CUDA.
There is no difference in time between using and not using shared memory in the global sortBlocks function i used.
What are the advantages of shared memory and how do I use it to efficiently use shared memory in my coding?
I asked the same question before, but I’ve been asked to use shared memory multiple times,
but I don’t know exactly what that means. Can you give me a concrete example?
Thanks for reading. Have a nice day!