uses too much local data / uses too much shared data

Hi people,

I have a problem when I try to create an integer array with more than 4096 elements.

I know that the maximum size of local and shared memory in device functions is 16 KB, but I have no idea how to work around it. (4096 ints × 4 bytes is exactly 16 KB, and my kernel needs a buffer of ELEMENTS * 2 = 8192 ints, i.e. 32 KB.)
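For reference, the actual per-block shared memory limit can be queried at runtime; a minimal sketch using the standard CUDA runtime API:

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0); // query device 0
	printf("Shared memory per block: %zu bytes\n", prop.sharedMemPerBlock);
	return 0;
}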

I’m comparing CUDA, OpenMP, Pthreads and MPI using the bitonic sort algorithm.

For now I can only use one block with four threads in CUDA.

I have already implemented the code on the other platforms.

My kernel source is below.

Thanks in advance for any suggestions on how to solve this.

#ifndef _BITONIC_KERNEL_CU_
#define _BITONIC_KERNEL_CU_

// Select the number of elements to be sorted
#define N_16384 1

#ifdef N_524288
	#define WIDTH 524288 // 2^19
	#define ELEMENTS 131072
#elif defined(N_262144)
	#define WIDTH 262144 // 2^18
	#define ELEMENTS 65536
#elif defined(N_131072)
	#define WIDTH 131072 // 2^17
	#define ELEMENTS 32768
#elif defined(N_65536)
	#define WIDTH 65536 // 2^16
	#define ELEMENTS 16384
#elif defined(N_32768)
	#define WIDTH 32768 // 2^15
	#define ELEMENTS 8192
#elif defined(N_16384)
	#define WIDTH 16384 // 2^14
	#define ELEMENTS 4096
#elif defined(N_8192)
	#define WIDTH 8192 // 2^13
	#define ELEMENTS 2048
#elif defined(N_4096)
	#define WIDTH 4096 // 2^12
	#define ELEMENTS 1024
#elif defined(N_2048)
	#define WIDTH 2048 // 2^11
	#define ELEMENTS 512
#elif defined(N_1024)
	#define WIDTH 1024 // 2^10
	#define ELEMENTS 256
#endif

//-----------------------------------------------------
// DEVICE FUNCTIONS
//-----------------------------------------------------

__device__ inline void swap(int & a, int & b)
{
	// XOR swap avoids a temporary register, but note that it
	// zeroes both values if a and b alias the same location.
	a ^= b;
	b ^= a;
	a ^= b;
}

/**
 * Alternative memcpy. (The original called swap() here, which
 * exchanges dst and src rather than copying, scribbling stale
 * local data into the source array until it is swapped back.)
 */
__device__ void myMemcpy(int *dst, int *src, int size) {
	int i;
	for (i = 0; i < size; i++) {
		dst[i] = src[i];
	}
}

__device__ inline void insertionSort(int *a, int p, int r) {
	int i;
	for (i = p + 1; i <= r; i++) {
		int j = i, v = a[i];
		while (p <= j - 1 && v < a[j - 1]) {
			a[j] = a[j - 1];
			j--;
		}
		a[j] = v;
	}
}

// Despite the name, this currently just delegates to insertion sort.
__device__ inline void quickSort_it(int v[], int p, int r)
{
	insertionSort(v, p, r);
}

__global__ static void bitonicSort(int * vetor)
{
	int k, l, numprocs, myid;
	numprocs = (WIDTH / ELEMENTS);
	myid = threadIdx.x;

	// ERROR OCCURS HERE: with ELEMENTS = 4096 this buffer is
	// 8192 ints = 32 KB per thread, double the 16 KB limit.
	// Declaring it __shared__ int localVector[ELEMENTS * 2];
	// fails for the same reason.
	int localVector[ELEMENTS * 2];

	quickSort_it(vetor, (myid * ELEMENTS), (myid * ELEMENTS) + ELEMENTS - 1);

	// Phase 1 - build the bitonic sequence
	for (k = 1; k < numprocs / 2; k = k * 2)
	{ // control of phase 1
		if ((myid % (k * 4)) < (k * 2))
		{ // sort ascending
			if (myid % (k * 2) < k)
			{ // process
				myMemcpy(localVector, &vetor[myid * ELEMENTS], ELEMENTS);
				myMemcpy(&localVector[ELEMENTS], &vetor[(myid + k) * ELEMENTS], ELEMENTS);
				quickSort_it(localVector, 0, (ELEMENTS * 2) - 1);
				myMemcpy(&vetor[myid * ELEMENTS], localVector, ELEMENTS);
				myMemcpy(&vetor[(myid + k) * ELEMENTS], &localVector[ELEMENTS], ELEMENTS);
			}
		}
		else
		{ // sort descending
			if (myid % (k * 2) < k)
			{ // process
				myMemcpy(localVector, &vetor[myid * ELEMENTS], ELEMENTS);
				myMemcpy(&localVector[ELEMENTS], &vetor[(myid + k) * ELEMENTS], ELEMENTS);
				quickSort_it(localVector, 0, (ELEMENTS * 2) - 1);
				myMemcpy(&vetor[(myid + k) * ELEMENTS], localVector, ELEMENTS);
				myMemcpy(&vetor[myid * ELEMENTS], &localVector[ELEMENTS], ELEMENTS);
			}
		}
		__syncthreads();

		if (k > 1)
		{ // adjust values
			// (nb: the partner offset below is k; a textbook bitonic
			// stage of distance l would normally use myid + l)
			for (l = k / 2; l >= 1; l = l / 2)
			{
				if (myid % (k * 4) < (k * 2))
				{ // ascending
					if (myid % (l * 2) < l)
					{ // process
						myMemcpy(localVector, &vetor[myid * ELEMENTS], ELEMENTS);
						myMemcpy(&localVector[ELEMENTS], &vetor[(myid + k) * ELEMENTS], ELEMENTS);
						quickSort_it(localVector, 0, (ELEMENTS * 2) - 1);
						myMemcpy(&vetor[myid * ELEMENTS], localVector, ELEMENTS);
						myMemcpy(&vetor[(myid + k) * ELEMENTS], &localVector[ELEMENTS], ELEMENTS);
					}
				}
				else
				{ // descending
					if (myid % (l * 2) < l)
					{ // process
						myMemcpy(localVector, &vetor[myid * ELEMENTS], ELEMENTS);
						myMemcpy(&localVector[ELEMENTS], &vetor[(myid + k) * ELEMENTS], ELEMENTS);
						quickSort_it(localVector, 0, (ELEMENTS * 2) - 1);
						myMemcpy(&vetor[(myid + k) * ELEMENTS], localVector, ELEMENTS);
						myMemcpy(&vetor[myid * ELEMENTS], &localVector[ELEMENTS], ELEMENTS);
					}
				}
				__syncthreads();
			}
		}
	}
	__syncthreads();

	// Phase 2 - bitonic merge
	for (k = numprocs / 2; k >= 1; k = k / 2)
	{ // control of phase
		if (myid % (k * 2) < k)
		{ // process
			myMemcpy(localVector, &vetor[myid * ELEMENTS], ELEMENTS);
			myMemcpy(&localVector[ELEMENTS], &vetor[(myid + k) * ELEMENTS], ELEMENTS);
			quickSort_it(localVector, 0, (ELEMENTS * 2) - 1);
			myMemcpy(&vetor[myid * ELEMENTS], localVector, ELEMENTS);
			myMemcpy(&vetor[(myid + k) * ELEMENTS], &localVector[ELEMENTS], ELEMENTS);
		}
		__syncthreads();
	}
}

#endif // _BITONIC_KERNEL_CU_

Assuming your input is, say, 10M elements, what comes to mind is a divide-and-conquer approach: split the input into many smaller chunks, have each chunk sorted by a different block, and then use a second kernel to merge the results together.

Today’s NVIDIA GPUs have 8 to 30 multiprocessors, and for the fastest results you want all of them fully occupied. One rule of thumb for hiding the latency of copying data from global to shared memory is to have each multiprocessor working on 3 blocks at once. So I would keep the ‘chunk’ size to at most 1280 elements (three chunks of 1280 ints is 3 × 5 KB = 15 KB, which just fits in the 16 KB of shared memory per multiprocessor), allowing 3 blocks to be assigned to each multiprocessor at once. The GPU would be very happy with 10,000 blocks.

If you use a ‘chunk’ size of 1024 elements (instead of the 1280 above), then 256 threads per block sounds like a nice number.

—Kernel 1—
step 1: all 256 threads are used to copy a chunk of the global array to shared memory,
e.g. Shared[threadIdx.x] = Global[threadIdx.x + blockIdx.x * blockDim.x]; // then copy the next 256 elements similarly
step 2: sort the chunk (see the sketch below);
step 3: all 256 threads are used to write the results back to the global array. // reverse of step 1
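Under those assumptions, a minimal sketch of kernel 1 (the names sortChunks, CHUNK and THREADS are my own, and the shared-memory bitonic network is just one way to do step 2, not the only one):

#define CHUNK   1024  // elements per block: 1024 ints = 4 KB of shared memory
#define THREADS  256  // each thread handles CHUNK / THREADS = 4 elements

__global__ void sortChunks(int *g_data)
{
	__shared__ int s[CHUNK];

	// step 1: cooperative, coalesced copy from global to shared
	for (int i = threadIdx.x; i < CHUNK; i += blockDim.x)
		s[i] = g_data[blockIdx.x * CHUNK + i];
	__syncthreads();

	// step 2: sort the chunk in shared memory with a bitonic network
	for (int k = 2; k <= CHUNK; k <<= 1) {        // bitonic subsequence size
		for (int j = k >> 1; j > 0; j >>= 1) {    // compare-exchange distance
			for (int i = threadIdx.x; i < CHUNK; i += blockDim.x) {
				int ixj = i ^ j;                  // partner index
				if (ixj > i) {
					bool ascending = ((i & k) == 0);
					if ((s[i] > s[ixj]) == ascending) {
						int t = s[i]; s[i] = s[ixj]; s[ixj] = t;
					}
				}
			}
			__syncthreads();
		}
	}

	// step 3: coalesced copy of the sorted chunk back to global
	for (int i = threadIdx.x; i < CHUNK; i += blockDim.x)
		g_data[blockIdx.x * CHUNK + i] = s[i];
}

Launched as sortChunks<<<n / CHUNK, THREADS>>>(d_data) for an n that is a multiple of CHUNK; every block sorts its own chunk independently, so 10,000 blocks is no problem.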

The second kernel is trickier: it will need to read part of each of several of the pre-sorted chunks from global into shared memory and start merging them, reading in more of each chunk as needed. You will probably need to watch the number of registers each thread uses here, as well as thread synchronisation, and make sure that transfers between global and shared memory are coalesced (there is a useful section in the CUDA programming guide about coalesced global memory access patterns).
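As a hedged alternative to that streaming merge, here is a sketch of a rank-based pairwise merge, where each thread binary-searches the opposite run to find its element’s final position. For simplicity it searches global memory directly rather than staging tiles in shared memory; the names mergePairs, lowerBound and upperBound are mine:

__device__ int lowerBound(const int *a, int n, int key)
{	// first index with a[idx] >= key
	int lo = 0, hi = n;
	while (lo < hi) {
		int mid = (lo + hi) / 2;
		if (a[mid] < key) lo = mid + 1; else hi = mid;
	}
	return lo;
}

__device__ int upperBound(const int *a, int n, int key)
{	// first index with a[idx] > key
	int lo = 0, hi = n;
	while (lo < hi) {
		int mid = (lo + hi) / 2;
		if (a[mid] <= key) lo = mid + 1; else hi = mid;
	}
	return lo;
}

// Merge adjacent sorted runs of length `run` from src into sorted
// runs of length 2 * run in dst; one thread per element.
__global__ void mergePairs(const int *src, int *dst, int run, int n)
{
	int gid = blockIdx.x * blockDim.x + threadIdx.x;
	if (gid >= n) return;

	int base = (gid / (2 * run)) * (2 * run);  // start of this pair of runs
	int idx  = gid - base;                     // offset within the pair
	const int *A = src + base;                 // first run
	const int *B = src + base + run;           // second run

	if (idx < run) {	// element from A: rank = own index + #B strictly below it
		dst[base + idx + lowerBound(B, run, A[idx])] = A[idx];
	} else {            // element from B: rank = own index + #A at or below it
		int j = idx - run;
		dst[base + j + upperBound(A, run, B[j])] = B[j];
	}
}

The host would then call it log2(number of chunks) times, doubling the run length and ping-ponging two buffers:

for (int run = CHUNK; run < n; run *= 2) {
	mergePairs<<<(n + THREADS - 1) / THREADS, THREADS>>>(d_in, d_out, run, n);
	int *tmp = d_in; d_in = d_out; d_out = tmp; // swap input and output buffers
}

Using lowerBound for A against B but upperBound for B against A keeps the ranks of equal keys disjoint, so every output position is written exactly once.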

Hope that gives you some ideas.