Hi people,
I have a problem when I try to create a integer array with size greater than 4096.
I Know that the max size of local and shared memory in devices functions is 16KB, but I have no idea how to solve this.
I’m doing a comparison between CUDA, OpenMP, Pthreads and MPI using bitonic sort algorithm.
For now I can only use a block with four threads in CUDA.
I already implement the source in others plataforms.
Below my kernel source.
Thanks in advance for sugestion how to solve this.
#ifndef _BITONIC_KERNEL_CU_
#define _BITONIC_KERNEL_CU_
// Define the numbers of elements which be sorted
#define N_16384 1
#ifdef N_524288
#define WIDTH 524288 //2^19
#define ELEMENTS 131072
#elif N_262144
#define WIDTH 262144 //2^18
#define ELEMENTS 65536
#elif N_131072
#define WIDTH 131072 //2^17
#define ELEMENTS 32768
#elif N_65536
#define WIDTH 65536 //2^16
#define ELEMENTS 16384
#elif N_32768
#define WIDTH 32768 //2^15
#define ELEMENTS 8192
#elif N_16384
#define WIDTH 16384 //2^14
#define ELEMENTS 4096
#elif N_8192
#define WIDTH 8192 //2^13
#define ELEMENTS 2048
#elif N_4096
#define WIDTH 4096 //2^12
#define ELEMENTS 1024
#elif N_2048
#define WIDTH 2048 //2^11
#define ELEMENTS 512
#elif N_1024
#define WIDTH 1024 //2^10
#define ELEMENTS 256
#endif
//-----------------------------------------------------
// DEVICE FUNCTIONS
//----------------------------------------------------
__device__ inline void swap(int & a, int & b)
{
// Alternative swap doesn't use a temporary register:
a ^= b;
b ^= a;
a ^= b;
}
/**
* Alternative memcpy
*/
__device__ void myMemcpy(int *dst, int *src, int size) {
int i;
for(i=0;i<size; i++) {
swap(dst[i], src[i]);
}
}
__device__ inline void insertionSort(int *a, int p, int r) {
int i;
for (i = p+1; i <= r; i++) {
int j = i, v = a[i];
while (p <= j-1 && v < a[j-1]) {
a[j] = a[j-1];
j--;
}
a[j] = v;
}
}
__device__ inline void quickSort_it (int v[], int p, int r)
{
insertionSort(v, p, r);
}
__global__ static void bitonicSort(int * vetor)
{
int k, l, numprocs, myid;
numprocs = (WIDTH / ELEMENTS);
myid = threadIdx.x;
int localVector[ELEMENTS * 2]; // ERROR OCCURS HERE
// whatever if I declare
// __shared__ int localVector[ELEMENTS * 2];
quickSort_it(vetor, (myid * ELEMENTS), (myid * ELEMENTS) + ELEMENTS -1);
// Deranged - Phase 1
for (k = 1; k < numprocs / 2; k = k * 2)
{//Control of phase 1
if ((myid % (k * 4)) < (k * 2))
{//SORT crescent
if (myid % (k * 2) < k)
{ //Process
myMemcpy(localVector,&vetor[myid * ELEMENTS], ELEMENTS);
myMemcpy(&localVector[ELEMENTS],&vetor[(myid + k) * ELEMENTS], ELEMENTS);
quickSort_it(localVector, 0,(ELEMENTS *2)-1);
myMemcpy(&vetor[myid * ELEMENTS],localVector ,ELEMENTS);
myMemcpy(&vetor[(myid + k) * ELEMENTS], &localVector[ELEMENTS], ELEMENTS);
}
}
else
{ //Sort decrescent
if (myid % (k * 2) < k)
{ //Process
myMemcpy(localVector,&vetor[myid * ELEMENTS], ELEMENTS);
myMemcpy(&localVector[ELEMENTS],&vetor[(myid + k) * ELEMENTS], ELEMENTS);
quickSort_it(localVector, 0,(ELEMENTS *2)-1);
myMemcpy(&vetor[(myid + k) * ELEMENTS],localVector , ELEMENTS);
myMemcpy(&vetor[myid * ELEMENTS], &localVector[ELEMENTS], ELEMENTS);
}
}
__syncthreads();
if (k > 1)
{ //Adjust values
for (l = k / 2; l >= 1; l = l / 2)
{
if (myid % (k * 4) < (k * 2))
{ //crescent
if (myid % (l * 2) < l)
{ //Process
myMemcpy(localVector,&vetor[myid * ELEMENTS], ELEMENTS);
myMemcpy(&localVector[ELEMENTS], &vetor[(myid + k) * ELEMENTS], ELEMENTS);
quickSort_it(localVector, 0, (ELEMENTS * 2) -1);
myMemcpy(&vetor[myid * ELEMENTS],localVector , ELEMENTS);
myMemcpy(&vetor[(myid + k) * ELEMENTS], &localVector[ELEMENTS], ELEMENTS);
}
}
else{ //decrescent
if (myid % (l * 2) < l)
{ //Process
myMemcpy(localVector,&vetor[myid * ELEMENTS],ELEMENTS);
myMemcpy(&localVector[ELEMENTS], &vetor[(myid + k) * ELEMENTS], ELEMENTS);
quickSort_it(localVector, 0, (ELEMENTS * 2) -1);
myMemcpy(&vetor[(myid + k) * ELEMENTS],localVector ,ELEMENTS);
myMemcpy(&vetor[myid * ELEMENTS], &localVector[ELEMENTS], ELEMENTS);
}
}
__syncthreads();
}
}
}
__syncthreads();;
//Phase 2 - Bitonic
for (k = numprocs/2; k >= 1; k = k / 2)
{ // Control of phase
if (myid % (k * 2) < k)
{ //Process
myMemcpy(localVector,&vetor[myid * ELEMENTS],ELEMENTS);
myMemcpy(&localVector[ELEMENTS],&vetor[(myid + k) * ELEMENTS], ELEMENTS);
quickSort_it(localVector, 0, (ELEMENTS * 2) -1);
myMemcpy(&vetor[myid * ELEMENTS],localVector ,ELEMENTS);
myMemcpy(&vetor[(myid + k) * ELEMENTS], &localVector[ELEMENTS], ELEMENTS);
}
__syncthreads();
}
}
#endif // _BITONIC_KERNEL_H_