I made a huge mistake. At first I thought I was getting correct results. Here is my output:
Memory Allocation: SUCCESSMemory Allocation: SUCCESSHere is bubblesorted array:
31 31 40 41 60 66 68 75 161 164 168 172 275 277 277 280 281 281 283 284 285 287 288 288 295 295 299 300 303 303 303 309
Here is random array:
281 285 277 275 68 75 66 60 280 288 288 284 164 172 168 161 299 300 295 295 303 309 303 303 283 287 281 277 31 41 40 31 Memory Copy: SUCCESSMemory Copy: SUCCESSHere is sequence size: 4
Here is PowerOf2: 1
Here is Blocks: 8
Here is Threads: 2
Here is SHARED: 4
Total execution time for kernel: 0.105984
Invoke Kernel: SUCCESS
Here is an attempt to bitonically sort array:
275 277 281 285 68 75 66 60 280 288 288 284 164 172 168 161 299 300 295 295 303 309 303 303 283 287 281 277 31 41 40 31
Here is the sorted array
31 31 40 41 60 66 68 75 161 164 168 172 275 277 277 280 281 281 283 284 285 287 288 288 295 295 299 300 303 303 303 309
The array is not sorted correctly.
Free Memory: SUCCESSFree Memory: SUCCESSmatthew@matthew-desktop:
Look under “Here is an attempt to bitonically sort array” at the first 4 numbers - those are sorted correctly. The rest aren’t and each block handles a bitonic sequence of 4 numbers. In fact, the rest of them are gathered from global memory without even sorting at all. I’d have been glad if they had garbage numbers, but doing nothing?!
This proves I’m only using one block when I indicated to kernel 8 blocks.
Here’s further proof of this behavior based on the code I provided in the link:
Memory Allocation: SUCCESSMemory Allocation: SUCCESSHere is bubblesorted array:
4 11 14 17 41 47 53 55 92 94 94 95 96 96 101 103 141 147 147 157 161 168 168 175 189 193 196 200 282 288 293 295
Here is random array:
14 17 11 4 196 200 193 189 168 175 168 161 53 55 47 41 92 101 96 94 293 295 288 282 94 103 96 95 147 157 147 141 Memory Copy: SUCCESSMemory Copy: SUCCESSHere is sequence size: 4
Here is PowerOf2: 1
Here is Blocks: 8
Here is Threads: 2 -Maximum number of threads per block
Here is SHARED: 4
Total execution time for kernel: 0.036128
Invoke Kernel: SUCCESS
Here is an attempt to bitonically sort array:
1 1 0 0 196 200 193 189 168 175 168 161 53 55 47 41 92 101 96 94 293 295 288 282 94 103 96 95 147 157 147 141
Here is the sorted array
4 11 14 17 41 47 53 55 92 94 94 95 96 96 101 103 141 147 147 157 161 168 168 175 189 193 196 200 282 288 293 295
The array is not sorted correctly.
Again the same behavior with these changes.
/*The following code is an experiment to see if Bitonic Sort can be done without recursion*/
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <cuda.h>
#include <string.h>
/*For GTX460 using occupancy calculator*/
#define N 32
#define MAX_ACTIVE_BLOCK 7
#define MAX_THREAD_PER_BLOCK 512
#define SHARED 64
void initBArray(int *r, int SeqSize); //Fill N array with bitonic sequences
void printArray(int BSortedArray[]);
void errorCheck(cudaError_t ErrorFlag, const char *msg);
void copyArray(int *NArray, int *CopyArray);
bool sortCheck(int *NArray, int *NArrayCopy); //Compare arrays for accuracy
void bubbleSort(int *NArrayCopy);
bool isPowerOf2(); //For future implementation
__global__ void bitonicSort(int *NArray,int *BarrierVector, int SeqSize, int PowerOf2, int Blocks, int Threads) {
register int AllThread = blockDim.x * blockIdx.x + threadIdx.x;
__shared__ int sh[SHARED];
sh[(blockIdx.x * Threads)+threadIdx.x] = NArray[AllThread];
sh[(blockIdx.x * Threads)+threadIdx.x+Threads] = NArray[AllThread + Threads];
sh[threadIdx.x] = 1;
sh[threadIdx.x + Threads] = 0;
NArray[AllThread] = sh[(blockIdx.x * Threads)+threadIdx.x];
NArray[Threads + AllThread] = sh[(blockIdx.x * Threads) + threadIdx.x + Threads];
}
int main() {
/*General declarations, summations, and initialization*/
int NArray[N] = {0}; //random number array
int NArrayCopy[N] = {0}; //copy of array to be checked for correct sorting
int Blocks; //Temp value for bug checking - it's supposed to be N / BATCHSIZE
int Threads;
int PowerOf2;
int SeqSize;
int MaxSharedPerBlock;
int SharedPerBlock = 1;
int Limit = N / MAX_ACTIVE_BLOCK;
float time1;
bool IsCorrect;
MaxSharedPerBlock = ( ( 16 / MAX_ACTIVE_BLOCK ) * 1000) * 2; //max integers in shared per active block
if (Limit < MaxSharedPerBlock) {
while ((SharedPerBlock < Limit) && (SharedPerBlock <= MAX_THREAD_PER_BLOCK)) {
SharedPerBlock = SharedPerBlock * 2;
}
#undef SHARED
#define SHARED SharedPerBlock
Blocks = N / SharedPerBlock;
Threads = SharedPerBlock / 2;
SeqSize = SharedPerBlock;
PowerOf2 = 1;
}
else {
PowerOf2 = 0;
Blocks = N / MAX_THREAD_PER_BLOCK;
Threads = MAX_THREAD_PER_BLOCK / 2;
}
int BarrierVector[Blocks];
dim3 BlockGrid(Blocks,1,1);
dim3 BlockDim(Threads,1,1);
cudaEvent_t start;
cudaEvent_t stop;
cudaError_t Error;
srand(time(0));
//Initialize memory sizes for host and device
size_t ArMemSize;
size_t BlMemSize;
int* HostNArray; //memory for fully sorted array
int* DevNArray; //memory for full array for device
int* DevBarrierVector; //memory for barrier vector for device
//Get and allocate memory size for host
ArMemSize = N * sizeof(int);
BlMemSize = Blocks * sizeof(int);
HostNArray = (int*)malloc(ArMemSize);
//Allocate memory size for device
Error = cudaMalloc((void**)&DevNArray, ArMemSize);
errorCheck(Error, "Memory Allocation: ");
Error = cudaMalloc((void**)&DevBarrierVector, BlMemSize);
errorCheck(Error, "Memory Allocation: ");
initBArray(NArray, SeqSize);
copyArray(NArray, NArrayCopy);
bubbleSort(NArrayCopy);
printf("Here is bubblesorted array: \n");
printArray(NArrayCopy);
printf("\nHere is random array: \n");
printArray(NArray);
//Fill barrier array
for (int counter = 0; counter < Blocks; counter++)
BarrierVector[counter] = 0;
//Copy memory from host to device
Error = cudaMemcpy(DevNArray, (void*)NArray, ArMemSize, cudaMemcpyHostToDevice);
errorCheck(Error, "Memory Copy: ");
Error = cudaMemcpy(DevBarrierVector, BarrierVector, BlMemSize, cudaMemcpyHostToDevice);
errorCheck(Error, "Memory Copy: ");
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
/*Value Check*/
printf("Here is sequence size: %d\n", SeqSize);
printf("Here is PowerOf2: %d\n", PowerOf2);
printf("Here is Blocks: %d\n", Blocks);
printf("Here is Threads: %d\n", Threads);
printf("Here is SHARED: %d\n", SHARED);
bitonicSort<<<BlockGrid,BlockDim>>>(DevNArray, DevBarrierVector, SeqSize, PowerOf2, Blocks, Threads);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time1, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("\n");
printf("Total execution time for kernel: %f\n", time1);
//Copy sorted list from device to host
cudaMemcpy(HostNArray, DevNArray, ArMemSize, cudaMemcpyDeviceToHost);
Error = cudaGetLastError();
errorCheck(Error, "Invoke Kernel: ");
printf("\nHere is an attempt to bitonically sort array: \n");
printArray(HostNArray);
printf("\nHere is the sorted array\n");
printArray(NArrayCopy);
IsCorrect = sortCheck(HostNArray, NArrayCopy);
if (IsCorrect) {
printf("\nThe array is correctly sorted.\n");
}
else {
printf("\nThe array is not sorted correctly.\n");
}
//Free memory used
Error = cudaFree(DevNArray);
errorCheck(Error, "Free Memory: ");
Error = cudaFree(DevBarrierVector);
errorCheck(Error, "Free Memory: ");
free(HostNArray);
}
//Pre-Conditions: receives an empty integer array of maximum size N
//Post-Conditions: fills array with bitonic sequences up to N
void initBArray(int *r, int SeqSize) {
volatile int LocalMax;
volatile int LowerLimit = 0;
volatile int UpperLimit = SeqSize;
volatile int MiddleOfSeq = SeqSize/2 - 1;
volatile int Max = N-1;
volatile int AfterMidSeq = SeqSize/2;
volatile int Previous;
//Get a local maximum
LocalMax = N * 10;
while (AfterMidSeq <= N) {
for (int count = MiddleOfSeq; count >= LowerLimit; count--) {
if (count == MiddleOfSeq) {
r[count] = rand() % LocalMax;
}
else {
Previous = r[count + 1];
r[count] = Previous - (rand() % 11);
}
}
for (int count = AfterMidSeq; count < UpperLimit; count++) {
Previous = r[count-1];
r[count] = Previous - (rand() % 11);
}
AfterMidSeq = AfterMidSeq + SeqSize;
MiddleOfSeq = MiddleOfSeq + SeqSize;
LowerLimit = LowerLimit + SeqSize;
UpperLimit = UpperLimit + SeqSize;
}
}
//Pre-Conditions: receives an integer bitonically sorted array of maximum size N
//Post-Conditions: displays the contents of the bitonically sorted array
void printArray(int BSortedArray[]) {
int count;
for (count = 0; count < N; count++)
printf("%d ", BSortedArray[count]);
}
//Pre-Conditions: receives an error message of type cudaError_t and a message in a string literal form from where the cuda error could have occured in program.
//Post-Conditions: displays an error for the cuda operation if there is one or a success message if there isn't an error.
void errorCheck(cudaError_t ErrorFlag, const char *msg) {
if (ErrorFlag != cudaSuccess) {
fprintf(stderr, "Cuda: error %s %s\n", msg, cudaGetErrorString(ErrorFlag));
exit(-1);
}
else
printf("%s SUCCESS", msg);
}
void bubbleSort(int *NArray) {
volatile int Count;
volatile int Pass = N-1;
volatile int InnerCount;
volatile int HighValueIndex;
volatile int Temp;
volatile int InnerPass;
for (Count = Pass; Count >= 0 ; Count--) {
HighValueIndex = 0;
for (InnerPass = 1; InnerPass <= Count; InnerPass++) {
if (NArray[HighValueIndex] < NArray[InnerPass]) {
HighValueIndex = InnerPass;
}
}
//Swap
Temp = NArray[HighValueIndex];
NArray[HighValueIndex] = NArray[Count];
NArray[Count] = Temp;
}
}
bool sortCheck(int *NArray, int *NArrayCopy) {
bool IsCorrectlySorted = true;
int count;
for (count = 0; count < N; count++) {
if (NArray[count] != NArrayCopy[count]) {
IsCorrectlySorted = false;
}
}
return IsCorrectlySorted;
}
bool isPowerOf2() {
int P2increment = 1;
bool PowerOf2 = false;
while (P2increment <= N) {
if (P2increment == N) {
PowerOf2 = true;
//Exit loop
P2increment = P2increment + (N + 1);
}
P2increment = P2increment * 2;
}
return PowerOf2;
}
void copyArray(int *NArray, int *CopyArray) {
int count;
for (count = 0; count < N; count++) {
CopyArray[count] = NArray[count];
}
}