why does my bubble sort get wrong?

I want to write bubble sort on cuda.

program randomly produces a int array of size N as input.

I set each block to sort subarray of size 512 ,

but program always gets wrong when when shared memory size is 512,

if I set shared memory size 1024 for each block, sometimes it whould run correctly.

Could anyone tell me how to modify my program to run fine.

Sorry for my poor english,I think it is probably easier to understand waht I mean by seeing my code.

By the way,I use GTX 460.

#include<stdio.h>

#include<stdlib.h>

#include<time.h>

#include<cuda.h>

#include <math.h>

#define N 65536

int* D_A;

int H_A[N];

int I,J,R;

void Gernerate_Matrix(int* MyArray)

{

    srand(1234);

    for (int H =0; H<N; H++)

    {

        MyArray[H]=rand()%65536;

    }

    return;

}

__global__ void GPU_Sort(int *A,int Offset)

{

    __shared__ int MyBufferA[512];   // to set 1024 sometimes would get right.

int MyTemp;

MyBufferA[threadIdx.x]=A[blockIdx.x*512+threadIdx.x+Offset];

    __syncthreads();

MyBufferA[511]=A[blockIdx.x*512+Offset+511];

    __syncthreads();

for (int T=0; T<512; T++)

    {

if (MyBufferA[threadIdx.x+T%2] < MyBufferA[threadIdx.x+1+T%2])

        {

            MyTemp=MyBufferA[threadIdx.x+T%2];

            MyBufferA[threadIdx.x+T%2]=MyBufferA[threadIdx.x+1+T%2];

            MyBufferA[threadIdx.x+1+T%2]=MyTemp;

        }

        __syncthreads();

    }

__syncthreads();

    A[blockIdx.x*512+threadIdx.x+Offset]=MyBufferA[threadIdx.x];

    __syncthreads();

A[blockIdx.x*512+Offset+511]=MyBufferA[511];

    __syncthreads();

    return;

}

int main()

{

    Gernerate_Matrix(H_A);

    cudaMalloc((void**)&D_A,N*sizeof(int));

    cudaMemcpy(D_A, H_A,N*sizeof(int) , cudaMemcpyHostToDevice);

    cudaThreadSynchronize();

    for (I=0; I<(N/512); I++)

    {

        GPU_Sort<<<N/512,511>>>(D_A,0);

        cudaThreadSynchronize();

        GPU_Sort<<<N/512-1,511>>>(D_A,256);

        cudaThreadSynchronize();

    }

    cudaMemcpy(H_A, D_A,N*sizeof(int) , cudaMemcpyDeviceToHost);

    for ( R=1; R<N; R++)

    {

        //printf("%d %d\n",R,H_A[R]);

        if (H_A[R]>H_A[R-1])

        {

            printf("%d Wrong\n",R);

            getchar();

        }

    }

    cudaFree(D_A);

    getchar();

    return 0;

}

I want to write bubble sort on cuda.

program randomly produces a int array of size N as input.

I set each block to sort subarray of size 512 ,

but program always gets wrong when when shared memory size is 512,

if I set shared memory size 1024 for each block, sometimes it whould run correctly.

Could anyone tell me how to modify my program to run fine.

Sorry for my poor english,I think it is probably easier to understand waht I mean by seeing my code.

By the way,I use GTX 460.

#include<stdio.h>

#include<stdlib.h>

#include<time.h>

#include<cuda.h>

#include <math.h>

#define N 65536

int* D_A;

int H_A[N];

int I,J,R;

void Gernerate_Matrix(int* MyArray)

{

    srand(1234);

    for (int H =0; H<N; H++)

    {

        MyArray[H]=rand()%65536;

    }

    return;

}

__global__ void GPU_Sort(int *A,int Offset)

{

    __shared__ int MyBufferA[512];   // to set 1024 sometimes would get right.

int MyTemp;

MyBufferA[threadIdx.x]=A[blockIdx.x*512+threadIdx.x+Offset];

    __syncthreads();

MyBufferA[511]=A[blockIdx.x*512+Offset+511];

    __syncthreads();

for (int T=0; T<512; T++)

    {

if (MyBufferA[threadIdx.x+T%2] < MyBufferA[threadIdx.x+1+T%2])

        {

            MyTemp=MyBufferA[threadIdx.x+T%2];

            MyBufferA[threadIdx.x+T%2]=MyBufferA[threadIdx.x+1+T%2];

            MyBufferA[threadIdx.x+1+T%2]=MyTemp;

        }

        __syncthreads();

    }

__syncthreads();

    A[blockIdx.x*512+threadIdx.x+Offset]=MyBufferA[threadIdx.x];

    __syncthreads();

A[blockIdx.x*512+Offset+511]=MyBufferA[511];

    __syncthreads();

    return;

}

int main()

{

    Gernerate_Matrix(H_A);

    cudaMalloc((void**)&D_A,N*sizeof(int));

    cudaMemcpy(D_A, H_A,N*sizeof(int) , cudaMemcpyHostToDevice);

    cudaThreadSynchronize();

    for (I=0; I<(N/512); I++)

    {

        GPU_Sort<<<N/512,511>>>(D_A,0);

        cudaThreadSynchronize();

        GPU_Sort<<<N/512-1,511>>>(D_A,256);

        cudaThreadSynchronize();

    }

    cudaMemcpy(H_A, D_A,N*sizeof(int) , cudaMemcpyDeviceToHost);

    for ( R=1; R<N; R++)

    {

        //printf("%d %d\n",R,H_A[R]);

        if (H_A[R]>H_A[R-1])

        {

            printf("%d Wrong\n",R);

            getchar();

        }

    }

    cudaFree(D_A);

    getchar();

    return 0;

}

I find myself making a stupid mistake on my thought. :haha:

Progam runs fine now, thanks for your attention.

following is correct version.

#include<stdio.h>

#include<stdlib.h>

#include<time.h>

#include<cuda.h>

#include <math.h>

#define N 65536

int* D_A;

int H_A[N];

int I,J,R;

void Gernerate_Matrix(int* MyArray)

{

    srand(time(NULL));

    for (int H =0; H<N; H++)

    {

        MyArray[H]=rand()%65536;

    }

    return;

}

__global__ void GPU_Sort(int *A,int Offset)

{

    __shared__ int MyBufferA[512];

    int MyTemp;

    MyBufferA[threadIdx.x]=A[blockIdx.x*512+Offset+threadIdx.x];

    __syncthreads();

    MyBufferA[threadIdx.x+256]=A[blockIdx.x*512+Offset+threadIdx.x+256];

    __syncthreads();

    for (int T=0; T<512; T++)

    {

        if (2*threadIdx.x+1+T%2<512)

        {

            if (MyBufferA[2*threadIdx.x+T%2] < MyBufferA[2*threadIdx.x+1+T%2])

            {

                MyTemp=MyBufferA[2*threadIdx.x+T%2];

                MyBufferA[2*threadIdx.x+T%2]=MyBufferA[2*threadIdx.x+1+T%2];

                MyBufferA[2*threadIdx.x+1+T%2]=MyTemp;

            }

        }

        __syncthreads();

    }

    A[blockIdx.x*512+threadIdx.x+Offset]=MyBufferA[threadIdx.x];

    __syncthreads();

    A[blockIdx.x*512+Offset+threadIdx.x+256]=MyBufferA[threadIdx.x+256];

    __syncthreads();

    return;

}

int main()

{

    Gernerate_Matrix(H_A);

    cudaMalloc((void**)&D_A,N*sizeof(int));

    cudaMemcpy(D_A, H_A,N*sizeof(int) , cudaMemcpyHostToDevice);

    cudaThreadSynchronize();

    for (I=0; I<(N/512); I++)

    {

        GPU_Sort<<<N/512,256>>>(D_A,0);

        cudaThreadSynchronize();

        GPU_Sort<<<N/512-1,256>>>(D_A,256);

        cudaThreadSynchronize();

    }

    cudaThreadSynchronize();

    cudaMemcpy(H_A, D_A,N*sizeof(int) , cudaMemcpyDeviceToHost);

    for ( R=1; R<N; R++)

    {

        printf("%d %d\n",R,H_A[R]);

        if (H_A[R]>H_A[R-1])

        {

            printf("%d Wrong\n",R);

            getchar();

        }

    }

    cudaFree(D_A);

    getchar();

    return 0;

}

I find myself making a stupid mistake on my thought. :haha:

Progam runs fine now, thanks for your attention.

following is correct version.

#include<stdio.h>

#include<stdlib.h>

#include<time.h>

#include<cuda.h>

#include <math.h>

#define N 65536

int* D_A;

int H_A[N];

int I,J,R;

void Gernerate_Matrix(int* MyArray)

{

    srand(time(NULL));

    for (int H =0; H<N; H++)

    {

        MyArray[H]=rand()%65536;

    }

    return;

}

__global__ void GPU_Sort(int *A,int Offset)

{

    __shared__ int MyBufferA[512];

    int MyTemp;

    MyBufferA[threadIdx.x]=A[blockIdx.x*512+Offset+threadIdx.x];

    __syncthreads();

    MyBufferA[threadIdx.x+256]=A[blockIdx.x*512+Offset+threadIdx.x+256];

    __syncthreads();

    for (int T=0; T<512; T++)

    {

        if (2*threadIdx.x+1+T%2<512)

        {

            if (MyBufferA[2*threadIdx.x+T%2] < MyBufferA[2*threadIdx.x+1+T%2])

            {

                MyTemp=MyBufferA[2*threadIdx.x+T%2];

                MyBufferA[2*threadIdx.x+T%2]=MyBufferA[2*threadIdx.x+1+T%2];

                MyBufferA[2*threadIdx.x+1+T%2]=MyTemp;

            }

        }

        __syncthreads();

    }

    A[blockIdx.x*512+threadIdx.x+Offset]=MyBufferA[threadIdx.x];

    __syncthreads();

    A[blockIdx.x*512+Offset+threadIdx.x+256]=MyBufferA[threadIdx.x+256];

    __syncthreads();

    return;

}

int main()

{

    Gernerate_Matrix(H_A);

    cudaMalloc((void**)&D_A,N*sizeof(int));

    cudaMemcpy(D_A, H_A,N*sizeof(int) , cudaMemcpyHostToDevice);

    cudaThreadSynchronize();

    for (I=0; I<(N/512); I++)

    {

        GPU_Sort<<<N/512,256>>>(D_A,0);

        cudaThreadSynchronize();

        GPU_Sort<<<N/512-1,256>>>(D_A,256);

        cudaThreadSynchronize();

    }

    cudaThreadSynchronize();

    cudaMemcpy(H_A, D_A,N*sizeof(int) , cudaMemcpyDeviceToHost);

    for ( R=1; R<N; R++)

    {

        printf("%d %d\n",R,H_A[R]);

        if (H_A[R]>H_A[R-1])

        {

            printf("%d Wrong\n",R);

            getchar();

        }

    }

    cudaFree(D_A);

    getchar();

    return 0;

}