threadIdx values differ between Debug and Release builds when using bitonic sort on a GTX 8800.

All,

I am still on my learning curve with CUDA, but I am facing a weird issue whereby threadIdx.x behaves differently under Debug and Release.

The results from the bitonic sort project were different under these two configurations, so after a little debugging I modified the bitonic sort project to also return the sort index.

In the code below, the first value of the ‘values’ array is junk in Release mode, and the rest of the array seems to have been offset by 1.

When observing the values in the ‘index’ array, you will see that index[0] is junk too.

I initially thought that the sort algorithm might be going out of bounds, but even with the sorting code commented out, index[0] still contains junk.

host code

/**
 * Host driver: fills NUM floats with rand(), sorts them on the device with
 * bitonicSort, copies back both the sorted values and the index array, and
 * prints "Test PASSED" or "Test FAILED".
 *
 * The kernel is launched as a single block of NUM threads with
 * NUM * sizeof(float) bytes of dynamic shared memory.
 *
 * The test passes only when:
 *   - index[] is a permutation of 0..NUM-1,
 *   - values[i] equals the original input element at position index[i], and
 *   - values[] is in non-decreasing order.
 * Checking index[] programmatically means a bad index array is reported by
 * the program itself instead of having to be spotted by manual inspection.
 */
int main(int argc, char** argv)
{
    CUT_DEVICE_INIT();

    float values[NUM];    // input on the way in, sorted output on the way back
    float original[NUM];  // untouched copy of the input, used to validate index[]
    int   index[NUM];

    for (int i = 0; i < NUM; i++)
    {
        values[i]   = rand();
        original[i] = values[i];
    }

    float * dvalues;
    int   * dindex;

    CUDA_SAFE_CALL(cudaMalloc((void**)&dvalues, sizeof(float) * NUM));
    CUDA_SAFE_CALL(cudaMemcpy(dvalues, values, sizeof(float) * NUM, cudaMemcpyHostToDevice));

    CUDA_SAFE_CALL(cudaMalloc((void**)&dindex, sizeof(int) * NUM));
    CUDA_SAFE_CALL(cudaMemset(dindex, 0, sizeof(int) * NUM));

    // One block of NUM threads; third launch argument sizes the kernel's
    // extern __shared__ float array.
    bitonicSort<<<1, NUM, sizeof(float) * NUM>>>(dvalues, dindex);

    // check for any errors
    CUT_CHECK_ERROR("Kernel execution failed");

    CUDA_SAFE_CALL(cudaMemcpy(values, dvalues, sizeof(float) * NUM, cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaMemcpy(index, dindex, sizeof(int) * NUM, cudaMemcpyDeviceToHost));

    CUDA_SAFE_CALL(cudaFree(dvalues));
    CUDA_SAFE_CALL(cudaFree(dindex));

    bool passed = true;

    // Validate the index array: every entry must be in range, appear exactly
    // once, and point at the input slot that values[i] was taken from.
    bool seen[NUM] = { false };
    for (int i = 0; i < NUM && passed; i++)
    {
        const int src = index[i];
        if (src < 0 || src >= NUM || seen[src] || original[src] != values[i])
        {
            passed = false;
        }
        else
        {
            seen[src] = true;
        }
    }

    // Validate ordering; stop at the first inversion.
    for (int i = 1; i < NUM && passed; i++)
    {
        if (values[i-1] > values[i])
        {
            passed = false;
        }
    }

    printf( "Test %s\n", passed ? "PASSED" : "FAILED");

    CUT_EXIT(argc, argv);
}

device code

#ifndef _BITONIC_KERNEL_H_

#define _BITONIC_KERNEL_H_

#define NUM    256

// Device helper: exchanges the two floats referred to by a and b.
__device__ inline void swapFloat(float & a, float & b)
{
    // Hold b's old value, move a into b, then complete the exchange.
    const float held = b;
    b = a;
    a = held;
}

// Device helper: exchanges the two ints referred to by a and b, using one
// temporary.
__device__ inline void swap(int & a, int & b)
{
    // Hold b's old value, move a into b, then complete the exchange.
    const int held = b;
    b = a;
    a = held;
}

/**
 * Bitonic sort of NUM floats, also producing the source position of every
 * sorted element.
 *
 * Only threadIdx.x is used for indexing, so the kernel handles the elements
 * of a single block: thread t owns element t. The launch must supply at
 * least NUM * sizeof(float) bytes of dynamic shared memory for the key
 * array; the index array is a static shared array of NUM ints. NUM is
 * expected to be a power of two (it is defined as 256 in this file).
 *
 * values     - in: keys to sort; out: keys in ascending order.
 * valueIndex - out: valueIndex[i] is the original position of values[i].
 */
__global__ static void bitonicSort(float * values, int* valueIndex)
{
    extern __shared__ float keys[];   // dynamic shared memory, sized at launch
    __shared__ int slots[NUM];        // original position travelling with each key

    const int tid = threadIdx.x;

    // Stage the input in shared memory; each key starts in its own slot.
    keys[tid]  = values[tid];
    slots[tid] = tid;

    __syncthreads();

    // k is the size of the sequences being merged in the current stage.
    for (int k = 2; k <= NUM; k <<= 1)
    {
        // Sort direction for this thread depends only on tid and k.
        const bool ascending = ((tid & k) == 0);

        // j is the compare distance within the current merge.
        for (int j = k >> 1; j > 0; j >>= 1)
        {
            const int partner = tid ^ j;

            // Only the lower-numbered thread of each pair does the
            // compare-exchange, so every pair is handled exactly once.
            if (partner > tid)
            {
                const bool outOfOrder = ascending
                    ? (keys[tid] > keys[partner])
                    : (keys[tid] < keys[partner]);

                if (outOfOrder)
                {
                    swapFloat(keys[tid], keys[partner]);
                    swap(slots[tid], slots[partner]);
                }
            }

            // Reached by every thread: all exchanges of this step must be
            // visible before the next step reads shared memory.
            __syncthreads();
        }
    }

    // Publish the sorted keys and their original positions.
    values[tid]     = keys[tid];
    valueIndex[tid] = slots[tid];
}

#endif // _BITONIC_KERNEL_H_

What is the exact value of NUM? And how many registers and how much shared memory are required for your kernel (copy & paste the first few lines of the .cubin file)?

It is possible that your kernel just fails to launch because the thread block is too big…

I think I found the problem… it looks like the problem is with Visual Studio. For some reason the debugger shows incorrect array values, but when you do a printf the right results are shown.