bitonic sotr

Hi, i have some problem with my code.

my int tabel = 3,5,8,9,7,4,2,1

and after sorting my table value are 9 8 3 5 4 7 2 1. but i expect 1,2,3,4,5,7,8,9.

Where i make mistake.

#include <stdio.h>

#include <stdlib.h>

#include <cuda.h>

__global__ static void bitonicSort(int * values, int NUM)

{

    extern __shared__ int shared[];

 const unsigned int tid = threadIdx.x;

 shared[tid] = values[tid];

 __syncthreads();

for (unsigned int k = 2; k <= 5; k *= 2){

 	for (unsigned int j = k / 2; j>0; j /= 2){

            unsigned int ixj = tid ^ j;

 		if (ixj > tid){

                if ((tid & k) == 0)

			 if (shared[tid] > shared[ixj])

                         {

				int temp=shared[tid];

				shared[tid]=shared[ixj];

				shared[ixj]=temp;

			}

		else

                     if (shared[tid]< shared[ixj]){

 			int temp=shared[tid];

			shared[tid]=shared[ixj];

			shared[ixj]=temp;

			}

		}

           __syncthreads();

	}

}

// Write result.

 values[tid] = shared[tid];

}

int main(void)

{ 

int NUM = 8;

int  values[] = {3,5,8,9,7,4,2,1};

 int * dvalues;

cudaMalloc((void**)&dvalues, sizeof(int) * NUM);

cudaMemcpy(dvalues, values, sizeof(int) * NUM,cudaMemcpyHostToDevice);

for(int i=0;i<8;i++){printf("%d ",values[i]);}printf("\n");

bitonicSort<<<1, NUM, sizeof(int) * NUM>>>(dvalues, NUM);

cudaMemcpy(values, dvalues, sizeof(int) * NUM,  cudaMemcpyDeviceToHost);

cudaFree(dvalues);

for(int i=0;i<8;i++){printf("%d ",values[i]);}

}