Why does these data have no change after being processed by gpu?

I take some practice in middle of reading the book CUDA Programming A Developer’s Guide To Parallel Computing with GPUs. On the chapter 6-Memory Handling with CUDA of this book, it implements radix sort algorithm with multithreads based on GPU. I implement this algorithm as blow. But after executing the kernel:

gpu_sort<<<num_block_per_grid,num_threads_per_block>>>(gpu_array,NUM_THREAD,NUM_ELEM);

The order of cpu_array that takes data from gpu_array has no change(i.e.The order of output data is the same as the order of input data). These functions gpu_sort(),copy_data_to_shared(),radix_sort(),merge_array() are almost copied from the book, thus the problem may be not there. The problem possibly be within main(void). The OS and GPU are Ubuntu 20.04 and 3080Ti respectively.
Any idea would be helpful.

#include<bits/stdc++.h>
using namespace std;
#define NUM_ELEM 4096
#define NUM_THREAD 32
typedef unsigned int u32;

__device__ void copy_data_to_shared(const u32 *const data,u32 *const sort_tmp,const u32 num_lists,const u32 num_elements,const u32 tid);
__device__ void radix_sort(u32 *const sort_tmp,const u32 num_lists,const u32 num_elements,const u32 tid,u32 *const sort_tmp_1);
__device__ void merge_array(const u32*const src_arry,u32 *const dest_array,const u32 num_lists,const u32 num_elements,const u32 tid);
//num_lists= the total number of threads provoked to execute the algorithm
__global__ void gpu_sort(u32 *const data,const u32 num_lists,const u32 num_elements){
    const u32 tid=blockIdx.x*blockDim.x+threadIdx.x;
    __shared__ u32 sort_tmp[NUM_ELEM];
    __shared__ u32 sort_tmp_1[NUM_ELEM];
    copy_data_to_shared(data,sort_tmp,num_lists,num_elements,tid);
    radix_sort(sort_tmp,num_lists,num_elements,tid,sort_tmp_1);
    merge_array(sort_tmp,data,num_lists,num_elements,tid);
}

__device__ void copy_data_to_shared(
    const u32 *const data,
    u32 *const sort_tmp,
    const u32 num_lists,
    const u32 num_elements,
    const u32 tid){
        for(u32 i=0;i<num_elements;i+=num_lists)
            sort_tmp[i+tid]=data[i+tid];
        __syncthreads();
}

__device__ void radix_sort(
    u32 *const sort_tmp,
    const u32 num_lists,
    const u32 num_elements,
    const u32 tid,
    u32 *const sort_tmp_1){
        for(u32 bit=0;bit<32;bit++){
            const u32 bit_mask=(1<<bit);
            u32 base_cnt_0=0;
            u32 base_cnt_1=0;
            for(u32 i=0;i<num_elements;i+=num_lists){
                const u32 elem=sort_tmp[i+tid];
                if((elem&bit_mask)>0){
                    sort_tmp_1[base_cnt_1+tid]=elem;
                    base_cnt_1+=num_lists;
                }else{
                    sort_tmp[base_cnt_0+tid]=elem;
                    base_cnt_0+=num_lists;
                }
            }
            for(u32 i=0;i<base_cnt_1;i+=num_lists)
                sort_tmp[base_cnt_0+i+tid]=sort_tmp_1[i+tid];
        }
        __syncthreads();
}

__device__ void merge_array(
    const u32*const src_arry,
    u32 *const dest_array,
    const u32 num_lists,
    const u32 num_elements,
    const u32 tid){
        const u32 num_elements_per_list=(num_elements/num_lists);
        __shared__ u32 list_indexes[NUM_THREAD];
        list_indexes[tid]=0;
        __syncthreads();
        for(u32 i=0;i<num_elements;i++){
            __shared__ u32 min_val;
            __shared__ u32 min_tid;
            u32 data;
            if(list_indexes[tid]<num_elements_per_list){
                const u32 src_idx=tid+(list_indexes[tid]*num_lists);
                data=src_arry[src_idx];
            }else
                data=0xFFFFFFFF;
            if(tid==0){
                min_val=0xFFFFFFFF;
                min_tid=0xFFFFFFFF;
            }
            __syncthreads();
            atomicMin(&min_val,data);
            __syncthreads;
            if(min_val==data){
                atomicMin(&min_tid,tid);
            }
            __syncthreads();
            if(tid==min_tid){
                list_indexes[tid]++;
                dest_array[i]=data;
            }
        }
}



int main(void){
    char* file_name="data.txt";
    auto mode=ifstream::in;
    fstream file_stream(file_name,mode);
    vector<u32> vec_data;
    u32 value;
    while(file_stream>>value){
        vec_data.push_back(value);
    }
    u32 cpu_array[NUM_ELEM];
    u32 gpu_array[NUM_ELEM];
    for(u32 i=0;i<NUM_ELEM;i++)
        cpu_array[i]=vec_data[i];
    cudaMalloc((void**)(&gpu_array),NUM_ELEM*sizeof(u32));
    cudaMemcpy(gpu_array,cpu_array,NUM_ELEM*sizeof(u32),cudaMemcpyHostToDevice);
    dim3 num_threads_per_block(256,1);
    dim3 num_blocks_per_grid(16,1);
    gpu_sort<<<num_blocks_per_grid,num_threads_per_block>>>(gpu_array,NUM_THREAD,NUM_ELEM);
    u32 clock=100000;
    while(clock--){}
    cudaMemcpy(cpu_array,gpu_array,NUM_ELEM*sizeof(u32),cudaMemcpyDeviceToHost);
    cudaFree(gpu_array);
    sort(vec_data.begin(),vec_data.end());
    bool flag=true;
    for(u32 i=0;i<NUM_ELEM;i++)
        if(vec_data[i]!=cpu_array[i])
            flag=false;
    cout << "correct data order" <<endl;
    for(u32 i=0;i<NUM_ELEM;i++)
        cout << vec_data[i]<<" ";
    cout << endl;
    cout << "result data order" <<endl;
    for(u32 i=0;i<NUM_ELEM;i++)
        cout << cpu_array[i]<<" ";
    cout << endl;
    if(flag)
        cout << "YES" <<endl;
    else 
        cout << "NO" <<endl;
    return 0;
}

What is this supposed to do?

I get several warnings when I compile the code. Are you seeing those as well? If so, look at them closely and fix the issues the compiler is complaining about.

Oh, it is a wrong statement. I will fix it and check the whole program again. Thank you so much.

@njuffa I have revised the statement as below. But it still doesn’t work(i.e. the array of order doesn’t change).

for(u32 i=0;i<base_cnt_1;i+=num_lists)
   sort_tmp[base_cnt_0+i+tid]=sort_tmp_1[i+tid];