The emulator result is not the same as CUDA ?

Hi, all. I am a new hand in using CUDA programming.

I have a problem when I write my program.

I want to get the match data from the Input Data and put these data into list.

I use the “emulator” to debug my program and the result that I want is correct.

I use the CUDA to run the same program but the result is not correct.

Anyone can tell me why the results are not the same ?

If I want to get the correct result from the CUDA, what should I do ?

Thanks~

This is the kernel function

[codebox]

// =======================================================

// Kernel function:

// @d_InputData : Input Data

// @NumOfData : Number of Input Data

// @d_flag : flag that is used to lock the “critical section”

// @d_list : Put the match data in this list

// @d_list : list index

// @d_NumOfMatchData : Number of match data that is match

// =========================================================

global void

kernel_function( float *d_InputData,

                       int    NumOfData,                             

                       int    *d_flag,                                    

                       float  *d_list,                                    

                       int    *d_listIdx,                                 

                       int    *d_NumOfMatchData )                    

{

Back:

int ThreadIdx = threadIdx.x;

// -----------------------------------------------------------

// lock by using d_flag[0]

// -----------------------------------------------------------

atomicCAS( &d_flag[0], 0, 1 );

if( atomicCAS( &d_flag[0], 1, 0 ) != 1 ){

    goto Back;

}

if( d_InputData[ThreadIdx] >= 0.0 ){

    atomicAdd( &d_NumOfMatchData[0], 1 );

    d_list[ d_listIdx[0] ] = d_InputData[ThreadIdx];

    atomicAdd( &d_listIdx[0], 1 );

}

__syncthreads();

}

[/codebox]

This is the main …

[codebox]

// ======================================================

// Main

// =====================================================

int main( int argc, char **argv )

{

cudaError_t CudaRet;

dim3 dimThread, dimBlock;

int i = 0;

int NumOfData = 512;

float *h_InputData = NULL,

      *d_InputData = NULL,

      *d_list        = NULL,

      *h_list        = NULL;

int *d_flag = NULL,

     *d_listIdx        = NULL,

     *d_NumOfMatchData = NULL,

      h_flag            = 0,

      h_listIdx         = 0,

      h_NumOfMatchData  = 0;

// ----------------------------------------------------------------

// Set Number Of Block && Number Of Thread

// ----------------------------------------------------------------

dimThread.x = 512;

dimBlock.x = 1;

// ----------------------------------------------------------------

// Memory allocate in the Host

// ----------------------------------------------------------------

h_InputData = (float *)malloc(sizeof(float) * NumOfData );

memset( ( void * )h_InputData, ( 0 ), ( sizeof(float) * NumOfData ) );

// ----------------------------------------------------------------

// Set Input Data Value

// ----------------------------------------------------------------

for( i = 0 ; i < NumOfData ; i++ ){

    h_InputData[i] = -0.5 + ( (float)i / NumOfData );

}

// ----------------------------------------------------------------

// Memory allocate in the devie

// ----------------------------------------------------------------

CudaRet = cudaMalloc( (void **)&d_InputData, sizeof(float) * NumOfData );

CudaRet = cudaMalloc( (void **)&d_flag, sizeof(int) );

CudaRet = cudaMalloc( (void **)&d_list, sizeof(float) * NumOfData );

CudaRet = cudaMalloc( (void **)&d_listIdx, sizeof(int) );

CudaRet = cudaMalloc( (void **)&d_NumOfMatchData, sizeof(int) );

// ----------------------------------------------------------------

// Memory copy from host to device

// ----------------------------------------------------------------

CudaRet = cudaMemcpy( (void *)d_InputData, (void *)h_InputData,

                                         sizeof(float) * NumOfData, cudaMemcpyHostToDevice );

CudaRet = cudaMemset( (void *)d_list, ( 0 ), ( sizeof(float) * NumOfData ) );

CudaRet = cudaMemcpy( (void *)d_listIdx, (void *)&h_listIdx,

                                         sizeof(int), cudaMemcpyHostToDevice );

CudaRet = cudaMemcpy( (void *)d_flag, (void *)&h_flag,

                                          sizeof(int), cudaMemcpyHostToDevice );

CudaRet = cudaMemcpy( (void *)d_NumOfMatchData, (void *)&h_NumOfMatchData,

                                         sizeof(int), cudaMemcpyHostToDevice );

// ----------------------------------------------------------------

// call kernel function

// ----------------------------------------------------------------

kernel_function<<<dimBlock,dimThread>>>( d_InputData,

                                                           NumOfData,

                                                           d_flag,

                                                           d_list,

                                                           d_listIdx,

                                                           d_NumOfMatchData );

CudaRet = cudaThreadSynchronize();

// ----------------------------------------------------------------

// Memory copy from devie to host

// ----------------------------------------------------------------

CudaRet = cudaMemcpy( (void *)&h_NumOfMatchData, (void *)d_NumOfMatchData,

                                         sizeof(int), cudaMemcpyDeviceToHost );

printf(“----------------------------------------------\n”);

printf(“NumOfMatchData = %d\n”, h_NumOfMatchData );

printf(“----------------------------------------------\n”);

// ----------------------------------------------------------------

// Memory allocate in the host that is according to “h_NumOfMatchData”

// ----------------------------------------------------------------

h_list = (float *)malloc( sizeof(float) * h_NumOfMatchData );

memset( h_list, ( 0 ), ( sizeof(float) * h_NumOfMatchData ) );

CudaRet = cudaMemcpy( (void *)h_list, (void *)d_list,

                                          sizeof(float) * h_NumOfMatchData, cudaMemcpyDeviceToHost );

for( i = 0 ; i < h_NumOfMatchData ; i++ ){

    printf("[INFO][i=%d] : (%f)\n", i,h_list[i] );

    //sleep( 1 );

}

// ----------------------------------------------------------------

// free the memory in the device

// ----------------------------------------------------------------

cudaFree(d_InputData);

cudaFree(d_flag);

cudaFree(d_list);

cudaFree(d_listIdx);

cudaFree(d_NumOfMatchData);

free(h_InputData);

return (0);

}

[/codebox]