Hi, all. I am a new hand in using CUDA programming.
I have a problem when I write my program.
I want to get the match data from the Input Data and put these data into list.
I use the “emulator” to debug my program and the result that I want is correct.
I use the CUDA to run the same program but the result is not correct.
Anyone can tell me why the results are not the same ?
If I want to get the correct result from the CUDA, what should I do ?
Thanks~
This is the kernel function
[codebox]
// =======================================================
// Kernel function:
// @d_InputData : Input Data
// @NumOfData : Number of Input Data
// @d_flag : flag that is used to lock the “critical section”
// @d_list : Put the match data in this list
// @d_list : list index
// @d_NumOfMatchData : Number of match data that is match
// =========================================================
global void
kernel_function( float *d_InputData,
int NumOfData,
int *d_flag,
float *d_list,
int *d_listIdx,
int *d_NumOfMatchData )
{
Back:
int ThreadIdx = threadIdx.x;
// -----------------------------------------------------------
// lock by using d_flag[0]
// -----------------------------------------------------------
atomicCAS( &d_flag[0], 0, 1 );
if( atomicCAS( &d_flag[0], 1, 0 ) != 1 ){
goto Back;
}
if( d_InputData[ThreadIdx] >= 0.0 ){
atomicAdd( &d_NumOfMatchData[0], 1 );
d_list[ d_listIdx[0] ] = d_InputData[ThreadIdx];
atomicAdd( &d_listIdx[0], 1 );
}
__syncthreads();
}
[/codebox]
This is the main …
[codebox]
// ======================================================
// Main
// =====================================================
int main( int argc, char **argv )
{
cudaError_t CudaRet;
dim3 dimThread, dimBlock;
int i = 0;
int NumOfData = 512;
float *h_InputData = NULL,
*d_InputData = NULL,
*d_list = NULL,
*h_list = NULL;
int *d_flag = NULL,
*d_listIdx = NULL,
*d_NumOfMatchData = NULL,
h_flag = 0,
h_listIdx = 0,
h_NumOfMatchData = 0;
// ----------------------------------------------------------------
// Set Number Of Block && Number Of Thread
// ----------------------------------------------------------------
dimThread.x = 512;
dimBlock.x = 1;
// ----------------------------------------------------------------
// Memory allocate in the Host
// ----------------------------------------------------------------
h_InputData = (float *)malloc(sizeof(float) * NumOfData );
memset( ( void * )h_InputData, ( 0 ), ( sizeof(float) * NumOfData ) );
// ----------------------------------------------------------------
// Set Input Data Value
// ----------------------------------------------------------------
for( i = 0 ; i < NumOfData ; i++ ){
h_InputData[i] = -0.5 + ( (float)i / NumOfData );
}
// ----------------------------------------------------------------
// Memory allocate in the devie
// ----------------------------------------------------------------
CudaRet = cudaMalloc( (void **)&d_InputData, sizeof(float) * NumOfData );
CudaRet = cudaMalloc( (void **)&d_flag, sizeof(int) );
CudaRet = cudaMalloc( (void **)&d_list, sizeof(float) * NumOfData );
CudaRet = cudaMalloc( (void **)&d_listIdx, sizeof(int) );
CudaRet = cudaMalloc( (void **)&d_NumOfMatchData, sizeof(int) );
// ----------------------------------------------------------------
// Memory copy from host to device
// ----------------------------------------------------------------
CudaRet = cudaMemcpy( (void *)d_InputData, (void *)h_InputData,
sizeof(float) * NumOfData, cudaMemcpyHostToDevice );
CudaRet = cudaMemset( (void *)d_list, ( 0 ), ( sizeof(float) * NumOfData ) );
CudaRet = cudaMemcpy( (void *)d_listIdx, (void *)&h_listIdx,
sizeof(int), cudaMemcpyHostToDevice );
CudaRet = cudaMemcpy( (void *)d_flag, (void *)&h_flag,
sizeof(int), cudaMemcpyHostToDevice );
CudaRet = cudaMemcpy( (void *)d_NumOfMatchData, (void *)&h_NumOfMatchData,
sizeof(int), cudaMemcpyHostToDevice );
// ----------------------------------------------------------------
// call kernel function
// ----------------------------------------------------------------
kernel_function<<<dimBlock,dimThread>>>( d_InputData,
NumOfData,
d_flag,
d_list,
d_listIdx,
d_NumOfMatchData );
CudaRet = cudaThreadSynchronize();
// ----------------------------------------------------------------
// Memory copy from devie to host
// ----------------------------------------------------------------
CudaRet = cudaMemcpy( (void *)&h_NumOfMatchData, (void *)d_NumOfMatchData,
sizeof(int), cudaMemcpyDeviceToHost );
printf(“----------------------------------------------\n”);
printf(“NumOfMatchData = %d\n”, h_NumOfMatchData );
printf(“----------------------------------------------\n”);
// ----------------------------------------------------------------
// Memory allocate in the host that is according to “h_NumOfMatchData”
// ----------------------------------------------------------------
h_list = (float *)malloc( sizeof(float) * h_NumOfMatchData );
memset( h_list, ( 0 ), ( sizeof(float) * h_NumOfMatchData ) );
CudaRet = cudaMemcpy( (void *)h_list, (void *)d_list,
sizeof(float) * h_NumOfMatchData, cudaMemcpyDeviceToHost );
for( i = 0 ; i < h_NumOfMatchData ; i++ ){
printf("[INFO][i=%d] : (%f)\n", i,h_list[i] );
//sleep( 1 );
}
// ----------------------------------------------------------------
// free the memory in the device
// ----------------------------------------------------------------
cudaFree(d_InputData);
cudaFree(d_flag);
cudaFree(d_list);
cudaFree(d_listIdx);
cudaFree(d_NumOfMatchData);
free(h_InputData);
return (0);
}
[/codebox]