Hi all,
I am a newbie here. I compiled my program with and without -G option. They give me different result.
The one with “-G” is correct. It seems that the different is:
This option:
* Forces -O0 (mostly unoptimized) compilation
* Spills all variables to local memory (and will probably slow program execution) .
I used -O0 to compile without -G. The result is still incorrect. So the problem might be the second one. Since it works fine for “-G” option, I cannot use CUDA-GDB to debug…
Anyone encounter similar situation before? I know that it’s hard to answer such question without given the original code. But if you could give me a general idea, I would appreciate a lot! I may packed my ugly code and upload later. thank you for your time…
Here is my code:
Function Barrier is used to synchronize treads in different block. The correct result are: all "1"s for count variables. The count should be added by one since new_var is not -1. However, without “-G” compilation, the result is all "0"s
#include <iostream>
#include <cstdlib>
using namespace std;
__device__ int lock = 0;
__device__ int new_var = -1;
#define CL_NUM 22
#define BLOCKS_PER_GRID 2
__device__ void barrier (int* goalVal)
{
if (threadIdx.x == 0) {
atomicAdd(&lock, 1);
while (lock != *goalVal)
;
}
__syncthreads();
*goalVal += BLOCKS_PER_GRID;
}
__global__ void gsatKernel(int* left_array)
{
int cl_no = blockIdx.x * blockDim.x + threadIdx.x;
int goalVal = 2;
int count = 0;
if (cl_no == 0) {
new_var = 1;
}
barrier(&goalVal);
if (new_var != -1) {
count ++;
}
left_array[cl_no] = count;
}
int main(int argc, char** argv)
{
// Data structure for test
int* host_left_array = new int[CL_NUM];
int* dev_left_array;
size_t dev_left_size = CL_NUM * sizeof(int);
cudaMalloc( (void**) &(dev_left_array), dev_left_size);
// Call Kernel
int blocksPerGrid = BLOCKS_PER_GRID;
int threadsPerBlock = ( CL_NUM + blocksPerGrid - 1 ) / blocksPerGrid;
gsatKernel<<<blocksPerGrid, threadsPerBlock>>> (dev_left_array);
// test result
cudaMemcpy(host_left_array, dev_left_array, dev_left_size,
cudaMemcpyDeviceToHost);
cout << "-----------------test information:---------------------" << endl
<< "left:" << endl;
for (int i = 0; i < CL_NUM; i++) {
cout << i <<":" << host_left_array[i] << " ";
}
cout << endl;
// free the data for test
cudaFree(dev_left_array);
delete [] host_left_array;
return 0;
}