I am trying to rewrite some shared-memory CPU code in CUDA, and I am having trouble finding a thread-specific substitute. In other words, I want each CUDA thread to work with its own local variables, without the threads interfering with each other.
A toy example I wrote is pasted below, and it prints “Bad” because of a race condition. The atomicAdd variant works fine, but I do not want to rely on atomics here. So the question is: how can I make sure that no other thread wants the same index of the var array? (A rough sketch of the CPU pattern I am porting is at the end of this post.)
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

__global__ void thread_local_var_test_kernal1(int** worker_array, int* var, int size)
{
    const int
        blockId = blockIdx.y * gridDim.x + blockIdx.x,
        idx     = blockId * blockDim.x + threadIdx.x;
    if (idx < size)
    {
        int* worker = worker_array[threadIdx.x]; // <------- I want the worker array to work without a race condition
        var[threadIdx.x]++;                      // race condition
        //atomicAdd(&var[threadIdx.x], 1);       // OK
    }
}
static void thread_local_var_test_kernal_test1(int size)
{
    int threadsPerBlock = block_dim; // block_dim: block-size constant defined elsewhere
    int blocksPerGrid   = (size + threadsPerBlock - 1) / threadsPerBlock;

    int* dev_var = 0;
    cudaMalloc(&dev_var, threadsPerBlock * sizeof(int));
    cudaMemset(dev_var, 0, threadsPerBlock * sizeof(int));

    int** dev_worker_array = 0; // note: the pointers inside this array are never initialized in this toy example
    cudaMalloc(&dev_worker_array, threadsPerBlock * sizeof(int*));

    thread_local_var_test_kernal1<<<blocksPerGrid, threadsPerBlock>>>(dev_worker_array, dev_var, size);
    cudaDeviceSynchronize();
    printf("================================ CUDA kernel is completed ================================\n");

    int* host_var = (int*)malloc(threadsPerBlock * sizeof(int));
    cudaMemcpy(host_var, dev_var, threadsPerBlock * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_var);
    cudaFree(dev_worker_array); // also free the worker array

    int sum_var = 0;
    for (int i = 0; i < threadsPerBlock; i++)
    {
        printf("%d, ", host_var[i]);
        sum_var += host_var[i];
    }
    printf("CUDA sum = %d (%d is expected).\n", sum_var, size);
    (sum_var == size) ? printf("Good.\n") : printf("Bad.\n");

    free(host_var);
}
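For context, the shared-memory CPU pattern I am trying to port looks roughly like this. It is only a simplified sketch, not my real code: I have assumed an OpenMP-style parallel loop, and the function name cpu_pattern_sketch and the scratch-buffer size are just placeholders. The point is that each thread indexes var and its worker buffer by its own thread id, so no two threads ever touch the same slot.

#include <cstdio>
#include <vector>
#include <omp.h>

static void cpu_pattern_sketch(int size)
{
    const int num_threads = omp_get_max_threads();
    std::vector<int> var(num_threads, 0);               // one private counter per thread
    std::vector<std::vector<int>> worker_array(
        num_threads, std::vector<int>(1024));           // per-thread scratch buffer (size is arbitrary here)

    #pragma omp parallel for
    for (int i = 0; i < size; i++)
    {
        const int tid = omp_get_thread_num();           // unique per running thread
        std::vector<int>& worker = worker_array[tid];   // thread-local scratch, no sharing
        worker[0] = i;                                  // ... do some work with it ...
        var[tid]++;                                     // safe: only this thread ever writes var[tid]
    }

    int sum = 0;
    for (int i = 0; i < num_threads; i++)
        sum += var[i];
    printf("CPU sum = %d (%d is expected).\n", sum, size);
}

This is the behavior I would like to reproduce on the GPU.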