Wrong summation in a kernel function

Hi,

The summation result of the following should be,
to the best of my understanding, 30. However it
is only 20, which means, the function was called
only one. Please pay attention to the two functions
that were tried in the following example:

Kind regards,
Eytan Suchard,
ANB - Applied Neural Biometrics.

// Compiled in visual studio 2010, CUDA 7.5.
#include <stdio.h>
#include <conio.h>

#include “cuda_runtime.h”
#include “device_launch_parameters.h”

global
void DEVICE_KERNEL_check_for_wrong_calling_order(int *cndp_check)
{
if (threadIdx.x != 0)
{ cndp_check[0] += cndp_check[threadIdx.x];
}
}

global
void DEVICE_KERNEL_check_for_wrong_calling_order2(int *cndp_check)
{
shared int cfwco_sum[1];

if (0 == threadIdx.x) cfwco_sum[0] = 0;

__syncthreads();

cfwco_sum[0] += cndp_check[threadIdx.x];

__syncthreads();

if (0 == threadIdx.x) cndp_check[0] = cfwco_sum[0];
}

bool DEVICE_check_CUDA_is_ok(void)
{
cudaError_t cCio_cuda_err;
int cCio_check_cuda_host[3] = {0,10,20};
int *cCio_check_cuda_device;

cCio_cuda_err = cudaSetDevice(0);

if (cudaSuccess != cCio_cuda_err) return false;

cudaFree(0);

// ------------------------------------------------------
// Test CUDA for wrong calling order.
// Otherwise, arrays to kenernel are truncted to half !!!
// ------------------------------------------------------

cCio_check_cuda_device = SMART_ARRAY_DEVICE_alloc(3);

cCio_cuda_err = cudaMalloc(&cCio_check_cuda_device,3*sizeof(int));

if (cudaSuccess != cCio_cuda_err) return false;

if (cudaSuccess != cudaMemcpy(cCio_check_cuda_device,
cCio_check_cuda_host,
3 * sizeof(int),
cudaMemcpyHostToDevice))
{ return false;
}

// Was tried also for DEVICE_KERNEL_check_for_wrong_calling_order.
DEVICE_KERNEL_check_for_wrong_calling_order2
<<<dim3(1,1,1),dim3(3,1,1) >>>(cCio_check_cuda_device);

cCio_cuda_err = cudaGetLastError();
if (cudaSuccess != cCio_cuda_err) return false;

cCio_cuda_err = cudaDeviceSynchronize();

if (cudaSuccess != cCio_cuda_err) return false;

if (cudaSuccess != cudaMemcpy(cCio_check_cuda_host,
cCio_check_cuda_device,
3 * sizeof(int),
cudaMemcpyDeviceToHost))
{ return false;
}

if (30 != cCio_check_cuda_host[0])
{
// Result is 20 but it should be 30 = 10 + 20.
// summation occurs only once instead of twice
// in DEVICE_KERNEL_check_for_wrong_calling_order
// and 3 times in
// DEVICE_KERNEL_check_for_wrong_calling_order2.

puts("Fatal error: CUDA device is ill configured.");

while(_kbhit()) _getch();
getchar();

puts("Please press Enter.");

return false;

}

// ------------------------------------------------------
// End of ‘Test CUDA for wrong calling order’.
// ------------------------------------------------------

return true;
}

void main(void)
{
if (DEVICE_check_CUDA_is_ok()) return 0;

return -1;
}

As multiple threads run at the same time,

cndp_check[0] += cndp_check[threadIdx.x];

causes data-race. replace with:

atomicAdd(cndp_check, cndp_check[threadIdx.x]);

Many thanks, it works.

how about using thrust::reduce to make sum? no-need to make kernel-function.