How can I synchronize writing to a “out” variable in the funn() function?

__global__ void funn(float* in_vec, float* out)



int bx = blockIdx.x;

int tx = threadIdx.x;



unsigned int i = tx+bx*blockDim.x;

*out += in_vec[i];





float *tmp;

float *tmp1;

float tmp2 = 0;

float tmp_h[19100];

for(int i=0; i<19100; i++)


tmp_h[i] = 0.1;


CUDA_SAFE_CALL(cudaMalloc((void**)&tmp, sizeof(float)*19100));

CUDA_SAFE_CALL(cudaMemcpy(tmp, tmp_h, sizeof(float)*19100,

                              cudaMemcpyHostToDevice) );

CUDA_SAFE_CALL(cudaMalloc((void**)&tmp1, sizeof(float)));

CUDA_SAFE_CALL(cudaMemset(tmp1, 0, sizeof(float)) );


CUDA_SAFE_CALL(cudaMemcpy(&tmp2, tmp1, sizeof(float),

                              cudaMemcpyDeviceToHost) );

printf("%f \n", tmp2);

Thanks in advance!

You can’t.

Since your kernel is attempting to perform a sum reduction, have a look at the prefixSum example in the SDK. You don’t need the full prefix sum, though, just the first “upsweep phase”, which is described very well in the prefix sum whitepaper.