I write this code:
#include <iostream>
#include <fstream>
template <typename T1, typename T2>
__global__ void vec_rm(T1 *dev_in, T1 *dev_out, T2 *dev_size)
{
extern __shared__ double arr[];
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < *dev_size)
{
int tid = threadIdx.x;
arr[tid] = dev_in[i];
__syncthreads();
for (int s = 1; s < blockDim.x; s*= 2)
{
if (tid % (2*s) == 0)
{
arr[tid] += arr[tid+s];
}
__syncthreads();
}
printf("dev_in[%d] = %lf\n", i, dev_in[i]);
if (tid == 0)
{
dev_out[blockIdx.x] = arr[0];
printf("dev_block[%d] = %lf\n", blockIdx.x, arr[0]);
}
}
}
int main()
{
double a[] = {1,2,3,90,28,45,-8};
int size = 7, TC = 3, BL = size / TC,*dev_size;
double sum = 0,*dev_a, *dev_result;
double result[BL];
cudaMalloc((void**)&dev_result, BL*sizeof(double));
if (size%TC == 0)
{
cudaMalloc( (void**)&dev_a, size*sizeof(double));
cudaMemcpy( dev_a, a, size*sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc((void**)&dev_size, sizeof(int));
cudaMemcpy(dev_size, &size, sizeof(int), cudaMemcpyHostToDevice);
vec_rm<<<BL,TC, TC*sizeof(double)>>>(dev_a, dev_result, dev_size);
}
else
{
int gpu_size = size/TC*TC;
cudaMalloc( (void**)&dev_a, gpu_size*sizeof(double));
cudaMemcpy( dev_a, a, gpu_size*sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc((void**)&dev_size, sizeof(int));
cudaMemcpy(dev_size, &gpu_size, sizeof(int), cudaMemcpyHostToDevice);
vec_rm<<<BL,TC, TC*sizeof(double)>>>(dev_a, dev_result, dev_size);
for (int k = gpu_size; k < size; k++)
{
sum += a[k];
}
std::cout << "CPU sum = " << sum << std::endl;
}
cudaMemcpy(result, dev_result, BL*sizeof(double), cudaMemcpyDeviceToHost);
for (int k = 0; k < BL; ++k)
{
sum += result[k];
}
std::cout << "CPU+GPU sum = " << sum << std::endl;
return 0;
}
And I get different result every launch, I try to find out error, but I don’t see race condition or something else.