#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <stdlib.h>
#include <stdio.h>
#include <cooperative_groups.h>
global void reduce0 (int *g_idata, int *g_odata, int n)
{
shared int sdata[5000];
// load shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for (unsigned int s = 1; s < blockDim.x; s *= 2)
{
// modulo arithmetic is slow!
if ((tid % (2 * s)) == 0)
{
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0)
{
g_odata[blockIdx.x] = sdata[0];
}
}
}
int main()
{
const int arraySize = 100 * sizeof(int);
int a[arraySize];
int* g_idata_d;
int* g_odata_d;
int g_odata[1] = { 0 };
int i = 0;
for (i = 0; i < arraySize; i++)
{
a[i] = i + i + i * i;
}
cudaMalloc((void**)&g_idata_d, sizeof(int)*arraySize);
cudaMalloc((void**)&g_odata_d, sizeof(int)*1);
cudaMemcpy(g_idata_d, a, sizeof(int)*arraySize, cudaMemcpyHostToDevice);
reduce0 << <1, arraySize, arraySize * sizeof(int) >> > (g_idata_d, g_odata_d, arraySize);
cudaMemcpy(g_odata, g_odata_d, sizeof(int), cudaMemcpyDeviceToHost);
printf(" sum = %d\n", g_odata[0]);
cudaFree(g_idata_d);
cudaFree(g_odata_d);
getchar();
return 0;
}
If const int arraySize is 1025 or more, it seems to be an overflow. Please tell me how to fix it.