I want to allocate a lot of memory in CUDA.

#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <stdlib.h>
#include <stdio.h>
#include <cooperative_groups.h>

global void reduce0 (int *g_idata, int *g_odata, int n)
{
shared int sdata[5000];

// load shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;


if (i < n)
{
	sdata[tid] = g_idata[i];
	__syncthreads();


	// do reduction in shared mem
	for (unsigned int s = 1; s < blockDim.x; s *= 2)
	{
		// modulo arithmetic is slow!
		if ((tid % (2 * s)) == 0)
		{
			sdata[tid] += sdata[tid + s];
		}

		__syncthreads();
	}

	// write result for this block to global mem
	if (tid == 0)
	{
		g_odata[blockIdx.x] = sdata[0];
	}

}

}

int main()
{

const int arraySize = 100 * sizeof(int);
int a[arraySize];
int* g_idata_d;
int* g_odata_d;
int g_odata[1] = { 0 };




int i = 0;

for (i = 0; i < arraySize; i++)
{
	a[i] = i + i + i * i;
	
}


cudaMalloc((void**)&g_idata_d, sizeof(int)*arraySize);
cudaMalloc((void**)&g_odata_d, sizeof(int)*1);

cudaMemcpy(g_idata_d, a, sizeof(int)*arraySize, cudaMemcpyHostToDevice);


reduce0 << <1, arraySize, arraySize * sizeof(int) >> > (g_idata_d, g_odata_d, arraySize);


cudaMemcpy(g_odata, g_odata_d, sizeof(int), cudaMemcpyDeviceToHost);



printf(" sum = %d\n", g_odata[0]);



cudaFree(g_idata_d);
cudaFree(g_odata_d);




getchar();
return 0;

}

If const int arraySize is 1025 or more, it seems to be an overflow. Please tell me how to fix it.

it seems to be an overflow. Please tell me how to fix it.

Nope, there seems to be no error checking.

If you had proper CUDA error checking (google this term), then you’d have found out that
the kernel doesn’t launch because you’re specifying a bigger thread block size than allowed.

Two solutions come to mind

  1. split your work across multiple thread blocks and then aggregate the partial results e.g. with atomic adds to global memory.
  2. make your single block operate on more than one data value per thread