CUDA Global Memory

this is my code

 #include "stdafx.h"
  #include <iostream>
   using namespace std;

  #define n 10
  __device__ int glMem[n];

  __global__ void initVals()
for(int i=0;i<n;i++)
	glMem[i] = 0;

 __global__ void test(int *out)
for(int i=0;i<n;i++)
	out[i] = 10;

// Host driver: allocate a device buffer of n ints, fill it with the test
// kernel, copy it back, print it, and release both buffers.
// Fixes vs. the pasted version: braces restored; the final loop's body had
// collapsed onto "return 0;" (which would return on the first iteration);
// devMem / hoMem were never freed.
int main()
{
    const size_t sz = size_t(n) * sizeof(int);

    int *devMem = 0;
    cudaMalloc((void **)&devMem, sz);

    test<<<1, 1>>>(devMem);

    int *hoMem = new int[n];
    // cudaMemcpy is blocking, so it also synchronizes with the kernel above.
    cudaMemcpy(hoMem, devMem, sz, cudaMemcpyDeviceToHost);

    // NOTE(review): the loop body was lost in the paste — presumably the
    // poster intended to print the copied values; done so here.
    for (int i = 0; i < n; i++)
        cout << hoMem[i] << endl;

    cudaFree(devMem);
    delete[] hoMem;
    return 0;
}

In this code I define the device array

__device__ int glMem[n];

to size n. If I don't know the size in advance, how can I define it?
For example, I need to define it like this:

__device__ int *glMem;

It doesn't work. Please give me some code samples.

Maybe you need to add some threadIdx.x to the kernel to make it work.

Try something like:

// for(int i=0;i<n;i++)
out[threadIdx.x] = 10;

Also launch with 10 threads:

test<<<1, 10>>>(devMem);

Oh wait, I think I see the problem… you need to pass the address of the array.

Normally this would be:

// test<<<1, 10>>>(&devMem); // probably wrong example of me *

But devMem is already an address — cudaMalloc filled it with a device pointer — so passing it directly (without &) is correct.

Anyway it seems you want to define memory dynamically.

So it would be something like:

// Kernel: zero one element per thread.
// Launch with at least MemoryElements total threads; the bounds guard makes
// any launch configuration safe when the grid overshoots the element count.
// (Original paste had "global" instead of "__global__" and no braces.)
__global__ void KernelInitializeMemory( int *MemoryPointer, int MemoryElements )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < MemoryElements)
        MemoryPointer[i] = 0;
}


// Kernel: write 10 into one element per thread.
// Launch with at least MemoryElements total threads; the bounds guard makes
// any launch configuration safe when the grid overshoots the element count.
// (Original paste had "global" instead of "__global__" and no braces.)
__global__ void KernelUseMemory( int *MemoryPointer, int MemoryElements )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < MemoryElements)
        MemoryPointer[i] = 10;
}

// Host-side driver: allocate device memory whose size is chosen at runtime,
// run the kernels over it, copy the results back, and release it.
// The original sketch used a non-existent "cudaAlloc" — the real API is
// cudaMalloc, which takes the ADDRESS of the pointer to fill in.

int MemoryElements = 10;          // runtime-chosen element count
int *CudaMemory = 0;              // device pointer, filled by cudaMalloc

cudaMalloc( (void **)&CudaMemory, MemoryElements * sizeof(int) );

// call kernels — one thread per element
KernelInitializeMemory<<<1, MemoryElements>>>( CudaMemory, MemoryElements );

KernelUseMemory<<<1, MemoryElements>>>( CudaMemory, MemoryElements );

// copy back with cudaMemcpy(host, CudaMemory, bytes, cudaMemcpyDeviceToHost)

// clean up
cudaFree( CudaMemory );

If you have compute capability 2.0 or higher, then you can also allocate memory inside the kernel itself with malloc, like so:

// Device-side dynamic allocation example (requires compute capability 2.0+,
// compiled with -arch=sm_20 or newer). Memory obtained from in-kernel malloc
// persists across kernel launches until freed by in-kernel free.
// (Original paste lost the "__" on the qualifiers and the kernel bodies.)
__device__ int *MyMemory;

__global__ void KernelAllocate()
{
    // Guard so only ONE thread allocates — otherwise every thread would
    // overwrite MyMemory and leak all but the last allocation.
    if (threadIdx.x == 0 && blockIdx.x == 0)
        MyMemory = (int *)malloc( sizeof(int) * N );   // cast required in C++
}

// ...use MyMemory in other kernels, then release it from device code:

__global__ void KernelRelease()
{
    if (threadIdx.x == 0 && blockIdx.x == 0)
        free( MyMemory );
}