global arrays that can be used in different kernel calls

I would like to declare a global array at some location in the device global memory that I can use in two different kernel calls.
Something like this:

__device__ float* global_data;

main()
{
cudaMalloc(global_data);
kernel1<<<...>>>(blah); //access global data
kernel2<<<...>>>(blah); //access the same global data again
}

Is this possible?

Yes. When I have done this, I have used cudaMalloc to create the array in device memory,

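or a fixed-size shared array (note that shared memory only lives as long as the thread block that uses it, so it cannot carry data from one kernel launch to the next; use global memory for that), e.g.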

__shared__ short shPosns[512];

You do need to put data into it, either by copying an array from the host to the device or by writing to it from a kernel.

e.g.

cudaMemcpy( d_Counts, h_Counts, sizeT_Counts, cudaMemcpyHostToDevice );

The two kernels in the following code share some arrays and sit inside a loop that executes hundreds of times, so each kernel is launched hundreds of times.

cudaMemcpy( d_line, h_line, count, cudaMemcpyHostToDevice );  // copy host array h_line to device array d_line

      checkCUDAError("cudaMemcpy 1");

EndFieldKernel<<< dimGrid , dimBlock >>> ( d_line, (int)count, d_Posns, d_Counts, d_unProcd);

      checkCUDAError("kernel 1 execution");

cudaThreadSynchronize();

      checkCUDAError("Synch");

toFloatKernel<<< dimGrid , dimBlock >>> ( d_line, d_Data, count, d_Posns, d_Counts, d_unProcd, inArrayOffset, d_resultArrayElementsDone);

      checkCUDAError("kernel 2 execution");

cudaThreadSynchronize();

      checkCUDAError("Synch");

cudaMemcpy( h_resultArrayElementsDone, d_resultArrayElementsDone, sizeT_resultArrayElementsDone, cudaMemcpyDeviceToHost ); // copy device array d_resultArrayElementsDone to host array h_resultArrayElementsDone

      checkCUDAError("cudaMemcpy offset back");