memory exhausted on GPU

global void GPU_Offload(int nt,int total_np,int num_recv,int l_x,int l_z,float *d_amp,int nx,int nz,int *x_r,int *z_r)
{

printf("\n\n\nin device at start   %d ,%d",nx,nz);  nx=14000 and nz=3600
//fflush(0);
int a=num_recv;
	PREV_PR_o prev;
	prev.px_l1=(float*)malloc(sizeof(float)*nz);
            prev.px_l2=(float*)malloc(sizeof(float)*nz);
            prev.px_r1=(float*)malloc(sizeof(float)*nz);
            prev.px_r2=(float*)malloc(sizeof(float)*nz);
	
	prev.pz_t1=(float*)malloc(sizeof(float)*nx);
            prev.pz_t2=(float*)malloc(sizeof(float)*nx);
            prev.pz_b1=(float*)malloc(sizeof(float)*nx);
            prev.pz_b2=(float*)malloc(sizeof(float)*nx);





 printf("%s %d \n",__FILE__,__LINE__);
int i,t_step,j;
float **p1,**p2,**econs,**tmp,*pw;
 pw=(float*)malloc(sizeof(float)*nx*nz);
if(pw==NULL)
printf("error in allocation");                        // gives error here do i ran out of memory

for(i=1;i<10;i++)
{
	pw[i]=0.0f;
	//pw1[threadIdx.x]=0.0f;
}

}

I think it is possible to set the size of the permissible heap space per kernel.

The default values are too low for your particular use case.

I am not sure I understood correctly, but is each thread allocating its own memory here? If so, the total heap demand is multiplied by the number of threads.

Read about malloc in the programming guide:

http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dynamic-global-memory-allocation-and-operations

There is a device heap limit, and you can increase it.