Not working correctly new () and malloc () inside the kernel, why?

global void gpu_test_malloc(int * a_t_size)
{

	int64 t_size=a_t_size[0];//in 40435848;
	void * t_void=NULL;
	label_test:		
	t_void=malloc(t_size);
	if(t_void==NULL)
	{
		t_size--;
		goto label_test;
	};
	a_t_size[1]=t_size;//out 7674032

};

//test
int * host_mem=new int[2];
int * device_mem;
host_mem[0]=40435848;//need
cudaStatus=cudaMalloc(&device_mem,sizeof(int)*2);
if (cudaStatus != cudaSuccess)throw;

cudaStatus=cudaMemcpy(device_mem,host_mem,sizeof(int)*2,cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)throw;

gpu_test_malloc<<<1,1>>>(device_mem);

cudaStatus=cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess)throw;

cudaStatus=cudaMemcpy(host_mem,device_mem,sizeof(int)*2,cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)throw;


//===============
host_mem[0]=40435848
host_mem[1]= 7674032
//===============
GTX Titan(driver 320.14)
Nsight 3.0.0.13123
CUDA 5.0
64bit,compute_30,sm_30
//==============

#include “cuda_runtime.h”
#include “device_launch_parameters.h”

#include <stdio.h>

global void gpu_test_malloc(int * a_t_size)
{

__int64 t_size=a_t_size[0];//40435848;
void * t_void=NULL;
label_test:
t_void=malloc(t_size);
if(t_void==NULL)
{
	t_size--;
	goto label_test;
};
a_t_size[1]=t_size;

};

int main()
{

cudaError_t cudaStatus;
try
{
int * host_mem=new int[2];
int * device_mem;
host_mem[0]=40435848;//это нужный мне объём
cudaStatus=cudaMalloc(&device_mem,sizeof(int)*2);
if (cudaStatus != cudaSuccess)throw;
cudaStatus=cudaMemcpy(device_mem,host_mem,sizeof(int)*2,cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)throw;

	gpu_test_malloc<<<1,1>>>(device_mem);

	cudaStatus=cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess)throw;
	cudaStatus=cudaMemcpy(host_mem,device_mem,sizeof(int)*2,cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess)throw;
	cudaStatus=cudaFree(device_mem);
	return cudaStatus;
}catch(...)
{
	return cudaStatus;
};

}

GTX Titan(драва 335.23)
Nsight 3.2.2.13351
CUDA 5.5
64bit,compute_35,sm_35

===============
host_mem[0]=40435848
host_mem[1]= 7674032

The device side malloc() is a suballocator from a fixed size heap. Given that the kernel loop is continuously allocating memory, I suspect the backing heap is being exhausted. The size of the heap can be changed using the CUDA API :

cudaDeviceSetLimit(cudaLimitMallocHeapSize, size_t size)

. More information about device side malloc() can be obtained in the toolkit documentation : http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dynamic-global-memory-allocation-and-operations