TensorRT 2.1 plugin fails on TX2

When I execute the code, there is always a CUDA error: "Cuda fails: 4" followed by "Aborted (core dumped)".
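For context, the "Cuda fails: 4" part of the message comes from an error-checking wrapper in my host code, roughly like the sketch below (the exact macro in my project may differ); the "Aborted (core dumped)" part is the shell reporting the abort. If I read the CUDA 8 headers correctly, error code 4 is cudaErrorLaunchFailure.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// hypothetical sketch of the error-checking wrapper that prints "Cuda fails: <code>"
#define CUDA_CHECK(call) \
	do { \
		cudaError_t status = (call); \
		if (status != cudaSuccess) { \
			printf("Cuda fails: %d", (int)status); \
			abort(); /* the shell then prints "Aborted (core dumped)" */ \
		} \
	} while (0)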
The following is my code.

class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactory
{
public:
// caffe parser plugin implementation
bool isPlugin(const char* name) override
{
return (!strcmp(name, "relu1"));
}

// caffe parser plugin creation (from weights)
virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override
{         
	std::cout<<"enter PluginFactory class and enter createPlugin 111111111 function..."<< "layername:"<<layerName<<std::endl;
	assert(isPlugin(layerName));
	if (!strcmp(layerName, "relu1"))
	{
		assert(mLeakyReluPlugin1 == nullptr);
		assert(nbWeights == 0 && weights == nullptr);
		mLeakyReluPlugin1 = std::unique_ptr<LeakyReluPlugin>(new LeakyReluPlugin());
		std::cout<<"create relu1 Iplugin..."<<std::endl;
		return mLeakyReluPlugin1.get();
	} else {
		assert(0);
		return nullptr;
	}
}

// deserialization plugin implementation
IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override
{           
	std::cout<<"enter PluginFactory class and enter createPlugin 222222 function..."<< "layername:"<<layerName<<std::endl;
	assert(isPlugin(layerName));
	if (!strcmp(layerName, "relu1"))
	{
	//	assert(mLeakyReluPlugin1 == nullptr);
		mLeakyReluPlugin1 = std::unique_ptr<LeakyReluPlugin>(new LeakyReluPlugin(serialData, serialLength));
		return mLeakyReluPlugin1.get();
	} else {
		assert(0);
		return nullptr;
	}
	
}

void destroyPlugin()
{
	mLeakyReluPlugin1.release();
	mLeakyReluPlugin1 = nullptr;
}


std::unique_ptr<LeakyReluPlugin> mLeakyReluPlugin1{ nullptr};

};
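For completeness, this is roughly how the factory is wired up in my main() (an abbreviated sketch; gLogger, modelData and modelSize are defined elsewhere in my program). The parse-time path calls the first createPlugin overload and the deserialize-time path calls the second, which matches the "111111111" and "222222" prints in the log below.

// abbreviated sketch of the wiring in main()
PluginFactory pluginFactory;

// parse time: the caffe parser calls createPlugin(layerName, weights, nbWeights)
nvcaffeparser1::ICaffeParser* parser = nvcaffeparser1::createCaffeParser();
parser->setPluginFactory(&pluginFactory);

// ... build and serialize the engine, then destroy the parser ...

// deserialize time: the runtime calls createPlugin(layerName, serialData, serialLength)
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);
nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(modelData, modelSize, &pluginFactory);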

class LeakyReluPlugin : public IPlugin
{
public:
LeakyReluPlugin() {}
LeakyReluPlugin(const void* buffer, size_t size)
{
	printf("enter LeakyReluPlugin ...\n");
	assert(size == (3 * sizeof(int)));
	const int* d = reinterpret_cast<const int*>(buffer);
	dimsSrc = DimsCHW{d[0], d[1], d[2]};
	printf("input_c = %d \t input_h = %d \t input_w = %d \n", d[0], d[1], d[2]);
}

	int getNbOutputs() const override    
	{        
		return 1;    
	}	    

	Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override    
	{       
		printf("enter getOutputDimensions ...\n");
		assert(nbInputDims == 1);        
		assert(index == 0);        
		assert(inputs[index].nbDims == 3);
		printf("input_c = %d \t input_h = %d \t input_w = %d \n", inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
		return DimsCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);    
	}    

	int initialize() override    
	{     
		printf("enter initialze... \n");
		return 0;    
	}    

	void terminate() override    
	{    
		printf("enter terminate... \n");
	}   

	size_t getWorkspaceSize(int) const override    
	{       
		printf("enter getWorkspaceSize...\n");
		return 0;    
	}
	    
	int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override   
	{ 
		//float *bottom_data = reinterpret_cast< float*>(inputs[0]);
		//float *top_data = reinterpret_cast<float*>(outputs[0]);
		
		int count = dimsSrc.c() * dimsSrc.h() * dimsSrc.w();
		float negative_slope = 0.10;
		//PReLUForward( (float*)inputs[0], (float*)outputs[0], count, negative_slope, stream);
		
		int srcSize[] {dimsSrc.c(), dimsSrc.h(), dimsSrc.w()};
		PReLUForward( (float*)inputs[0], (float*)outputs[0], srcSize, negative_slope, stream);
		return 0;    
	}

	size_t getSerializationSize() override    
	{        
		printf("enter getSerializationSize ...\n");
		return 3*sizeof(int); 
	}    

	void serialize(void* buffer) override    
	{      
		printf("enter serialize...\n");
		int* d = reinterpret_cast<int*>(buffer);
		d[0] = dimsSrc.c(); 
		d[1] = dimsSrc.h(); 
		d[2] = dimsSrc.w();
		printf("input_c = %d \t input_h = %d \t input_w = %d \n", d[0] , d[1] , d[2]);
		
	}   

	void configure(const Dims*inputs, int nbInputs, const Dims* outputs, int nbOutputs, int)    override    
	{     
		printf("enter configure ...\n");
		printf("input_c = %d \t input_h = %d \t input_w = %d \n", inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
		dimsSrc = DimsCHW{inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]};
	}
	
	protected:
		DimsCHW dimsSrc;

};

// cuda program (mathfunctions.cu)
__global__ void PReLU(float* bottom_data, float* top_data, const int* srcSize, float negative_slope)
{
	printf("enter GPU PReLU...\n");
	int nthreads = srcSize[0] * srcSize[1] * srcSize[2];
#if 0
	CUDA_KERNEL_LOOP(index, nthreads) {
		printf("22222222222...\n");
		top_data[index] = bottom_data[index] > 0 ? bottom_data[index] : bottom_data[index] * negative_slope;
		printf("top_data[%d]:%f\n", index, top_data[index]);
	}
#else
	int index = threadIdx.x + blockIdx.x * blockDim.x;
	if (index >= nthreads)
		return;
	if (bottom_data[index] > 0) {
		top_data[index] = bottom_data[index];
	} else {
		top_data[index] = bottom_data[index] * negative_slope;
	}
	printf("top_data[%d]:%f\n", index, top_data[index]);
#endif
}

void PReLUForward(float* bottom_data, float* top_data, const int* srcSize, float negative_slope, cudaStream_t stream)
{
	printf("enter GPU PReLUForward...\n");
	int nthreads = srcSize[0] * srcSize[1] * srcSize[2];
	int block_size = 256;
	int grid_size = (nthreads + block_size - 1) / block_size;
	fprintf(stderr, "CUDA kernel launch with %d blocks of %d threads\n", grid_size, block_size);
	PReLU<<<grid_size, block_size, 0, stream>>>(bottom_data, top_data, srcSize, negative_slope);
	// PReLU<<<TENSORRT_GET_BLOCKS(nthreads), TENSORRT_CUDA_NUM_THREADS, 0, stream>>>(bottom_data, top_data, srcSize, negative_slope);
	return;
}

//mathfunctions.h
#define CUDA_KERNEL_LOOP(i, n) \
	for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
	     i < (n); \
	     i += blockDim.x * gridDim.x)

const int TENSORRT_CUDA_NUM_THREADS = 256;

// CUDA: number of blocks for threads.
inline int TENSORRT_GET_BLOCKS(const int N) {
return (N + TENSORRT_CUDA_NUM_THREADS - 1) / TENSORRT_CUDA_NUM_THREADS;
}

void PReLUForward( float* bottom_data, float * top_data, const int* srcSize, float negative_slope, cudaStream_t stream);
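To make the call pattern concrete, this is roughly how I would drive PReLUForward outside of TensorRT (a hypothetical standalone test, not part of my project; it mirrors the way enqueue() calls the kernel, including passing the host-side srcSize array):

// hypothetical standalone driver that mirrors the call pattern in enqueue()
#include <cstdio>
#include <cuda_runtime.h>
#include "mathfunctions.h"

int main()
{
	const int srcSize[3] = {2, 3, 4};                  // small c, h, w for a quick test
	const int n = 2 * 3 * 4;

	float host_in[24], host_out[24];                   // 24 == 2 * 3 * 4
	for (int i = 0; i < n; ++i)
		host_in[i] = (float)(i - n / 2);               // mix of negative and positive inputs

	float *d_in = nullptr, *d_out = nullptr;
	cudaMalloc(&d_in, n * sizeof(float));
	cudaMalloc(&d_out, n * sizeof(float));
	cudaMemcpy(d_in, host_in, n * sizeof(float), cudaMemcpyHostToDevice);

	// same argument pattern as enqueue(): device data pointers, host srcSize pointer
	PReLUForward(d_in, d_out, srcSize, 0.1f, 0);       // 0 = default stream

	cudaError_t err = cudaDeviceSynchronize();
	printf("kernel status: %s\n", cudaGetErrorString(err));

	cudaMemcpy(host_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
	cudaFree(d_in);
	cudaFree(d_out);
	return 0;
}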

This is the debugging information:

start to parse the model...
enter PluginFactory class and enter createPlugin 111111111 function...layername:relu1
create relu1 Iplugin...
enter getOutputDimensions ...
input_c = 64 input_h = 224 input_w = 224
start to create the engine by the builder and network...
enter configure ...
input_c = 64 input_h = 224 input_w = 224
enter getWorkspaceSize...
enter getWorkspaceSize...
enter initialize...
enter getSerializationSize ...
enter serialize...
input_c = 64 input_h = 224 input_w = 224
enter PluginFactory class and enter createPlugin 222222 function...layername:relu1
enter LeakyReluPlugin ...
input_c = 64 input_h = 224 input_w = 224
enter initialize...
Bindings after deserializing...
Binding 0 (data): Input.
Binding 1 (result): Output.
Binding 2 (pool1): Output.
start to execute the enqueue in async mode...
enter enqueue ...
input_c = 64 input_h = 224 input_w = 224
enter GPU PReLUForward...
CUDA kernel launch with 12544 blocks of 256 threads
enter GPU PReLU...
enter GPU PReLU...
enter GPU PReLU...
[the same line repeats many more times]
Cuda fails: 4
Aborted (core dumped)

When my code reaches the PReLU kernel, it prints "enter GPU PReLU..." many times and then aborts with "Cuda fails: 4" and "Aborted (core dumped)".
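To turn the numeric code into a readable message, something like this could go right after the kernel launch (a short sketch using standard CUDA runtime calls):

	cudaError_t err = cudaStreamSynchronize(stream);   // wait for the kernel and collect its status
	if (err != cudaSuccess)
		fprintf(stderr, "PReLU kernel failed: %s\n", cudaGetErrorString(err));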

I do not know where the error in my CUDA code is. Can you check my code and give me some advice?

Thanks!!!